2026-01-06 16:19:29 -08:00
parent 41c11d39fd
commit edc33f4528
10 changed files with 1192 additions and 881 deletions

View File

@@ -8,6 +8,9 @@ import requests
import sys
import json
import subprocess
from API.HTTP import HTTPClient
from ProviderCore.base import SearchResult
try: # Optional dependency for IMDb scraping
from imdbinfo.services import search_title # type: ignore
except ImportError: # pragma: no cover - optional
@@ -15,6 +18,7 @@ except ImportError: # pragma: no cover - optional
from SYS.logger import log, debug
from SYS.metadata import imdb_tag
from SYS.json_table import normalize_record
try: # Optional dependency
import musicbrainzngs # type: ignore
@@ -892,6 +896,524 @@ class YtdlpMetadataProvider(MetadataProvider):
return out
def _coerce_archive_field_list(value: Any) -> List[str]:
"""Coerce an Archive.org metadata field to a list of strings."""
if value is None:
return []
if isinstance(value, (list, tuple, set)):
out: List[str] = []
for v in value:
try:
s = str(v).strip()
except Exception:
continue
if s:
out.append(s)
return out
try:
s = str(value).strip()
except Exception:
return []
return [s] if s else []
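A minimal usage sketch of the coercion (illustrative only; the sample values are hypothetical, not from this commit):

# Hypothetical inputs -- exercises the scalar, list, and set branches.
assert _coerce_archive_field_list(None) == []
assert _coerce_archive_field_list("  Dune ") == ["Dune"]
assert _coerce_archive_field_list(["a", "", 7]) == ["a", "7"]
assert _coerce_archive_field_list({"solo"}) == ["solo"]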
def archive_item_metadata_to_tags(archive_id: str,
item_metadata: Dict[str, Any]) -> List[str]:
"""Coerce Archive.org metadata into a stable set of bibliographic tags."""
archive_id_clean = str(archive_id or "").strip()
meta = item_metadata if isinstance(item_metadata, dict) else {}
tags: List[str] = []
seen: set[str] = set()
def _add(tag: str) -> None:
try:
t = str(tag).strip()
except Exception:
return
if not t:
return
if t.lower() in seen:
return
seen.add(t.lower())
tags.append(t)
if archive_id_clean:
_add(f"internet_archive:{archive_id_clean}")
for title in _coerce_archive_field_list(meta.get("title"))[:1]:
_add(f"title:{title}")
creators: List[str] = []
creators.extend(_coerce_archive_field_list(meta.get("creator")))
creators.extend(_coerce_archive_field_list(meta.get("author")))
for creator in creators[:3]:
_add(f"author:{creator}")
for publisher in _coerce_archive_field_list(meta.get("publisher"))[:3]:
_add(f"publisher:{publisher}")
for date_val in _coerce_archive_field_list(meta.get("date"))[:1]:
_add(f"publish_date:{date_val}")
for year_val in _coerce_archive_field_list(meta.get("year"))[:1]:
_add(f"publish_date:{year_val}")
for lang in _coerce_archive_field_list(meta.get("language"))[:3]:
_add(f"language:{lang}")
for subj in _coerce_archive_field_list(meta.get("subject"))[:15]:
if len(subj) > 200:
subj = subj[:200]
_add(subj)
def _clean_isbn(raw: str) -> str:
return str(raw or "").replace("-", "").strip()
for isbn in _coerce_archive_field_list(meta.get("isbn"))[:10]:
isbn_clean = _clean_isbn(isbn)
if isbn_clean:
_add(f"isbn:{isbn_clean}")
identifiers: List[str] = []
identifiers.extend(_coerce_archive_field_list(meta.get("identifier")))
identifiers.extend(_coerce_archive_field_list(meta.get("external-identifier")))
added_other = 0
for ident in identifiers:
ident_s = str(ident or "").strip()
if not ident_s:
continue
low = ident_s.lower()
if low.startswith("urn:isbn:"):
val = _clean_isbn(ident_s.split(":", 2)[-1])
if val:
_add(f"isbn:{val}")
continue
if low.startswith("isbn:"):
val = _clean_isbn(ident_s.split(":", 1)[-1])
if val:
_add(f"isbn:{val}")
continue
if low.startswith("urn:oclc:"):
val = ident_s.split(":", 2)[-1].strip()
if val:
_add(f"oclc:{val}")
continue
if low.startswith("oclc:"):
val = ident_s.split(":", 1)[-1].strip()
if val:
_add(f"oclc:{val}")
continue
if low.startswith("urn:lccn:"):
val = ident_s.split(":", 2)[-1].strip()
if val:
_add(f"lccn:{val}")
continue
if low.startswith("lccn:"):
val = ident_s.split(":", 1)[-1].strip()
if val:
_add(f"lccn:{val}")
continue
if low.startswith("doi:"):
val = ident_s.split(":", 1)[-1].strip()
if val:
_add(f"doi:{val}")
continue
if archive_id_clean and low == archive_id_clean.lower():
continue
if added_other >= 5:
continue
if len(ident_s) > 200:
ident_s = ident_s[:200]
_add(f"identifier:{ident_s}")
added_other += 1
return tags
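A hedged example of the tag mapping (the metadata dict below is hypothetical):

sample_meta = {
    "title": "Example Book",
    "creator": ["A. Author"],
    "subject": ["fiction"],
    "isbn": ["978-0-00-000000-2"],
    "identifier": ["urn:oclc:12345"],
}
tags = archive_item_metadata_to_tags("exampleitem01", sample_meta)
# Yields, in field order:
# ["internet_archive:exampleitem01", "title:Example Book",
#  "author:A. Author", "fiction", "isbn:9780000000002", "oclc:12345"]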
def fetch_archive_item_metadata(archive_id: str,
*,
timeout: int = 8) -> Dict[str, Any]:
ident = str(archive_id or "").strip()
if not ident:
return {}
resp = requests.get(f"https://archive.org/metadata/{ident}", timeout=int(timeout))
resp.raise_for_status()
data = resp.json() if resp is not None else {}
if not isinstance(data, dict):
return {}
meta = data.get("metadata")
return meta if isinstance(meta, dict) else {}
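Note that fetch_archive_item_metadata propagates HTTP errors (raise_for_status), so callers should guard the lookup themselves. A hedged caller pattern, with a hypothetical identifier:

try:
    meta = fetch_archive_item_metadata("exampleitem01", timeout=8)
    item_tags = archive_item_metadata_to_tags("exampleitem01", meta)
except Exception as exc:
    item_tags = []
    log(f"Archive.org metadata lookup failed: {exc}", file=sys.stderr)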
def scrape_isbn_metadata(isbn: str) -> List[str]:
"""Scrape metadata tags for an ISBN using OpenLibrary's books API."""
new_tags: List[str] = []
isbn_clean = str(isbn or "").replace("isbn:", "").replace("-", "").strip()
if not isbn_clean:
return []
url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json"
try:
with HTTPClient() as client:
response = client.get(url)
response.raise_for_status()
data = json.loads(response.content.decode("utf-8"))
except Exception as exc:
log(f"Failed to fetch ISBN metadata: {exc}", file=sys.stderr)
return []
if not data:
log(f"No ISBN metadata found for: {isbn}")
return []
book_data = next(iter(data.values()), None)
if not isinstance(book_data, dict):
return []
if "title" in book_data:
new_tags.append(f"title:{book_data['title']}")
authors = book_data.get("authors")
if isinstance(authors, list):
for author in authors[:3]:
if isinstance(author, dict) and author.get("name"):
new_tags.append(f"author:{author['name']}")
if book_data.get("publish_date"):
new_tags.append(f"publish_date:{book_data['publish_date']}")
publishers = book_data.get("publishers")
if isinstance(publishers, list) and publishers:
pub = publishers[0]
if isinstance(pub, dict) and pub.get("name"):
new_tags.append(f"publisher:{pub['name']}")
if "description" in book_data:
desc = book_data.get("description")
if isinstance(desc, dict) and "value" in desc:
desc = desc.get("value")
if desc:
desc_str = str(desc).strip()
if desc_str:
new_tags.append(f"description:{desc_str[:200]}")
page_count = book_data.get("number_of_pages")
if isinstance(page_count, int) and page_count > 0:
new_tags.append(f"pages:{page_count}")
identifiers = book_data.get("identifiers")
if isinstance(identifiers, dict):
def _first(value: Any) -> Any:
if isinstance(value, list) and value:
return value[0]
return value
for key, ns in (
("openlibrary", "openlibrary"),
("lccn", "lccn"),
("oclc", "oclc"),
("goodreads", "goodreads"),
("librarything", "librarything"),
("doi", "doi"),
("internet_archive", "internet_archive"),
):
val = _first(identifiers.get(key))
if val:
new_tags.append(f"{ns}:{val}")
debug(f"Found {len(new_tags)} tag(s) from ISBN lookup")
return new_tags
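A hedged call sketch (hypothetical ISBN). The function strips the "isbn:" prefix and hyphens itself; an ISBN unknown to OpenLibrary yields an empty response dict, so the function logs and returns []:

for tag in scrape_isbn_metadata("isbn:978-0-13-468599-1"):
    debug(f"isbn tag: {tag}")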
def scrape_openlibrary_metadata(olid: str) -> List[str]:
"""Scrape metadata tags for an OpenLibrary ID using the edition JSON endpoint."""
new_tags: List[str] = []
olid_text = str(olid or "").strip()
if not olid_text:
return []
olid_norm = olid_text
try:
if not olid_norm.startswith("OL"):
olid_norm = f"OL{olid_norm}"
if not olid_norm.endswith("M"):
olid_norm = f"{olid_norm}M"
except Exception:
olid_norm = olid_text
new_tags.append(f"openlibrary:{olid_norm}")
# Accept OL9674499M, 9674499M, or just digits; fetch via the normalized
# OLID so the "M" edition suffix is never dropped from the URL.
url = f"https://openlibrary.org/books/{olid_norm}.json"
try:
with HTTPClient() as client:
response = client.get(url)
response.raise_for_status()
data = json.loads(response.content.decode("utf-8"))
except Exception as exc:
log(f"Failed to fetch OpenLibrary metadata: {exc}", file=sys.stderr)
return []
if not isinstance(data, dict) or not data:
log(f"No OpenLibrary metadata found for: {olid_text}")
return []
if "title" in data:
new_tags.append(f"title:{data['title']}")
authors = data.get("authors")
if isinstance(authors, list):
for author in authors[:3]:
if isinstance(author, dict) and author.get("name"):
new_tags.append(f"author:{author['name']}")
continue
author_key = None
if isinstance(author, dict):
if isinstance(author.get("author"), dict):
author_key = author.get("author", {}).get("key")
if not author_key:
author_key = author.get("key")
if isinstance(author_key, str) and author_key.startswith("/"):
try:
author_url = f"https://openlibrary.org{author_key}.json"
with HTTPClient(timeout=10) as client:
author_resp = client.get(author_url)
author_resp.raise_for_status()
author_data = json.loads(author_resp.content.decode("utf-8"))
if isinstance(author_data, dict) and author_data.get("name"):
new_tags.append(f"author:{author_data['name']}")
continue
except Exception:
pass
if isinstance(author, str) and author:
new_tags.append(f"author:{author}")
if data.get("publish_date"):
new_tags.append(f"publish_date:{data['publish_date']}")
publishers = data.get("publishers")
if isinstance(publishers, list) and publishers:
pub = publishers[0]
if isinstance(pub, dict) and pub.get("name"):
new_tags.append(f"publisher:{pub['name']}")
elif isinstance(pub, str) and pub:
new_tags.append(f"publisher:{pub}")
if "description" in data:
desc = data.get("description")
if isinstance(desc, dict) and "value" in desc:
desc = desc.get("value")
if desc:
desc_str = str(desc).strip()
if desc_str:
new_tags.append(f"description:{desc_str[:200]}")
page_count = data.get("number_of_pages")
if isinstance(page_count, int) and page_count > 0:
new_tags.append(f"pages:{page_count}")
subjects = data.get("subjects")
if isinstance(subjects, list):
for subject in subjects[:10]:
if isinstance(subject, str):
subject_clean = subject.strip()
if subject_clean and subject_clean not in new_tags:
new_tags.append(subject_clean)
identifiers = data.get("identifiers")
if isinstance(identifiers, dict):
def _first(value: Any) -> Any:
if isinstance(value, list) and value:
return value[0]
return value
for key, ns in (
("isbn_10", "isbn_10"),
("isbn_13", "isbn_13"),
("lccn", "lccn"),
("oclc_numbers", "oclc"),
("goodreads", "goodreads"),
("internet_archive", "internet_archive"),
):
val = _first(identifiers.get(key))
if val:
new_tags.append(f"{ns}:{val}")
ocaid = data.get("ocaid")
if isinstance(ocaid, str) and ocaid.strip():
new_tags.append(f"internet_archive:{ocaid.strip()}")
debug(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup")
return new_tags
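A hedged sketch of the OLID normalization (hypothetical OLID): every spelling resolves to the same edition endpoint.

for raw in ("OL9674499M", "OL9674499", "9674499M", "9674499"):
    # Each variant normalizes to OL9674499M and so fetches
    # https://openlibrary.org/books/OL9674499M.json
    tags = scrape_openlibrary_metadata(raw)
    debug(f"{raw}: {len(tags)} tag(s)")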
SAMPLE_ITEMS: List[Dict[str, Any]] = [
{
"title": "Sample OpenLibrary book",
"path": "https://openlibrary.org/books/OL123M",
"openlibrary_id": "OL123M",
"archive_id": "samplearchive123",
"availability": "borrow",
"availability_reason": "sample",
"direct_url": "https://archive.org/download/sample.pdf",
"author_name": ["OpenLibrary Demo"],
"first_publish_year": 2023,
"ia": ["samplearchive123"],
},
]
try:
from typing import Iterable
from SYS.result_table_api import ColumnSpec, ResultModel, metadata_column, title_column
from SYS.result_table_adapters import register_provider
def _ensure_search_result(item: Any) -> SearchResult:
if isinstance(item, SearchResult):
return item
if isinstance(item, dict):
data = dict(item)
title = str(data.get("title") or data.get("name") or "OpenLibrary")
path = str(data.get("path") or data.get("url") or "")
detail = str(data.get("detail") or "")
annotations = list(data.get("annotations") or [])
media_kind = str(data.get("media_kind") or "book")
return SearchResult(
table="openlibrary",
title=title,
path=path,
detail=detail,
annotations=annotations,
media_kind=media_kind,
columns=data.get("columns") or [],
full_metadata={**data, "raw": dict(item)},
)
return SearchResult(
table="openlibrary",
title=str(item or "OpenLibrary"),
path="",
detail="",
annotations=[],
media_kind="book",
full_metadata={"raw": {}},
)
def _adapter(items: Iterable[Any]) -> Iterable[ResultModel]:
for item in items:
sr = _ensure_search_result(item)
metadata = dict(getattr(sr, "full_metadata", {}) or {})
raw = metadata.get("raw")
if isinstance(raw, dict):
normalized = normalize_record(raw)
for key, val in normalized.items():
metadata.setdefault(key, val)
def _make_url() -> str:
candidate = (
metadata.get("selection_url") or
metadata.get("direct_url") or
metadata.get("url") or
metadata.get("path") or
sr.path or
""
)
return str(candidate or "").strip()
selection_url = _make_url()
if selection_url:
metadata["selection_url"] = selection_url
authors_value = metadata.get("authors_display") or metadata.get("authors") or metadata.get("author_name") or ""
if isinstance(authors_value, list):
authors_value = ", ".join(str(v) for v in authors_value if v)
authors_text = str(authors_value or "").strip()
if authors_text:
metadata["authors_display"] = authors_text
year_value = metadata.get("year") or metadata.get("first_publish_year")
if year_value and not isinstance(year_value, str):
year_value = str(year_value)
if year_value:
metadata["year"] = str(year_value)
metadata.setdefault("openlibrary_id", metadata.get("openlibrary_id") or metadata.get("olid"))
metadata.setdefault("source", metadata.get("source") or "openlibrary")
yield ResultModel(
title=str(sr.title or metadata.get("title") or selection_url or "OpenLibrary"),
path=selection_url or None,
metadata=metadata,
source="openlibrary",
)
def _columns_factory(rows: List[ResultModel]) -> List[ColumnSpec]:
cols: List[ColumnSpec] = [title_column()]
def _has(key: str) -> bool:
return any((row.metadata or {}).get(key) for row in rows)
if _has("authors_display"):
cols.append(
ColumnSpec(
"authors_display",
"Author",
lambda r: (r.metadata or {}).get("authors_display") or "",
)
)
if _has("year"):
cols.append(metadata_column("year", "Year"))
if _has("availability"):
cols.append(metadata_column("availability", "Avail"))
if _has("archive_id"):
cols.append(metadata_column("archive_id", "Archive ID"))
if _has("openlibrary_id"):
cols.append(metadata_column("openlibrary_id", "OLID"))
return cols
def _selection_fn(row: ResultModel) -> List[str]:
metadata = row.metadata or {}
url = str(metadata.get("selection_url") or row.path or "").strip()
if url:
return ["-url", url]
return ["-title", row.title or ""]
register_provider(
"openlibrary",
_adapter,
columns=_columns_factory,
selection_fn=_selection_fn,
metadata={"description": "OpenLibrary search provider (JSON result table template)"},
)
except Exception:
pass
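A hedged sketch of how the registered pieces compose, assuming the optional result-table imports above succeeded; driving the adapter directly like this is illustrative only:

rows = list(_adapter(SAMPLE_ITEMS))  # dicts -> ResultModel rows
cols = _columns_factory(rows)        # columns appear only when populated
for row in rows:
    debug(f"{row.title}: {_selection_fn(row)}")
# The sample item carries a direct_url, so selection resolves to ["-url", ...].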
# Registry ---------------------------------------------------------------
_METADATA_PROVIDERS: Dict[str,

View File

@@ -11,18 +11,29 @@ import sys
import tempfile
import time
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from urllib.parse import urlparse
import requests
from API.HTTP import HTTPClient
from API.HTTP import HTTPClient, get_requests_verify_value
from ProviderCore.base import Provider, SearchResult
from SYS.utils import sanitize_filename
from SYS.cli_syntax import get_field, get_free_text, parse_query
from SYS.logger import debug, log
from Provider.metadata_provider import (
archive_item_metadata_to_tags,
fetch_archive_item_metadata,
)
from SYS.utils import unique_path
_ARCHIVE_VERIFY_VALUE = get_requests_verify_value()
def _create_archive_session() -> requests.Session:
session = requests.Session()
session.verify = _ARCHIVE_VERIFY_VALUE
return session
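A hedged usage sketch (hypothetical item id): routing every Archive.org call through the factory, or passing verify=_ARCHIVE_VERIFY_VALUE explicitly, keeps TLS verification consistent with the API.HTTP configuration:

session = _create_archive_session()
resp = session.get("https://archive.org/metadata/exampleitem01", timeout=8)
resp.raise_for_status()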
try:
from Crypto.Cipher import AES # type: ignore
from Crypto.Util import Counter # type: ignore
@@ -262,182 +273,6 @@ def title_hint_from_url_slug(u: str) -> str:
return slug or "OpenLibrary"
def _coerce_archive_field_list(value: Any) -> List[str]:
"""Coerce an Archive.org metadata field to a list of strings."""
if value is None:
return []
if isinstance(value, list):
out: List[str] = []
for v in value:
try:
s = str(v).strip()
except Exception:
continue
if s:
out.append(s)
return out
if isinstance(value, (tuple, set)):
out = []
for v in value:
try:
s = str(v).strip()
except Exception:
continue
if s:
out.append(s)
return out
try:
s = str(value).strip()
except Exception:
return []
return [s] if s else []
def _archive_item_metadata_to_tags(archive_id: str, item_metadata: Dict[str, Any]) -> List[str]:
"""Map Archive.org metadata JSON (the `metadata` object) to tag strings.
This is intentionally best-effort and conservative: it focuses on stable,
useful bibliographic fields (title/author/publisher/ISBN/identifier/topics).
"""
archive_id_clean = str(archive_id or "").strip()
meta = item_metadata if isinstance(item_metadata, dict) else {}
tags: List[str] = []
seen: set[str] = set()
def _add(tag: str) -> None:
try:
t = str(tag).strip()
except Exception:
return
if not t:
return
if t.lower() in seen:
return
seen.add(t.lower())
tags.append(t)
if archive_id_clean:
_add(f"internet_archive:{archive_id_clean}")
# Title
for title in _coerce_archive_field_list(meta.get("title"))[:1]:
_add(f"title:{title}")
# Authors/creators
creators: List[str] = []
creators.extend(_coerce_archive_field_list(meta.get("creator")))
creators.extend(_coerce_archive_field_list(meta.get("author")))
for creator in creators[:3]:
_add(f"author:{creator}")
# Publisher
for publisher in _coerce_archive_field_list(meta.get("publisher"))[:3]:
_add(f"publisher:{publisher}")
# Publish date/year
for date_val in _coerce_archive_field_list(meta.get("date"))[:1]:
_add(f"publish_date:{date_val}")
for year_val in _coerce_archive_field_list(meta.get("year"))[:1]:
_add(f"publish_date:{year_val}")
# Language
for lang in _coerce_archive_field_list(meta.get("language"))[:3]:
_add(f"language:{lang}")
# Topics/subjects: follow existing OpenLibrary behavior (un-namespaced tags)
for subj in _coerce_archive_field_list(meta.get("subject"))[:15]:
if len(subj) > 200:
subj = subj[:200]
_add(subj)
# ISBNs and identifiers
def _clean_isbn(raw: str) -> str:
return str(raw or "").replace("-", "").strip()
for isbn in _coerce_archive_field_list(meta.get("isbn"))[:10]:
isbn_clean = _clean_isbn(isbn)
if isbn_clean:
_add(f"isbn:{isbn_clean}")
identifiers: List[str] = []
identifiers.extend(_coerce_archive_field_list(meta.get("identifier")))
identifiers.extend(_coerce_archive_field_list(meta.get("external-identifier")))
added_other = 0
for ident in identifiers:
ident_s = str(ident or "").strip()
if not ident_s:
continue
low = ident_s.lower()
if low.startswith("urn:isbn:"):
val = _clean_isbn(ident_s.split(":", 2)[-1])
if val:
_add(f"isbn:{val}")
continue
if low.startswith("isbn:"):
val = _clean_isbn(ident_s.split(":", 1)[-1])
if val:
_add(f"isbn:{val}")
continue
if low.startswith("urn:oclc:"):
val = ident_s.split(":", 2)[-1].strip()
if val:
_add(f"oclc:{val}")
continue
if low.startswith("oclc:"):
val = ident_s.split(":", 1)[-1].strip()
if val:
_add(f"oclc:{val}")
continue
if low.startswith("urn:lccn:"):
val = ident_s.split(":", 2)[-1].strip()
if val:
_add(f"lccn:{val}")
continue
if low.startswith("lccn:"):
val = ident_s.split(":", 1)[-1].strip()
if val:
_add(f"lccn:{val}")
continue
if low.startswith("doi:"):
val = ident_s.split(":", 1)[-1].strip()
if val:
_add(f"doi:{val}")
continue
if archive_id_clean and low == archive_id_clean.lower():
continue
if added_other >= 5:
continue
if len(ident_s) > 200:
ident_s = ident_s[:200]
_add(f"identifier:{ident_s}")
added_other += 1
return tags
def _fetch_archive_item_metadata(archive_id: str, *, timeout: int = 8) -> Dict[str, Any]:
ident = str(archive_id or "").strip()
if not ident:
return {}
resp = requests.get(f"https://archive.org/metadata/{ident}", timeout=int(timeout))
resp.raise_for_status()
data = resp.json() if resp is not None else {}
if not isinstance(data, dict):
return {}
meta = data.get("metadata")
return meta if isinstance(meta, dict) else {}
class OpenLibrary(Provider):
TABLE_AUTO_STAGES = {
@@ -466,7 +301,7 @@ class OpenLibrary(Provider):
def __init__(self, config: Optional[Dict[str, Any]] = None):
super().__init__(config)
self._session = requests.Session()
self._session = _create_archive_session()
class BookNotAvailableError(Exception):
"""Raised when a book is not available for borrowing (waitlisted/in use)."""
@@ -612,7 +447,7 @@ class OpenLibrary(Provider):
@classmethod
def _archive_login(cls, email: str, password: str) -> requests.Session:
"""Login to archive.org using the token-based services endpoint (matches test-login.py)."""
session = requests.Session()
session = _create_archive_session()
token_resp = session.get(
"https://archive.org/services/account/login/",
@@ -766,7 +601,11 @@ class OpenLibrary(Provider):
if not ident:
return False, "no-archive-id"
try:
resp = requests.get(f"https://archive.org/metadata/{ident}", timeout=8)
resp = requests.get(
f"https://archive.org/metadata/{ident}",
timeout=8,
verify=_ARCHIVE_VERIFY_VALUE,
)
resp.raise_for_status()
data = resp.json() if resp is not None else {}
meta = data.get("metadata",
@@ -976,7 +815,11 @@ class OpenLibrary(Provider):
"""Check for a directly downloadable original PDF in Archive.org metadata."""
try:
metadata_url = f"https://archive.org/metadata/{book_id}"
response = requests.get(metadata_url, timeout=6)
response = requests.get(
metadata_url,
timeout=6,
verify=_ARCHIVE_VERIFY_VALUE,
)
response.raise_for_status()
metadata = response.json()
files = metadata.get("files") if isinstance(metadata, dict) else None
@@ -993,7 +836,8 @@ class OpenLibrary(Provider):
check_response = requests.head(
pdf_url,
timeout=4,
allow_redirects=True
allow_redirects=True,
verify=_ARCHIVE_VERIFY_VALUE,
)
if check_response.status_code == 200:
return True, pdf_url
@@ -1001,235 +845,6 @@ class OpenLibrary(Provider):
except Exception:
return False, ""
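A hedged sketch of the probe above in isolation (hypothetical item and file names):

pdf_url = "https://archive.org/download/exampleitem01/exampleitem01.pdf"
head = requests.head(pdf_url, timeout=4, allow_redirects=True,
                     verify=_ARCHIVE_VERIFY_VALUE)
direct_pdf_available = (head.status_code == 200)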
@staticmethod
def scrape_isbn_metadata(isbn: str) -> List[str]:
"""Scrape tags for an ISBN using Open Library API.
Returns tags such as:
- title:<...>, author:<...>, publish_date:<...>, publisher:<...>, description:<...>, pages:<...>
- identifiers: openlibrary:<...>, lccn:<...>, oclc:<...>, goodreads:<...>, librarything:<...>, doi:<...>, internet_archive:<...>
"""
new_tags: List[str] = []
isbn_clean = str(isbn or "").replace("isbn:", "").replace("-", "").strip()
if not isbn_clean:
return []
url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json"
try:
with HTTPClient() as client:
response = client.get(url)
response.raise_for_status()
data = json_module.loads(response.content.decode("utf-8"))
except Exception as exc:
log(f"Failed to fetch ISBN metadata: {exc}", file=sys.stderr)
return []
if not data:
log(f"No ISBN metadata found for: {isbn}")
return []
book_data = next(iter(data.values()), None)
if not isinstance(book_data, dict):
return []
if "title" in book_data:
new_tags.append(f"title:{book_data['title']}")
authors = book_data.get("authors")
if isinstance(authors, list):
for author in authors[:3]:
if isinstance(author, dict) and author.get("name"):
new_tags.append(f"author:{author['name']}")
if book_data.get("publish_date"):
new_tags.append(f"publish_date:{book_data['publish_date']}")
publishers = book_data.get("publishers")
if isinstance(publishers, list) and publishers:
pub = publishers[0]
if isinstance(pub, dict) and pub.get("name"):
new_tags.append(f"publisher:{pub['name']}")
if "description" in book_data:
desc = book_data.get("description")
if isinstance(desc, dict) and "value" in desc:
desc = desc.get("value")
if desc:
desc_str = str(desc).strip()
if desc_str:
new_tags.append(f"description:{desc_str[:200]}")
page_count = book_data.get("number_of_pages")
if isinstance(page_count, int) and page_count > 0:
new_tags.append(f"pages:{page_count}")
identifiers = book_data.get("identifiers")
if isinstance(identifiers, dict):
def _first(value: Any) -> Any:
if isinstance(value, list) and value:
return value[0]
return value
for key, ns in (
("openlibrary", "openlibrary"),
("lccn", "lccn"),
("oclc", "oclc"),
("goodreads", "goodreads"),
("librarything", "librarything"),
("doi", "doi"),
("internet_archive", "internet_archive"),
):
val = _first(identifiers.get(key))
if val:
new_tags.append(f"{ns}:{val}")
debug(f"Found {len(new_tags)} tag(s) from ISBN lookup")
return new_tags
@staticmethod
def scrape_openlibrary_metadata(olid: str) -> List[str]:
"""Scrape tags for an OpenLibrary ID using the .json API endpoint."""
new_tags: List[str] = []
olid_text = str(olid or "").strip()
if not olid_text:
return []
# Normalize OLID to the common "OL<digits>M" form when possible.
olid_norm = olid_text
try:
if not olid_norm.startswith("OL"):
olid_norm = f"OL{olid_norm}"
if not olid_norm.endswith("M"):
olid_norm = f"{olid_norm}M"
except Exception:
olid_norm = olid_text
# Ensure we always include a scrapeable identifier tag.
new_tags.append(f"openlibrary:{olid_norm}")
# Accept OL9674499M, 9674499M, or just digits.
olid_clean = olid_text.replace("OL", "").replace("M", "")
if not olid_clean.isdigit():
olid_clean = olid_text
if not olid_text.startswith("OL"):
url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
else:
url = f"https://openlibrary.org/books/{olid_text}.json"
try:
with HTTPClient() as client:
response = client.get(url)
response.raise_for_status()
data = json_module.loads(response.content.decode("utf-8"))
except Exception as exc:
log(f"Failed to fetch OpenLibrary metadata: {exc}", file=sys.stderr)
return []
if not isinstance(data, dict) or not data:
log(f"No OpenLibrary metadata found for: {olid_text}")
return []
if "title" in data:
new_tags.append(f"title:{data['title']}")
authors = data.get("authors")
if isinstance(authors, list):
for author in authors[:3]:
if isinstance(author, dict) and author.get("name"):
new_tags.append(f"author:{author['name']}")
continue
# Common OL shape: {"key": "/authors/OL...A"} or {"author": {"key": ...}}
author_key = None
if isinstance(author, dict):
if isinstance(author.get("author"), dict):
author_key = author.get("author", {}).get("key")
if not author_key:
author_key = author.get("key")
if isinstance(author_key, str) and author_key.startswith("/"):
try:
author_url = f"https://openlibrary.org{author_key}.json"
with HTTPClient(timeout=10) as client:
author_resp = client.get(author_url)
author_resp.raise_for_status()
author_data = json_module.loads(author_resp.content.decode("utf-8"))
if isinstance(author_data, dict) and author_data.get("name"):
new_tags.append(f"author:{author_data['name']}")
continue
except Exception:
pass
if isinstance(author, str) and author:
new_tags.append(f"author:{author}")
if data.get("publish_date"):
new_tags.append(f"publish_date:{data['publish_date']}")
publishers = data.get("publishers")
if isinstance(publishers, list) and publishers:
pub = publishers[0]
if isinstance(pub, dict) and pub.get("name"):
new_tags.append(f"publisher:{pub['name']}")
elif isinstance(pub, str) and pub:
new_tags.append(f"publisher:{pub}")
if "description" in data:
desc = data.get("description")
if isinstance(desc, dict) and "value" in desc:
desc = desc.get("value")
if desc:
desc_str = str(desc).strip()
if desc_str:
new_tags.append(f"description:{desc_str[:200]}")
page_count = data.get("number_of_pages")
if isinstance(page_count, int) and page_count > 0:
new_tags.append(f"pages:{page_count}")
subjects = data.get("subjects")
if isinstance(subjects, list):
for subject in subjects[:10]:
if isinstance(subject, str):
subject_clean = subject.strip()
if subject_clean and subject_clean not in new_tags:
new_tags.append(subject_clean)
identifiers = data.get("identifiers")
if isinstance(identifiers, dict):
def _first(value: Any) -> Any:
if isinstance(value, list) and value:
return value[0]
return value
for key, ns in (
("isbn_10", "isbn_10"),
("isbn_13", "isbn_13"),
("lccn", "lccn"),
("oclc_numbers", "oclc"),
("goodreads", "goodreads"),
("internet_archive", "internet_archive"),
):
val = _first(identifiers.get(key))
if val:
new_tags.append(f"{ns}:{val}")
# Some editions expose a direct Archive.org identifier as "ocaid".
ocaid = data.get("ocaid")
if isinstance(ocaid, str) and ocaid.strip():
new_tags.append(f"internet_archive:{ocaid.strip()}")
debug(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup")
return new_tags
def search(
self,
query: str,
@@ -1293,7 +908,7 @@ class OpenLibrary(Provider):
ia_val_local = []
ia_ids_local = [str(x) for x in ia_val_local if x]
session_local = requests.Session()
session_local = _create_archive_session()
try:
archive_id_local = _resolve_archive_id(
@@ -1423,19 +1038,38 @@ class OpenLibrary(Provider):
"borrow"}:
annotations.append(availability)
book_path = (
f"https://openlibrary.org/books/{edition_id}" if edition_id else
(
f"https://openlibrary.org{work_key}"
if isinstance(work_key, str) and work_key.startswith("/") else
"https://openlibrary.org"
)
)
metadata = {
"openlibrary_id": edition_id,
"openlibrary_key": work_key,
"authors": authors_list,
"year": year,
"isbn_10": isbn_10,
"isbn_13": isbn_13,
"ia": ia_ids,
"availability": availability,
"availability_reason": availability_reason,
"archive_id": archive_id,
"direct_url": direct_url,
"raw": doc,
}
if book_path:
metadata["selection_url"] = book_path
metadata["_selection_args"] = ["-url", book_path]
metadata["_selection_action"] = ["download-file", "-url", book_path]
results.append(
SearchResult(
table="openlibrary",
title=book_title,
path=(
f"https://openlibrary.org/books/{edition_id}" if edition_id else
(
f"https://openlibrary.org{work_key}"
if isinstance(work_key, str) and work_key.startswith("/") else
"https://openlibrary.org"
)
),
path=book_path,
detail=(
(f"By: {', '.join(authors_list)}" if authors_list else "") +
(f" ({year})" if year else "")
@@ -1443,20 +1077,7 @@ class OpenLibrary(Provider):
annotations=annotations,
media_kind="book",
columns=columns,
full_metadata={
"openlibrary_id": edition_id,
"openlibrary_key": work_key,
"authors": authors_list,
"year": year,
"isbn_10": isbn_10,
"isbn_13": isbn_13,
"ia": ia_ids,
"availability": availability,
"availability_reason": availability_reason,
"archive_id": archive_id,
"direct_url": direct_url,
"raw": doc,
},
full_metadata=metadata,
)
)
@@ -1507,8 +1128,8 @@ class OpenLibrary(Provider):
# Best-effort metadata scrape to attach bibliographic tags for downstream cmdlets.
try:
archive_meta = _fetch_archive_item_metadata(archive_id)
tags = _archive_item_metadata_to_tags(archive_id, archive_meta)
archive_meta = fetch_archive_item_metadata(archive_id)
tags = archive_item_metadata_to_tags(archive_id, archive_meta)
if tags:
try:
result.tag.update(tags)