2026-01-06 16:19:29 -08:00
parent 41c11d39fd
commit edc33f4528
10 changed files with 1192 additions and 881 deletions


@@ -8,6 +8,9 @@ import requests
import sys
import json
import subprocess
from API.HTTP import HTTPClient
from ProviderCore.base import SearchResult

try:  # Optional dependency for IMDb scraping
    from imdbinfo.services import search_title  # type: ignore
except ImportError:  # pragma: no cover - optional
@@ -15,6 +18,7 @@ except ImportError:  # pragma: no cover - optional
from SYS.logger import log, debug
from SYS.metadata import imdb_tag
from SYS.json_table import normalize_record

try:  # Optional dependency
    import musicbrainzngs  # type: ignore
@@ -892,6 +896,524 @@ class YtdlpMetadataProvider(MetadataProvider):
        return out


def _coerce_archive_field_list(value: Any) -> List[str]:
    """Coerce an Archive.org metadata field to a list of strings."""
    if value is None:
        return []
    # Archive.org fields may arrive as a scalar or as a list/tuple/set of
    # scalars; the iterable branches are identical, so handle them together.
    if isinstance(value, (list, tuple, set)):
        out: List[str] = []
        for v in value:
            try:
                s = str(v).strip()
            except Exception:
                continue
            if s:
                out.append(s)
        return out
    try:
        s = str(value).strip()
    except Exception:
        return []
    return [s] if s else []
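
# Illustrative sketch (not part of this commit): how _coerce_archive_field_list
# flattens the shapes Archive.org actually returns -- a missing field, a bare
# scalar, or a list -- into a uniform list of non-empty strings.
#
#     _coerce_archive_field_list(None)               # -> []
#     _coerce_archive_field_list("Dante Alighieri")  # -> ["Dante Alighieri"]
#     _coerce_archive_field_list(["a", "", " b "])   # -> ["a", "b"]
#     _coerce_archive_field_list(1923)               # -> ["1923"]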
def archive_item_metadata_to_tags(archive_id: str,
                                  item_metadata: Dict[str, Any]) -> List[str]:
    """Coerce Archive.org metadata into a stable set of bibliographic tags."""
    archive_id_clean = str(archive_id or "").strip()
    meta = item_metadata if isinstance(item_metadata, dict) else {}
    tags: List[str] = []
    seen: set[str] = set()

    def _add(tag: str) -> None:
        # De-duplicate case-insensitively while preserving first-seen casing.
        try:
            t = str(tag).strip()
        except Exception:
            return
        if not t:
            return
        if t.lower() in seen:
            return
        seen.add(t.lower())
        tags.append(t)

    if archive_id_clean:
        _add(f"internet_archive:{archive_id_clean}")
    for title in _coerce_archive_field_list(meta.get("title"))[:1]:
        _add(f"title:{title}")
    creators: List[str] = []
    creators.extend(_coerce_archive_field_list(meta.get("creator")))
    creators.extend(_coerce_archive_field_list(meta.get("author")))
    for creator in creators[:3]:
        _add(f"author:{creator}")
    for publisher in _coerce_archive_field_list(meta.get("publisher"))[:3]:
        _add(f"publisher:{publisher}")
    for date_val in _coerce_archive_field_list(meta.get("date"))[:1]:
        _add(f"publish_date:{date_val}")
    for year_val in _coerce_archive_field_list(meta.get("year"))[:1]:
        _add(f"publish_date:{year_val}")
    for lang in _coerce_archive_field_list(meta.get("language"))[:3]:
        _add(f"language:{lang}")
    for subj in _coerce_archive_field_list(meta.get("subject"))[:15]:
        if len(subj) > 200:
            subj = subj[:200]
        _add(subj)

    def _clean_isbn(raw: str) -> str:
        return str(raw or "").replace("-", "").strip()

    for isbn in _coerce_archive_field_list(meta.get("isbn"))[:10]:
        isbn_clean = _clean_isbn(isbn)
        if isbn_clean:
            _add(f"isbn:{isbn_clean}")
    # Namespaced identifiers (urn:isbn:..., oclc:..., doi:...) map onto
    # dedicated tag prefixes; everything else is capped at five generic
    # "identifier:" tags so noisy items cannot flood the tag list.
    identifiers: List[str] = []
    identifiers.extend(_coerce_archive_field_list(meta.get("identifier")))
    identifiers.extend(_coerce_archive_field_list(meta.get("external-identifier")))
    added_other = 0
    for ident in identifiers:
        ident_s = str(ident or "").strip()
        if not ident_s:
            continue
        low = ident_s.lower()
        if low.startswith("urn:isbn:"):
            val = _clean_isbn(ident_s.split(":", 2)[-1])
            if val:
                _add(f"isbn:{val}")
            continue
        if low.startswith("isbn:"):
            val = _clean_isbn(ident_s.split(":", 1)[-1])
            if val:
                _add(f"isbn:{val}")
            continue
        if low.startswith("urn:oclc:"):
            val = ident_s.split(":", 2)[-1].strip()
            if val:
                _add(f"oclc:{val}")
            continue
        if low.startswith("oclc:"):
            val = ident_s.split(":", 1)[-1].strip()
            if val:
                _add(f"oclc:{val}")
            continue
        if low.startswith("urn:lccn:"):
            val = ident_s.split(":", 2)[-1].strip()
            if val:
                _add(f"lccn:{val}")
            continue
        if low.startswith("lccn:"):
            val = ident_s.split(":", 1)[-1].strip()
            if val:
                _add(f"lccn:{val}")
            continue
        if low.startswith("doi:"):
            val = ident_s.split(":", 1)[-1].strip()
            if val:
                _add(f"doi:{val}")
            continue
        if archive_id_clean and low == archive_id_clean.lower():
            continue
        if added_other >= 5:
            continue
        if len(ident_s) > 200:
            ident_s = ident_s[:200]
        _add(f"identifier:{ident_s}")
        added_other += 1
    return tags
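
# Illustrative sketch (not part of this commit): a trimmed-down metadata dict
# run through archive_item_metadata_to_tags. The item id and values are made
# up; real /metadata responses carry many more fields.
#
#     archive_item_metadata_to_tags(
#         "exampleitem00demo",
#         {
#             "title": "Example Title",
#             "creator": ["Example Author"],
#             "year": "1999",
#             "identifier": ["urn:isbn:9780000000002", "urn:oclc:12345"],
#         },
#     )
#     # -> ["internet_archive:exampleitem00demo", "title:Example Title",
#     #     "author:Example Author", "publish_date:1999",
#     #     "isbn:9780000000002", "oclc:12345"]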
def fetch_archive_item_metadata(archive_id: str,
                                *,
                                timeout: int = 8) -> Dict[str, Any]:
    """Fetch the ``metadata`` block of an Archive.org item.

    Network and HTTP errors are deliberately not swallowed here; callers
    decide how to handle a failed lookup.
    """
    ident = str(archive_id or "").strip()
    if not ident:
        return {}
    resp = requests.get(f"https://archive.org/metadata/{ident}", timeout=int(timeout))
    resp.raise_for_status()
    data = resp.json()  # requests.get never returns None, so no guard is needed
    if not isinstance(data, dict):
        return {}
    meta = data.get("metadata")
    return meta if isinstance(meta, dict) else {}
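
# Illustrative sketch (not part of this commit): the two helpers above compose
# into an "identifier -> tags" lookup. Since fetch_archive_item_metadata does
# not swallow network errors, the caller handles them:
#
#     try:
#         meta = fetch_archive_item_metadata("exampleitem00demo")  # made-up id
#         tags = archive_item_metadata_to_tags("exampleitem00demo", meta)
#     except requests.RequestException:
#         tags = []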
def scrape_isbn_metadata(isbn: str) -> List[str]:
    """Scrape metadata tags for an ISBN using OpenLibrary's books API."""
    new_tags: List[str] = []
    # Accept bare, hyphenated, or "isbn:"-prefixed input.
    isbn_clean = str(isbn or "").replace("isbn:", "").replace("-", "").strip()
    if not isbn_clean:
        return []
    url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json"
    try:
        with HTTPClient() as client:
            response = client.get(url)
            response.raise_for_status()
            data = json.loads(response.content.decode("utf-8"))
    except Exception as exc:
        log(f"Failed to fetch ISBN metadata: {exc}", file=sys.stderr)
        return []
    if not data:
        log(f"No ISBN metadata found for: {isbn}")
        return []
    # The response is keyed by bibkey ("ISBN:<n>"); we only asked for one.
    book_data = next(iter(data.values()), None)
    if not isinstance(book_data, dict):
        return []
    if "title" in book_data:
        new_tags.append(f"title:{book_data['title']}")
    authors = book_data.get("authors")
    if isinstance(authors, list):
        for author in authors[:3]:
            if isinstance(author, dict) and author.get("name"):
                new_tags.append(f"author:{author['name']}")
    if book_data.get("publish_date"):
        new_tags.append(f"publish_date:{book_data['publish_date']}")
    publishers = book_data.get("publishers")
    if isinstance(publishers, list) and publishers:
        pub = publishers[0]
        if isinstance(pub, dict) and pub.get("name"):
            new_tags.append(f"publisher:{pub['name']}")
    if "description" in book_data:
        desc = book_data.get("description")
        if isinstance(desc, dict) and "value" in desc:
            desc = desc.get("value")
        if desc:
            desc_str = str(desc).strip()
            if desc_str:
                new_tags.append(f"description:{desc_str[:200]}")
    page_count = book_data.get("number_of_pages")
    if isinstance(page_count, int) and page_count > 0:
        new_tags.append(f"pages:{page_count}")
    identifiers = book_data.get("identifiers")
    if isinstance(identifiers, dict):

        def _first(value: Any) -> Any:
            if isinstance(value, list) and value:
                return value[0]
            return value

        for key, ns in (
            ("openlibrary", "openlibrary"),
            ("lccn", "lccn"),
            ("oclc", "oclc"),
            ("goodreads", "goodreads"),
            ("librarything", "librarything"),
            ("doi", "doi"),
            ("internet_archive", "internet_archive"),
        ):
            val = _first(identifiers.get(key))
            if val:
                new_tags.append(f"{ns}:{val}")
    debug(f"Found {len(new_tags)} tag(s) from ISBN lookup")
    return new_tags
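
# Illustrative sketch (not part of this commit): because of the input
# normalization above, all three calls resolve to the same OpenLibrary bibkey
# (the ISBN shown is a placeholder):
#
#     scrape_isbn_metadata("9780000000002")
#     scrape_isbn_metadata("978-0-00-000000-2")
#     scrape_isbn_metadata("isbn:9780000000002")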
def scrape_openlibrary_metadata(olid: str) -> List[str]:
    """Scrape metadata tags for an OpenLibrary ID using the edition JSON endpoint."""
    new_tags: List[str] = []
    olid_text = str(olid or "").strip()
    if not olid_text:
        return []
    # Normalize to the canonical edition form "OL<digits>M" for the tag; the
    # request URL below is built separately from the caller's original input.
    olid_norm = olid_text
    try:
        if not olid_norm.startswith("OL"):
            olid_norm = f"OL{olid_norm}"
        if not olid_norm.endswith("M"):
            olid_norm = f"{olid_norm}M"
    except Exception:
        olid_norm = olid_text
    new_tags.append(f"openlibrary:{olid_norm}")
    olid_clean = olid_text.replace("OL", "").replace("M", "")
    if not olid_clean.isdigit():
        olid_clean = olid_text
    if not olid_text.startswith("OL"):
        url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
    else:
        url = f"https://openlibrary.org/books/{olid_text}.json"
    try:
        with HTTPClient() as client:
            response = client.get(url)
            response.raise_for_status()
            data = json.loads(response.content.decode("utf-8"))
    except Exception as exc:
        log(f"Failed to fetch OpenLibrary metadata: {exc}", file=sys.stderr)
        return []
    if not isinstance(data, dict) or not data:
        log(f"No OpenLibrary metadata found for: {olid_text}")
        return []
    if "title" in data:
        new_tags.append(f"title:{data['title']}")
    # Authors may be inline dicts with a name, references to /authors/<key>
    # records (which need a second fetch), or plain strings.
    authors = data.get("authors")
    if isinstance(authors, list):
        for author in authors[:3]:
            if isinstance(author, dict) and author.get("name"):
                new_tags.append(f"author:{author['name']}")
                continue
            author_key = None
            if isinstance(author, dict):
                if isinstance(author.get("author"), dict):
                    author_key = author.get("author", {}).get("key")
                if not author_key:
                    author_key = author.get("key")
            if isinstance(author_key, str) and author_key.startswith("/"):
                try:
                    author_url = f"https://openlibrary.org{author_key}.json"
                    with HTTPClient(timeout=10) as client:
                        author_resp = client.get(author_url)
                        author_resp.raise_for_status()
                        author_data = json.loads(author_resp.content.decode("utf-8"))
                    if isinstance(author_data, dict) and author_data.get("name"):
                        new_tags.append(f"author:{author_data['name']}")
                        continue
                except Exception:
                    pass
            if isinstance(author, str) and author:
                new_tags.append(f"author:{author}")
    if data.get("publish_date"):
        new_tags.append(f"publish_date:{data['publish_date']}")
    publishers = data.get("publishers")
    if isinstance(publishers, list) and publishers:
        pub = publishers[0]
        if isinstance(pub, dict) and pub.get("name"):
            new_tags.append(f"publisher:{pub['name']}")
        elif isinstance(pub, str) and pub:
            new_tags.append(f"publisher:{pub}")
    if "description" in data:
        desc = data.get("description")
        if isinstance(desc, dict) and "value" in desc:
            desc = desc.get("value")
        if desc:
            desc_str = str(desc).strip()
            if desc_str:
                new_tags.append(f"description:{desc_str[:200]}")
    page_count = data.get("number_of_pages")
    if isinstance(page_count, int) and page_count > 0:
        new_tags.append(f"pages:{page_count}")
    subjects = data.get("subjects")
    if isinstance(subjects, list):
        for subject in subjects[:10]:
            if isinstance(subject, str):
                subject_clean = subject.strip()
                if subject_clean and subject_clean not in new_tags:
                    new_tags.append(subject_clean)
    identifiers = data.get("identifiers")
    if isinstance(identifiers, dict):

        def _first(value: Any) -> Any:
            if isinstance(value, list) and value:
                return value[0]
            return value

        for key, ns in (
            ("isbn_10", "isbn_10"),
            ("isbn_13", "isbn_13"),
            ("lccn", "lccn"),
            ("oclc_numbers", "oclc"),
            ("goodreads", "goodreads"),
            ("internet_archive", "internet_archive"),
        ):
            val = _first(identifiers.get(key))
            if val:
                new_tags.append(f"{ns}:{val}")
    ocaid = data.get("ocaid")
    if isinstance(ocaid, str) and ocaid.strip():
        new_tags.append(f"internet_archive:{ocaid.strip()}")
    debug(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup")
    return new_tags
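
# Illustrative sketch (not part of this commit): the OLID normalization above
# means both calls hit https://openlibrary.org/books/OL123M.json (OL123M is a
# placeholder) and both tag lists start with "openlibrary:OL123M":
#
#     scrape_openlibrary_metadata("OL123M")
#     scrape_openlibrary_metadata("123")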
SAMPLE_ITEMS: List[Dict[str, Any]] = [
    {
        "title": "Sample OpenLibrary book",
        "path": "https://openlibrary.org/books/OL123M",
        "openlibrary_id": "OL123M",
        "archive_id": "samplearchive123",
        "availability": "borrow",
        "availability_reason": "sample",
        "direct_url": "https://archive.org/download/sample.pdf",
        "author_name": ["OpenLibrary Demo"],
        "first_publish_year": 2023,
        "ia": ["samplearchive123"],
    },
]
# Result-table registration is best-effort: if the result-table API cannot be
# imported, the bare except at the bottom leaves the provider unregistered.
try:
    from typing import Iterable

    from SYS.result_table_api import ColumnSpec, ResultModel, metadata_column, title_column
    from SYS.result_table_adapters import register_provider

    def _ensure_search_result(item: Any) -> SearchResult:
        if isinstance(item, SearchResult):
            return item
        if isinstance(item, dict):
            data = dict(item)
            title = str(data.get("title") or data.get("name") or "OpenLibrary")
            path = str(data.get("path") or data.get("url") or "")
            detail = str(data.get("detail") or "")
            annotations = list(data.get("annotations") or [])
            media_kind = str(data.get("media_kind") or "book")
            return SearchResult(
                table="openlibrary",
                title=title,
                path=path,
                detail=detail,
                annotations=annotations,
                media_kind=media_kind,
                columns=data.get("columns") or [],
                full_metadata={**data, "raw": dict(item)},
            )
        return SearchResult(
            table="openlibrary",
            title=str(item or "OpenLibrary"),
            path="",
            detail="",
            annotations=[],
            media_kind="book",
            full_metadata={"raw": {}},
        )

    def _adapter(items: Iterable[Any]) -> Iterable[ResultModel]:
        for item in items:
            sr = _ensure_search_result(item)
            metadata = dict(getattr(sr, "full_metadata", {}) or {})
            raw = metadata.get("raw")
            if isinstance(raw, dict):
                normalized = normalize_record(raw)
                for key, val in normalized.items():
                    metadata.setdefault(key, val)

            def _make_url() -> str:
                candidate = (
                    metadata.get("selection_url") or
                    metadata.get("direct_url") or
                    metadata.get("url") or
                    metadata.get("path") or
                    sr.path or
                    ""
                )
                return str(candidate or "").strip()

            selection_url = _make_url()
            if selection_url:
                metadata["selection_url"] = selection_url
            authors_value = metadata.get("authors_display") or metadata.get("authors") or metadata.get("author_name") or ""
            if isinstance(authors_value, list):
                authors_value = ", ".join(str(v) for v in authors_value if v)
            authors_text = str(authors_value or "").strip()
            if authors_text:
                metadata["authors_display"] = authors_text
            year_value = metadata.get("year") or metadata.get("first_publish_year")
            if year_value and not isinstance(year_value, str):
                year_value = str(year_value)
            if year_value:
                metadata["year"] = str(year_value)
            metadata.setdefault("openlibrary_id", metadata.get("openlibrary_id") or metadata.get("olid"))
            metadata.setdefault("source", metadata.get("source") or "openlibrary")
            yield ResultModel(
                title=str(sr.title or metadata.get("title") or selection_url or "OpenLibrary"),
                path=selection_url or None,
                metadata=metadata,
                source="openlibrary",
            )

    def _columns_factory(rows: List[ResultModel]) -> List[ColumnSpec]:
        # Only show columns for which at least one row has a value.
        cols: List[ColumnSpec] = [title_column()]

        def _has(key: str) -> bool:
            return any((row.metadata or {}).get(key) for row in rows)

        if _has("authors_display"):
            cols.append(
                ColumnSpec(
                    "authors_display",
                    "Author",
                    lambda r: (r.metadata or {}).get("authors_display") or "",
                )
            )
        if _has("year"):
            cols.append(metadata_column("year", "Year"))
        if _has("availability"):
            cols.append(metadata_column("availability", "Avail"))
        if _has("archive_id"):
            cols.append(metadata_column("archive_id", "Archive ID"))
        if _has("openlibrary_id"):
            cols.append(metadata_column("openlibrary_id", "OLID"))
        return cols

    def _selection_fn(row: ResultModel) -> List[str]:
        metadata = row.metadata or {}
        url = str(metadata.get("selection_url") or row.path or "").strip()
        if url:
            return ["-url", url]
        return ["-title", row.title or ""]

    register_provider(
        "openlibrary",
        _adapter,
        columns=_columns_factory,
        selection_fn=_selection_fn,
        metadata={"description": "OpenLibrary search provider (JSON result table template)"},
    )
except Exception:
    pass
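
# Illustrative sketch (not part of this commit): tracing the bundled
# SAMPLE_ITEMS fixture through the adapter registered above.
#
#     rows = list(_adapter(SAMPLE_ITEMS))
#     rows[0].title                      # "Sample OpenLibrary book"
#     rows[0].metadata["selection_url"]  # "https://archive.org/download/sample.pdf"
#     _selection_fn(rows[0])             # ["-url", "https://archive.org/download/sample.pdf"]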
# Registry ---------------------------------------------------------------
_METADATA_PROVIDERS: Dict[str,