This commit is contained in:
2026-03-22 22:41:56 -07:00
parent 67ba6cb3d1
commit 23a73a94e6
6 changed files with 956 additions and 179 deletions

View File

@@ -210,6 +210,135 @@ def _resolve_archive_id(
return ""
def _fetch_openlibrary_edition_metadata(
    session: requests.Session,
    edition_id: str,
) -> Dict[str, Any]:
    """Fetch identifier metadata for one Open Library edition (best effort).

    Requests ``https://openlibrary.org/books/<edition_id>.json`` and pulls the
    ISBNs and the Archive.org item identifier out of the returned record.

    Args:
        session: HTTP session used to issue the GET request.
        edition_id: Open Library edition key (for example ``OL7353617M``).

    Returns:
        A dict that always contains ``openlibrary_id`` and ``openlibrary``
        (both set to the stripped edition id) plus ``isbn_10``, ``isbn_13``
        and ``archive_id`` when the record provides them. An empty dict is
        returned when the id is blank, the request or JSON decoding fails, or
        the payload is not a JSON object. This function never raises.
    """
    # Normalise once so the URL and the returned ids are built from the same
    # value, and a whitespace-only id does not trigger a pointless request.
    edition_key = str(edition_id or "").strip()
    if not edition_key:
        return {}

    try:
        resp = session.get(
            f"https://openlibrary.org/books/{edition_key}.json",
            timeout=6,
        )
        resp.raise_for_status()
        data = resp.json() or {}
    except Exception:
        # Best-effort lookup: any network, HTTP or decoding failure simply
        # yields "no metadata" instead of propagating to the caller.
        return {}
    if not isinstance(data, dict):
        return {}

    identifiers = data.get("identifiers")
    if not isinstance(identifiers, dict):
        identifiers = {}

    def _first_clean(value: Any) -> str:
        # `_first_str` is defined elsewhere in this file; presumably it
        # returns the first string of a list-or-scalar value (may be None).
        raw = _first_str(value)
        return str(raw or "").strip()

    def _lookup(key: str) -> str:
        # Open Library edition records publish `isbn_10` / `isbn_13` as
        # top-level list fields; the nested `identifiers` mapping is kept as
        # a fallback so records that only carry them there still resolve.
        return _first_clean(data.get(key)) or _first_clean(identifiers.get(key))

    isbn_10 = _lookup("isbn_10")
    isbn_13 = _lookup("isbn_13")

    # `ocaid` is the edition's Archive.org identifier; fall back to the
    # `internet_archive` entry of the identifiers mapping when it is absent.
    archive_id = str(data.get("ocaid") or "").strip()
    if not archive_id:
        archive_id = _first_clean(identifiers.get("internet_archive"))

    out: Dict[str, Any] = {
        "openlibrary_id": edition_key,
        "openlibrary": edition_key,
    }
    if isbn_10:
        out["isbn_10"] = isbn_10
    if isbn_13:
        out["isbn_13"] = isbn_13
    if archive_id:
        out["archive_id"] = archive_id
    return out
def _select_preferred_isbns(values: Any) -> Tuple[str, str]:
    """Pick the first well-formed ISBN-10 and ISBN-13 from *values*.

    Args:
        values: A single ISBN-like value, a list/tuple of them, or
            ``None`` / ``""`` for "nothing". Each candidate may contain
            hyphens, spaces or other separators, which are ignored.

    Returns:
        ``(isbn_10, isbn_13)``, upper-cased and stripped of separators. A
        position is ``""`` when no candidate of that kind was found. The
        first matching candidate of each kind wins.
    """
    items: List[Any]
    if isinstance(values, (list, tuple)):
        # Tuples are iterated like lists; stringifying a whole tuple would
        # fuse every ISBN it contains into one unusable token.
        items = list(values)
    elif values in (None, ""):
        items = []
    else:
        items = [values]

    isbn_10 = ""
    isbn_13 = ""
    for raw in items:
        token = re.sub(r"[^0-9Xx]", "", str(raw or "")).upper()
        if not token:
            continue
        # Check the shape, not just the length: an ISBN-13 is all digits and
        # an ISBN-10 may carry "X" only as its final check character.
        if not isbn_13 and re.fullmatch(r"[0-9]{13}", token):
            isbn_13 = token
        elif not isbn_10 and re.fullmatch(r"[0-9]{9}[0-9X]", token):
            isbn_10 = token
        if isbn_10 and isbn_13:
            break
    return isbn_10, isbn_13
def _build_pipeline_progress_callback(
    progress: Any,
    title: str,
) -> Callable[[str, int, Optional[int], str], None]:
    """Adapt a pipeline progress object to the download progress callback.

    Args:
        progress: Object exposing ``set_status``, ``begin_transfer``,
            ``update_transfer`` and ``finish_transfer``. Every call into it
            is guarded, so a failing progress UI never breaks the download.
        title: Label for the transfer; blank values fall back to ``"book"``.

    Returns:
        A callback ``(kind, completed, total, label)``. ``"pages"`` and
        ``"bytes"`` events open the transfer on first use and update it;
        a ``"step"`` event whose text is not ``"download pages"`` closes an
        open transfer. Other kinds only update the status line. The callback
        carries a ``_finish_transfer`` attribute that closes the transfer
        if it is still open, so the owner can clean up on exit.
    """
    transfer_label = str(title or "book").strip() or "book"
    # One flag is enough: a transfer is either open (True) or not (False).
    state = {"active": False}

    def _as_int(value: Any) -> Optional[int]:
        # Coerce once; unusable values become "unknown" instead of causing
        # the whole progress update to be dropped.
        if value is None:
            return None
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return None

    def _ensure_started(total: Optional[int]) -> None:
        if state["active"]:
            return
        try:
            progress.begin_transfer(label=transfer_label, total=total)
        except Exception:
            # Leave the transfer closed so a later event can retry.
            return
        state["active"] = True

    def _finish() -> None:
        if not state["active"]:
            return
        try:
            progress.finish_transfer(label=transfer_label)
        except Exception:
            pass
        state["active"] = False

    def _callback(kind: str, completed: int, total: Optional[int], label: str) -> None:
        text = str(label or kind or "download").strip() or "download"
        try:
            progress.set_status(f"openlibrary: {text}")
        except Exception:
            pass

        if kind == "step":
            # Any step other than the page download itself ends the transfer.
            if text != "download pages":
                _finish()
            return
        if kind not in {"pages", "bytes"}:
            return

        done = _as_int(completed)
        expected = _as_int(total)
        _ensure_started(expected)
        try:
            progress.update_transfer(
                label=transfer_label,
                completed=done,
                total=expected,
            )
        except Exception:
            pass

        # A non-positive total carries no completion information (0 is a
        # common "size unknown" value). Treating it as reached would close
        # and reopen the transfer on every single event.
        if (
            done is not None
            and expected is not None
            and expected > 0
            and done >= expected
        ):
            _finish()

    setattr(_callback, "_finish_transfer", _finish)
    return _callback
def _archive_id_from_url(url: str) -> str:
"""Best-effort extraction of an Archive.org item identifier from a URL."""
@@ -1082,6 +1211,12 @@ class OpenLibrary(Provider):
meta = result.full_metadata or {}
edition_id = str(meta.get("openlibrary_id") or "").strip()
edition_meta = _fetch_openlibrary_edition_metadata(self._session, edition_id)
if edition_meta and isinstance(meta, dict):
for key, value in edition_meta.items():
if value and not meta.get(key):
meta[key] = value
result.full_metadata = meta
# Accept direct Archive.org URLs too (details/borrow/download) even when no OL edition id is known.
archive_id = str(meta.get("archive_id") or "").strip()
@@ -1097,7 +1232,9 @@ class OpenLibrary(Provider):
archive_id = _first_str(ia_candidates) or ""
if not archive_id and edition_id:
archive_id = _resolve_archive_id(self._session, edition_id, ia_candidates)
archive_id = str(edition_meta.get("archive_id") or "").strip()
if not archive_id:
archive_id = _resolve_archive_id(self._session, edition_id, ia_candidates)
if not archive_id:
# Try to extract identifier from the SearchResult path (URL).
@@ -1114,17 +1251,49 @@ class OpenLibrary(Provider):
try:
archive_meta = fetch_archive_item_metadata(archive_id)
tags = archive_item_metadata_to_tags(archive_id, archive_meta)
if edition_id:
tags.append(f"openlibrary:{edition_id}")
if tags:
try:
result.tag.update(tags)
except Exception:
# Fallback for callers that pass plain dicts.
pass
isbn_10 = str(meta.get("isbn_10") or edition_meta.get("isbn_10") or "").strip()
isbn_13 = str(meta.get("isbn_13") or edition_meta.get("isbn_13") or "").strip()
if not isbn_10 and not isbn_13:
isbn_10, isbn_13 = _select_preferred_isbns(archive_meta.get("isbn"))
if isinstance(meta, dict):
meta["archive_id"] = archive_id
if archive_meta:
meta["archive_metadata"] = archive_meta
if edition_id:
meta.setdefault("openlibrary_id", edition_id)
meta.setdefault("openlibrary", edition_id)
if isbn_10:
meta.setdefault("isbn_10", isbn_10)
if isbn_13:
meta.setdefault("isbn_13", isbn_13)
if not meta.get("isbn"):
meta["isbn"] = isbn_13 or isbn_10
result.full_metadata = meta
extra_identifier_tags: List[str] = []
if edition_id:
extra_identifier_tags.append(f"openlibrary:{edition_id}")
if isbn_13:
extra_identifier_tags.append(f"isbn_13:{isbn_13}")
extra_identifier_tags.append(f"isbn:{isbn_13}")
elif isbn_10:
extra_identifier_tags.append(f"isbn_10:{isbn_10}")
extra_identifier_tags.append(f"isbn:{isbn_10}")
if extra_identifier_tags:
try:
result.tag.update(extra_identifier_tags)
except Exception:
pass
except Exception:
# Never block downloads on metadata fetch.
pass
@@ -1133,6 +1302,13 @@ class OpenLibrary(Provider):
if not safe_title or "http" in safe_title.lower():
safe_title = sanitize_filename(archive_id) or "archive"
internal_progress_finish = None
if progress_callback is None and isinstance(self.config, dict):
pipeline_progress = self.config.get("_pipeline_progress")
if pipeline_progress is not None:
progress_callback = _build_pipeline_progress_callback(pipeline_progress, safe_title)
internal_progress_finish = getattr(progress_callback, "_finish_transfer", None)
# 1) Direct download if available.
try:
can_direct, pdf_url = self._archive_check_direct_download(archive_id)
@@ -1318,6 +1494,12 @@ class OpenLibrary(Provider):
except Exception as exc:
log(f"[openlibrary] Borrow workflow error: {exc}", file=sys.stderr)
return None
finally:
if callable(internal_progress_finish):
try:
internal_progress_finish()
except Exception:
pass
def validate(self) -> bool:
    """Return True unconditionally; no checks are performed here."""
    return True