
This commit is contained in: nose
2025-12-25 04:49:22 -08:00
parent 2542a68479, commit 43afa4e3fa
19 changed files with 2766 additions and 234 deletions


@@ -190,6 +190,7 @@ def _archive_id_from_url(url: str) -> str:
    u = str(url or "").strip()
    if not u:
        return ""
    try:
        p = urlparse(u)
        host = (p.hostname or "").lower().strip()
@@ -215,6 +216,175 @@ def _archive_id_from_url(url: str) -> str:
return ""
def _coerce_archive_field_list(value: Any) -> List[str]:
"""Coerce an Archive.org metadata field to a list of strings."""
if value is None:
return []
if isinstance(value, list):
out: List[str] = []
for v in value:
try:
s = str(v).strip()
except Exception:
continue
if s:
out.append(s)
return out
if isinstance(value, (tuple, set)):
out = []
for v in value:
try:
s = str(v).strip()
except Exception:
continue
if s:
out.append(s)
return out
try:
s = str(value).strip()
except Exception:
return []
return [s] if s else []
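
# A small usage sketch (values are illustrative, not from a real Archive.org
# response): scalars coerce to one-element lists, sequences flatten to
# stripped strings, and None/empty values yield [].
#
#     _coerce_archive_field_list(None)          -> []
#     _coerce_archive_field_list("  Dune ")     -> ["Dune"]
#     _coerce_archive_field_list(["a", "", 3])  -> ["a", "3"]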


def _archive_item_metadata_to_tags(archive_id: str, item_metadata: Dict[str, Any]) -> List[str]:
    """Map Archive.org metadata JSON (the `metadata` object) to tag strings.

    This is intentionally best-effort and conservative: it focuses on stable,
    useful bibliographic fields (title/author/publisher/ISBN/identifier/topics).
    """
    archive_id_clean = str(archive_id or "").strip()
    meta = item_metadata if isinstance(item_metadata, dict) else {}
    tags: List[str] = []
    seen: set[str] = set()

    def _add(tag: str) -> None:
        try:
            t = str(tag).strip()
        except Exception:
            return
        if not t:
            return
        if t.lower() in seen:
            return
        seen.add(t.lower())
        tags.append(t)

    if archive_id_clean:
        _add(f"internet_archive:{archive_id_clean}")

    # Title
    for title in _coerce_archive_field_list(meta.get("title"))[:1]:
        _add(f"title:{title}")

    # Authors/creators
    creators: List[str] = []
    creators.extend(_coerce_archive_field_list(meta.get("creator")))
    creators.extend(_coerce_archive_field_list(meta.get("author")))
    for creator in creators[:3]:
        _add(f"author:{creator}")

    # Publisher
    for publisher in _coerce_archive_field_list(meta.get("publisher"))[:3]:
        _add(f"publisher:{publisher}")

    # Publish date/year
    for date_val in _coerce_archive_field_list(meta.get("date"))[:1]:
        _add(f"publish_date:{date_val}")
    for year_val in _coerce_archive_field_list(meta.get("year"))[:1]:
        _add(f"publish_date:{year_val}")

    # Language
    for lang in _coerce_archive_field_list(meta.get("language"))[:3]:
        _add(f"language:{lang}")

    # Topics/subjects: follow existing OpenLibrary behavior (un-namespaced tags)
    for subj in _coerce_archive_field_list(meta.get("subject"))[:15]:
        if len(subj) > 200:
            subj = subj[:200]
        _add(subj)

    # ISBNs and identifiers
    def _clean_isbn(raw: str) -> str:
        return str(raw or "").replace("-", "").strip()

    for isbn in _coerce_archive_field_list(meta.get("isbn"))[:10]:
        isbn_clean = _clean_isbn(isbn)
        if isbn_clean:
            _add(f"isbn:{isbn_clean}")

    identifiers: List[str] = []
    identifiers.extend(_coerce_archive_field_list(meta.get("identifier")))
    identifiers.extend(_coerce_archive_field_list(meta.get("external-identifier")))
    added_other = 0
    for ident in identifiers:
        ident_s = str(ident or "").strip()
        if not ident_s:
            continue
        low = ident_s.lower()
        if low.startswith("urn:isbn:"):
            val = _clean_isbn(ident_s.split(":", 2)[-1])
            if val:
                _add(f"isbn:{val}")
            continue
        if low.startswith("isbn:"):
            val = _clean_isbn(ident_s.split(":", 1)[-1])
            if val:
                _add(f"isbn:{val}")
            continue
        if low.startswith("urn:oclc:"):
            val = ident_s.split(":", 2)[-1].strip()
            if val:
                _add(f"oclc:{val}")
            continue
        if low.startswith("oclc:"):
            val = ident_s.split(":", 1)[-1].strip()
            if val:
                _add(f"oclc:{val}")
            continue
        if low.startswith("urn:lccn:"):
            val = ident_s.split(":", 2)[-1].strip()
            if val:
                _add(f"lccn:{val}")
            continue
        if low.startswith("lccn:"):
            val = ident_s.split(":", 1)[-1].strip()
            if val:
                _add(f"lccn:{val}")
            continue
        if low.startswith("doi:"):
            val = ident_s.split(":", 1)[-1].strip()
            if val:
                _add(f"doi:{val}")
            continue
        if archive_id_clean and low == archive_id_clean.lower():
            continue
        if added_other >= 5:
            continue
        if len(ident_s) > 200:
            ident_s = ident_s[:200]
        _add(f"identifier:{ident_s}")
        added_other += 1
    return tags
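
# Sketch of the resulting mapping for a made-up item (identifier and field
# values are illustrative only):
#
#     _archive_item_metadata_to_tags("exampleitem", {
#         "title": "Example Book",
#         "creator": ["Jane Doe"],
#         "subject": ["fiction"],
#         "isbn": ["978-0-00-000000-2"],
#     })
#     -> ["internet_archive:exampleitem", "title:Example Book",
#         "author:Jane Doe", "fiction", "isbn:9780000000002"]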


def _fetch_archive_item_metadata(archive_id: str, *, timeout: int = 8) -> Dict[str, Any]:
    ident = str(archive_id or "").strip()
    if not ident:
        return {}
    resp = requests.get(f"https://archive.org/metadata/{ident}", timeout=int(timeout))
    resp.raise_for_status()
    data = resp.json()
    if not isinstance(data, dict):
        return {}
    meta = data.get("metadata")
    return meta if isinstance(meta, dict) else {}
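
# Usage sketch (performs a live HTTP call; the identifier is hypothetical):
#
#     item_meta = _fetch_archive_item_metadata("exampleitem")
#     tags = _archive_item_metadata_to_tags("exampleitem", item_meta)
#
# Request errors (timeouts, non-2xx responses) propagate to the caller; the
# download path below deliberately wraps this pair in a broad try/except.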


class OpenLibrary(Provider):
    # Domains that should be routed to this provider when the user supplies a URL.
    # (Used by ProviderCore.registry.match_provider_name_for_url)
@@ -1046,6 +1216,25 @@ class OpenLibrary(Provider):
log("[openlibrary] No archive identifier available; cannot download", file=sys.stderr)
return None
# Best-effort metadata scrape to attach bibliographic tags for downstream cmdlets.
try:
archive_meta = _fetch_archive_item_metadata(archive_id)
tags = _archive_item_metadata_to_tags(archive_id, archive_meta)
if tags:
try:
result.tag.update(tags)
except Exception:
# Fallback for callers that pass plain dicts.
pass
if isinstance(meta, dict):
meta["archive_id"] = archive_id
if archive_meta:
meta["archive_metadata"] = archive_meta
result.full_metadata = meta
except Exception:
# Never block downloads on metadata fetch.
pass
safe_title = sanitize_filename(result.title)
if not safe_title or "http" in safe_title.lower():
safe_title = sanitize_filename(archive_id) or "archive"