
This commit is contained in: nose
2025-12-25 04:49:22 -08:00
parent 2542a68479, commit 43afa4e3fa
19 changed files with 2766 additions and 234 deletions


@@ -190,6 +190,7 @@ def _archive_id_from_url(url: str) -> str:
    u = str(url or "").strip()
    if not u:
        return ""
    try:
        p = urlparse(u)
        host = (p.hostname or "").lower().strip()
@@ -215,6 +216,175 @@ def _archive_id_from_url(url: str) -> str:
return ""
def _coerce_archive_field_list(value: Any) -> List[str]:
"""Coerce an Archive.org metadata field to a list of strings."""
if value is None:
return []
if isinstance(value, list):
out: List[str] = []
for v in value:
try:
s = str(v).strip()
except Exception:
continue
if s:
out.append(s)
return out
if isinstance(value, (tuple, set)):
out = []
for v in value:
try:
s = str(v).strip()
except Exception:
continue
if s:
out.append(s)
return out
try:
s = str(value).strip()
except Exception:
return []
return [s] if s else []
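
# A small usage sketch (values are illustrative, not from a real Archive.org
# response): scalars coerce to one-element lists, sequences flatten to
# stripped strings, and None/empty values yield [].
#
#     _coerce_archive_field_list(None)          -> []
#     _coerce_archive_field_list("  Dune ")     -> ["Dune"]
#     _coerce_archive_field_list(["a", "", 3])  -> ["a", "3"]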


def _archive_item_metadata_to_tags(archive_id: str, item_metadata: Dict[str, Any]) -> List[str]:
    """Map Archive.org metadata JSON (the `metadata` object) to tag strings.

    This is intentionally best-effort and conservative: it focuses on stable,
    useful bibliographic fields (title/author/publisher/ISBN/identifier/topics).
    """
    archive_id_clean = str(archive_id or "").strip()
    meta = item_metadata if isinstance(item_metadata, dict) else {}
    tags: List[str] = []
    seen: set[str] = set()

    def _add(tag: str) -> None:
        try:
            t = str(tag).strip()
        except Exception:
            return
        if not t:
            return
        if t.lower() in seen:
            return
        seen.add(t.lower())
        tags.append(t)

    if archive_id_clean:
        _add(f"internet_archive:{archive_id_clean}")

    # Title
    for title in _coerce_archive_field_list(meta.get("title"))[:1]:
        _add(f"title:{title}")

    # Authors/creators
    creators: List[str] = []
    creators.extend(_coerce_archive_field_list(meta.get("creator")))
    creators.extend(_coerce_archive_field_list(meta.get("author")))
    for creator in creators[:3]:
        _add(f"author:{creator}")

    # Publisher
    for publisher in _coerce_archive_field_list(meta.get("publisher"))[:3]:
        _add(f"publisher:{publisher}")

    # Publish date/year
    for date_val in _coerce_archive_field_list(meta.get("date"))[:1]:
        _add(f"publish_date:{date_val}")
    for year_val in _coerce_archive_field_list(meta.get("year"))[:1]:
        _add(f"publish_date:{year_val}")

    # Language
    for lang in _coerce_archive_field_list(meta.get("language"))[:3]:
        _add(f"language:{lang}")

    # Topics/subjects: follow existing OpenLibrary behavior (un-namespaced tags)
    for subj in _coerce_archive_field_list(meta.get("subject"))[:15]:
        if len(subj) > 200:
            subj = subj[:200]
        _add(subj)

    # ISBNs and identifiers
    def _clean_isbn(raw: str) -> str:
        return str(raw or "").replace("-", "").strip()

    for isbn in _coerce_archive_field_list(meta.get("isbn"))[:10]:
        isbn_clean = _clean_isbn(isbn)
        if isbn_clean:
            _add(f"isbn:{isbn_clean}")

    identifiers: List[str] = []
    identifiers.extend(_coerce_archive_field_list(meta.get("identifier")))
    identifiers.extend(_coerce_archive_field_list(meta.get("external-identifier")))
    added_other = 0
    for ident in identifiers:
        ident_s = str(ident or "").strip()
        if not ident_s:
            continue
        low = ident_s.lower()
        if low.startswith("urn:isbn:"):
            val = _clean_isbn(ident_s.split(":", 2)[-1])
            if val:
                _add(f"isbn:{val}")
            continue
        if low.startswith("isbn:"):
            val = _clean_isbn(ident_s.split(":", 1)[-1])
            if val:
                _add(f"isbn:{val}")
            continue
        if low.startswith("urn:oclc:"):
            val = ident_s.split(":", 2)[-1].strip()
            if val:
                _add(f"oclc:{val}")
            continue
        if low.startswith("oclc:"):
            val = ident_s.split(":", 1)[-1].strip()
            if val:
                _add(f"oclc:{val}")
            continue
        if low.startswith("urn:lccn:"):
            val = ident_s.split(":", 2)[-1].strip()
            if val:
                _add(f"lccn:{val}")
            continue
        if low.startswith("lccn:"):
            val = ident_s.split(":", 1)[-1].strip()
            if val:
                _add(f"lccn:{val}")
            continue
        if low.startswith("doi:"):
            val = ident_s.split(":", 1)[-1].strip()
            if val:
                _add(f"doi:{val}")
            continue
        if archive_id_clean and low == archive_id_clean.lower():
            continue
        if added_other >= 5:
            continue
        if len(ident_s) > 200:
            ident_s = ident_s[:200]
        _add(f"identifier:{ident_s}")
        added_other += 1
    return tags
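
# Sketch of the resulting mapping for a made-up item (identifier and field
# values are illustrative only):
#
#     _archive_item_metadata_to_tags("exampleitem", {
#         "title": "Example Book",
#         "creator": ["Jane Doe"],
#         "subject": ["fiction"],
#         "isbn": ["978-0-00-000000-2"],
#     })
#     -> ["internet_archive:exampleitem", "title:Example Book",
#         "author:Jane Doe", "fiction", "isbn:9780000000002"]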


def _fetch_archive_item_metadata(archive_id: str, *, timeout: int = 8) -> Dict[str, Any]:
    ident = str(archive_id or "").strip()
    if not ident:
        return {}
    resp = requests.get(f"https://archive.org/metadata/{ident}", timeout=int(timeout))
    resp.raise_for_status()
    data = resp.json()
    if not isinstance(data, dict):
        return {}
    meta = data.get("metadata")
    return meta if isinstance(meta, dict) else {}
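
# Usage sketch (performs a live HTTP call; the identifier is hypothetical):
#
#     item_meta = _fetch_archive_item_metadata("exampleitem")
#     tags = _archive_item_metadata_to_tags("exampleitem", item_meta)
#
# Request errors (timeouts, non-2xx responses) propagate to the caller; the
# download path below deliberately wraps this pair in a broad try/except.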


class OpenLibrary(Provider):
    # Domains that should be routed to this provider when the user supplies a URL.
    # (Used by ProviderCore.registry.match_provider_name_for_url)
@@ -1046,6 +1216,25 @@ class OpenLibrary(Provider):
log("[openlibrary] No archive identifier available; cannot download", file=sys.stderr)
return None
# Best-effort metadata scrape to attach bibliographic tags for downstream cmdlets.
try:
archive_meta = _fetch_archive_item_metadata(archive_id)
tags = _archive_item_metadata_to_tags(archive_id, archive_meta)
if tags:
try:
result.tag.update(tags)
except Exception:
# Fallback for callers that pass plain dicts.
pass
if isinstance(meta, dict):
meta["archive_id"] = archive_id
if archive_meta:
meta["archive_metadata"] = archive_meta
result.full_metadata = meta
except Exception:
# Never block downloads on metadata fetch.
pass
safe_title = sanitize_filename(result.title)
if not safe_title or "http" in safe_title.lower():
safe_title = sanitize_filename(archive_id) or "archive"