Some checks failed
smoke-mm / Install & smoke test mm --help (push) Has been cancelled
@@ -190,6 +190,7 @@ def _archive_id_from_url(url: str) -> str:
    u = str(url or "").strip()
    if not u:
        return ""

    try:
        p = urlparse(u)
        host = (p.hostname or "").lower().strip()

@@ -215,6 +216,175 @@ def _archive_id_from_url(url: str) -> str:
    return ""


def _coerce_archive_field_list(value: Any) -> List[str]:
    """Coerce an Archive.org metadata field to a list of strings."""
    if value is None:
        return []
    if isinstance(value, (list, tuple, set)):
        out: List[str] = []
        for v in value:
            try:
                s = str(v).strip()
            except Exception:
                continue
            if s:
                out.append(s)
        return out
    try:
        s = str(value).strip()
    except Exception:
        return []
    return [s] if s else []
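
A quick illustration of the coercion behavior (editor's sketch with made-up values, not part of the commit): scalars become one-element lists, iterables are flattened to stripped strings, and empty values drop out.

    assert _coerce_archive_field_list(None) == []
    assert _coerce_archive_field_list("  Dune ") == ["Dune"]
    assert _coerce_archive_field_list(["Dune", "", 1965]) == ["Dune", "1965"]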
def _archive_item_metadata_to_tags(archive_id: str, item_metadata: Dict[str, Any]) -> List[str]:
    """Map Archive.org metadata JSON (the `metadata` object) to tag strings.

    This is intentionally best-effort and conservative: it focuses on stable,
    useful bibliographic fields (title/author/publisher/ISBN/identifier/topics).
    """
    archive_id_clean = str(archive_id or "").strip()
    meta = item_metadata if isinstance(item_metadata, dict) else {}

    tags: List[str] = []
    seen: set[str] = set()

    def _add(tag: str) -> None:
        try:
            t = str(tag).strip()
        except Exception:
            return
        if not t:
            return
        if t.lower() in seen:
            return
        seen.add(t.lower())
        tags.append(t)

    if archive_id_clean:
        _add(f"internet_archive:{archive_id_clean}")

    # Title
    for title in _coerce_archive_field_list(meta.get("title"))[:1]:
        _add(f"title:{title}")

    # Authors/creators
    creators: List[str] = []
    creators.extend(_coerce_archive_field_list(meta.get("creator")))
    creators.extend(_coerce_archive_field_list(meta.get("author")))
    for creator in creators[:3]:
        _add(f"author:{creator}")

    # Publisher
    for publisher in _coerce_archive_field_list(meta.get("publisher"))[:3]:
        _add(f"publisher:{publisher}")

    # Publish date/year
    for date_val in _coerce_archive_field_list(meta.get("date"))[:1]:
        _add(f"publish_date:{date_val}")
    for year_val in _coerce_archive_field_list(meta.get("year"))[:1]:
        _add(f"publish_date:{year_val}")

    # Language
    for lang in _coerce_archive_field_list(meta.get("language"))[:3]:
        _add(f"language:{lang}")

    # Topics/subjects: follow existing OpenLibrary behavior (un-namespaced tags)
    for subj in _coerce_archive_field_list(meta.get("subject"))[:15]:
        if len(subj) > 200:
            subj = subj[:200]
        _add(subj)

    # ISBNs and identifiers
    def _clean_isbn(raw: str) -> str:
        return str(raw or "").replace("-", "").strip()

    for isbn in _coerce_archive_field_list(meta.get("isbn"))[:10]:
        isbn_clean = _clean_isbn(isbn)
        if isbn_clean:
            _add(f"isbn:{isbn_clean}")

    identifiers: List[str] = []
    identifiers.extend(_coerce_archive_field_list(meta.get("identifier")))
    identifiers.extend(_coerce_archive_field_list(meta.get("external-identifier")))
    added_other = 0
    for ident in identifiers:
        ident_s = str(ident or "").strip()
        if not ident_s:
            continue
        low = ident_s.lower()

        # Known identifier schemes get their own namespaced tags.
        if low.startswith("urn:isbn:"):
            val = _clean_isbn(ident_s.split(":", 2)[-1])
            if val:
                _add(f"isbn:{val}")
            continue
        if low.startswith("isbn:"):
            val = _clean_isbn(ident_s.split(":", 1)[-1])
            if val:
                _add(f"isbn:{val}")
            continue
        if low.startswith("urn:oclc:"):
            val = ident_s.split(":", 2)[-1].strip()
            if val:
                _add(f"oclc:{val}")
            continue
        if low.startswith("oclc:"):
            val = ident_s.split(":", 1)[-1].strip()
            if val:
                _add(f"oclc:{val}")
            continue
        if low.startswith("urn:lccn:"):
            val = ident_s.split(":", 2)[-1].strip()
            if val:
                _add(f"lccn:{val}")
            continue
        if low.startswith("lccn:"):
            val = ident_s.split(":", 1)[-1].strip()
            if val:
                _add(f"lccn:{val}")
            continue
        if low.startswith("doi:"):
            val = ident_s.split(":", 1)[-1].strip()
            if val:
                _add(f"doi:{val}")
            continue

        # Anything else: skip the item's own identifier and cap count/length.
        if archive_id_clean and low == archive_id_clean.lower():
            continue
        if added_other >= 5:
            continue
        if len(ident_s) > 200:
            ident_s = ident_s[:200]
        _add(f"identifier:{ident_s}")
        added_other += 1

    return tags
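
To see the mapping end to end, a hedged sketch with a hypothetical item (identifier and field values are made up): the item identifier, title, and creators are namespaced, and urn: identifiers are normalized into isbn:/oclc: tags.

    meta = {
        "title": "Example Book",
        "creator": ["Jane Doe", "John Roe"],
        "identifier": ["urn:isbn:978-0-14-044913-6", "urn:oclc:12345"],
    }
    _archive_item_metadata_to_tags("examplebook", meta)
    # -> ["internet_archive:examplebook", "title:Example Book",
    #     "author:Jane Doe", "author:John Roe",
    #     "isbn:9780140449136", "oclc:12345"]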

def _fetch_archive_item_metadata(archive_id: str, *, timeout: int = 8) -> Dict[str, Any]:
    ident = str(archive_id or "").strip()
    if not ident:
        return {}
    resp = requests.get(f"https://archive.org/metadata/{ident}", timeout=int(timeout))
    resp.raise_for_status()
    data = resp.json()
    if not isinstance(data, dict):
        return {}
    meta = data.get("metadata")
    return meta if isinstance(meta, dict) else {}
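
Usage sketch (hypothetical identifier; any network or HTTP error propagates to the caller, which is why the call site further down wraps it in try/except):

    try:
        item_meta = _fetch_archive_item_metadata("some-item-id", timeout=8)
    except Exception:
        item_meta = {}
    title = item_meta.get("title")  # None when the item or field is missing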

class OpenLibrary(Provider):
    # Domains that should be routed to this provider when the user supplies a URL.
    # (Used by ProviderCore.registry.match_provider_name_for_url)
@@ -1046,6 +1216,25 @@ class OpenLibrary(Provider):
            log("[openlibrary] No archive identifier available; cannot download", file=sys.stderr)
            return None

        # Best-effort metadata scrape to attach bibliographic tags for downstream cmdlets.
        try:
            archive_meta = _fetch_archive_item_metadata(archive_id)
            tags = _archive_item_metadata_to_tags(archive_id, archive_meta)
            if tags:
                try:
                    result.tag.update(tags)
                except Exception:
                    # Fallback for callers that pass plain dicts.
                    pass
            if isinstance(meta, dict):
                meta["archive_id"] = archive_id
                if archive_meta:
                    meta["archive_metadata"] = archive_meta
                result.full_metadata = meta
        except Exception:
            # Never block downloads on metadata fetch.
            pass

        safe_title = sanitize_filename(result.title)
        if not safe_title or "http" in safe_title.lower():
            safe_title = sanitize_filename(archive_id) or "archive"
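
For context on the filename guard, a minimal sketch with a stand-in for the repo's sanitize_filename (assumed here to strip path-hostile characters): a URL-shaped title still contains "http" after sanitizing, so the archive identifier wins.

    def _sanitize_stub(s: str) -> str:  # stand-in, not the repo's sanitize_filename
        return "".join(c for c in str(s) if c.isalnum() or c in " ._-").strip()

    title = "https://archive.org/details/foo"
    safe = _sanitize_stub(title)      # "httpsarchive.orgdetailsfoo"
    if not safe or "http" in safe.lower():
        safe = _sanitize_stub("foo") or "archive"
    # safe == "foo"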