ssd
@@ -8,6 +8,9 @@ import requests
import sys
import json
import subprocess

from API.HTTP import HTTPClient
from ProviderCore.base import SearchResult
try:  # Optional dependency for IMDb scraping
    from imdbinfo.services import search_title  # type: ignore
except ImportError:  # pragma: no cover - optional
@@ -15,6 +18,7 @@ except ImportError: # pragma: no cover - optional

from SYS.logger import log, debug
from SYS.metadata import imdb_tag
from SYS.json_table import normalize_record

try:  # Optional dependency
    import musicbrainzngs  # type: ignore
@@ -892,6 +896,524 @@ class YtdlpMetadataProvider(MetadataProvider):
        return out


def _coerce_archive_field_list(value: Any) -> List[str]:
    """Coerce an Archive.org metadata field to a list of strings."""

    if value is None:
        return []
    if isinstance(value, list):
        out: List[str] = []
        for v in value:
            try:
                s = str(v).strip()
            except Exception:
                continue
            if s:
                out.append(s)
        return out
    if isinstance(value, (tuple, set)):
        out = []
        for v in value:
            try:
                s = str(v).strip()
            except Exception:
                continue
            if s:
                out.append(s)
        return out
    try:
        s = str(value).strip()
    except Exception:
        return []
    return [s] if s else []
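
# Illustrative examples (made-up values) of the coercion above:
#   _coerce_archive_field_list(None)                 -> []
#   _coerce_archive_field_list("eng")                -> ["eng"]
#   _coerce_archive_field_list(["eng", "", " fre "]) -> ["eng", "fre"]
#   _coerce_archive_field_list(("1997", 1998))       -> ["1997", "1998"]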


def archive_item_metadata_to_tags(archive_id: str,
                                  item_metadata: Dict[str, Any]) -> List[str]:
    """Coerce Archive.org metadata into a stable set of bibliographic tags."""

    archive_id_clean = str(archive_id or "").strip()
    meta = item_metadata if isinstance(item_metadata, dict) else {}

    tags: List[str] = []
    seen: set[str] = set()

    def _add(tag: str) -> None:
        try:
            t = str(tag).strip()
        except Exception:
            return
        if not t:
            return
        if t.lower() in seen:
            return
        seen.add(t.lower())
        tags.append(t)

    if archive_id_clean:
        _add(f"internet_archive:{archive_id_clean}")

    for title in _coerce_archive_field_list(meta.get("title"))[:1]:
        _add(f"title:{title}")

    creators: List[str] = []
    creators.extend(_coerce_archive_field_list(meta.get("creator")))
    creators.extend(_coerce_archive_field_list(meta.get("author")))
    for creator in creators[:3]:
        _add(f"author:{creator}")

    for publisher in _coerce_archive_field_list(meta.get("publisher"))[:3]:
        _add(f"publisher:{publisher}")

    for date_val in _coerce_archive_field_list(meta.get("date"))[:1]:
        _add(f"publish_date:{date_val}")
    for year_val in _coerce_archive_field_list(meta.get("year"))[:1]:
        _add(f"publish_date:{year_val}")

    for lang in _coerce_archive_field_list(meta.get("language"))[:3]:
        _add(f"language:{lang}")

    for subj in _coerce_archive_field_list(meta.get("subject"))[:15]:
        if len(subj) > 200:
            subj = subj[:200]
        _add(subj)

    def _clean_isbn(raw: str) -> str:
        return str(raw or "").replace("-", "").strip()

    for isbn in _coerce_archive_field_list(meta.get("isbn"))[:10]:
        isbn_clean = _clean_isbn(isbn)
        if isbn_clean:
            _add(f"isbn:{isbn_clean}")

    identifiers: List[str] = []
    identifiers.extend(_coerce_archive_field_list(meta.get("identifier")))
    identifiers.extend(_coerce_archive_field_list(meta.get("external-identifier")))
    added_other = 0
    for ident in identifiers:
        ident_s = str(ident or "").strip()
        if not ident_s:
            continue
        low = ident_s.lower()

        if low.startswith("urn:isbn:"):
            val = _clean_isbn(ident_s.split(":", 2)[-1])
            if val:
                _add(f"isbn:{val}")
            continue
        if low.startswith("isbn:"):
            val = _clean_isbn(ident_s.split(":", 1)[-1])
            if val:
                _add(f"isbn:{val}")
            continue
        if low.startswith("urn:oclc:"):
            val = ident_s.split(":", 2)[-1].strip()
            if val:
                _add(f"oclc:{val}")
            continue
        if low.startswith("oclc:"):
            val = ident_s.split(":", 1)[-1].strip()
            if val:
                _add(f"oclc:{val}")
            continue
        if low.startswith("urn:lccn:"):
            val = ident_s.split(":", 2)[-1].strip()
            if val:
                _add(f"lccn:{val}")
            continue
        if low.startswith("lccn:"):
            val = ident_s.split(":", 1)[-1].strip()
            if val:
                _add(f"lccn:{val}")
            continue
        if low.startswith("doi:"):
            val = ident_s.split(":", 1)[-1].strip()
            if val:
                _add(f"doi:{val}")
            continue

        if archive_id_clean and low == archive_id_clean.lower():
            continue
        if added_other >= 5:
            continue
        if len(ident_s) > 200:
            ident_s = ident_s[:200]
        _add(f"identifier:{ident_s}")
        added_other += 1

    return tags
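
# Example (a made-up Archive.org "metadata" object):
#   archive_item_metadata_to_tags("sampleitem", {
#       "title": "A Sample Title",
#       "creator": ["Jane Doe"],
#       "subject": "demo",
#       "isbn": ["0-123-45678-9"],
#   })
# yields, in this order:
#   ["internet_archive:sampleitem", "title:A Sample Title",
#    "author:Jane Doe", "demo", "isbn:0123456789"]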


def fetch_archive_item_metadata(archive_id: str,
                                *,
                                timeout: int = 8) -> Dict[str, Any]:
    ident = str(archive_id or "").strip()
    if not ident:
        return {}
    resp = requests.get(f"https://archive.org/metadata/{ident}", timeout=int(timeout))
    resp.raise_for_status()
    data = resp.json() if resp is not None else {}
    if not isinstance(data, dict):
        return {}
    meta = data.get("metadata")
    return meta if isinstance(meta, dict) else {}
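
# Typical pairing (illustrative): fetch, then map to tags.
#   meta = fetch_archive_item_metadata("sampleitem")
#   tags = archive_item_metadata_to_tags("sampleitem", meta)
# The /metadata endpoint returns a JSON document whose "metadata" member holds
# the bibliographic fields; other members (e.g. "files") are ignored here.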


def scrape_isbn_metadata(isbn: str) -> List[str]:
    """Scrape metadata tags for an ISBN using OpenLibrary's books API."""

    new_tags: List[str] = []

    isbn_clean = str(isbn or "").replace("isbn:", "").replace("-", "").strip()
    if not isbn_clean:
        return []

    url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json"
    try:
        with HTTPClient() as client:
            response = client.get(url)
            response.raise_for_status()
            data = json.loads(response.content.decode("utf-8"))
    except Exception as exc:
        log(f"Failed to fetch ISBN metadata: {exc}", file=sys.stderr)
        return []

    if not data:
        log(f"No ISBN metadata found for: {isbn}")
        return []

    book_data = next(iter(data.values()), None)
    if not isinstance(book_data, dict):
        return []

    if "title" in book_data:
        new_tags.append(f"title:{book_data['title']}")

    authors = book_data.get("authors")
    if isinstance(authors, list):
        for author in authors[:3]:
            if isinstance(author, dict) and author.get("name"):
                new_tags.append(f"author:{author['name']}")

    if book_data.get("publish_date"):
        new_tags.append(f"publish_date:{book_data['publish_date']}")

    publishers = book_data.get("publishers")
    if isinstance(publishers, list) and publishers:
        pub = publishers[0]
        if isinstance(pub, dict) and pub.get("name"):
            new_tags.append(f"publisher:{pub['name']}")

    if "description" in book_data:
        desc = book_data.get("description")
        if isinstance(desc, dict) and "value" in desc:
            desc = desc.get("value")
        if desc:
            desc_str = str(desc).strip()
            if desc_str:
                new_tags.append(f"description:{desc_str[:200]}")

    page_count = book_data.get("number_of_pages")
    if isinstance(page_count, int) and page_count > 0:
        new_tags.append(f"pages:{page_count}")

    identifiers = book_data.get("identifiers")
    if isinstance(identifiers, dict):

        def _first(value: Any) -> Any:
            if isinstance(value, list) and value:
                return value[0]
            return value

        for key, ns in (
            ("openlibrary", "openlibrary"),
            ("lccn", "lccn"),
            ("oclc", "oclc"),
            ("goodreads", "goodreads"),
            ("librarything", "librarything"),
            ("doi", "doi"),
            ("internet_archive", "internet_archive"),
        ):
            val = _first(identifiers.get(key))
            if val:
                new_tags.append(f"{ns}:{val}")

    debug(f"Found {len(new_tags)} tag(s) from ISBN lookup")
    return new_tags
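
# Example (illustrative): the books API keys its response by the bibkey, e.g.
# {"ISBN:9780140328721": {...}}, and the single inner object is what gets
# mined for title/author/publisher/identifier tags.
#   scrape_isbn_metadata("isbn:978-0-14-032872-1")
# strips the "isbn:" prefix and the hyphens before building the request URL.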


def scrape_openlibrary_metadata(olid: str) -> List[str]:
    """Scrape metadata tags for an OpenLibrary ID using the edition JSON endpoint."""

    new_tags: List[str] = []

    olid_text = str(olid or "").strip()
    if not olid_text:
        return []

    olid_norm = olid_text
    try:
        if not olid_norm.startswith("OL"):
            olid_norm = f"OL{olid_norm}"
        if not olid_norm.endswith("M"):
            olid_norm = f"{olid_norm}M"
    except Exception:
        olid_norm = olid_text

    new_tags.append(f"openlibrary:{olid_norm}")

    olid_clean = olid_text.replace("OL", "").replace("M", "")
    if not olid_clean.isdigit():
        olid_clean = olid_text

    if not olid_text.startswith("OL"):
        url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
    else:
        url = f"https://openlibrary.org/books/{olid_text}.json"

    try:
        with HTTPClient() as client:
            response = client.get(url)
            response.raise_for_status()
            data = json.loads(response.content.decode("utf-8"))
    except Exception as exc:
        log(f"Failed to fetch OpenLibrary metadata: {exc}", file=sys.stderr)
        return []

    if not isinstance(data, dict) or not data:
        log(f"No OpenLibrary metadata found for: {olid_text}")
        return []

    if "title" in data:
        new_tags.append(f"title:{data['title']}")

    authors = data.get("authors")
    if isinstance(authors, list):
        for author in authors[:3]:
            if isinstance(author, dict) and author.get("name"):
                new_tags.append(f"author:{author['name']}")
                continue

            author_key = None
            if isinstance(author, dict):
                if isinstance(author.get("author"), dict):
                    author_key = author.get("author", {}).get("key")
                if not author_key:
                    author_key = author.get("key")

            if isinstance(author_key, str) and author_key.startswith("/"):
                try:
                    author_url = f"https://openlibrary.org{author_key}.json"
                    with HTTPClient(timeout=10) as client:
                        author_resp = client.get(author_url)
                        author_resp.raise_for_status()
                        author_data = json.loads(author_resp.content.decode("utf-8"))
                    if isinstance(author_data, dict) and author_data.get("name"):
                        new_tags.append(f"author:{author_data['name']}")
                        continue
                except Exception:
                    pass

            if isinstance(author, str) and author:
                new_tags.append(f"author:{author}")

    if data.get("publish_date"):
        new_tags.append(f"publish_date:{data['publish_date']}")

    publishers = data.get("publishers")
    if isinstance(publishers, list) and publishers:
        pub = publishers[0]
        if isinstance(pub, dict) and pub.get("name"):
            new_tags.append(f"publisher:{pub['name']}")
        elif isinstance(pub, str) and pub:
            new_tags.append(f"publisher:{pub}")

    if "description" in data:
        desc = data.get("description")
        if isinstance(desc, dict) and "value" in desc:
            desc = desc.get("value")
        if desc:
            desc_str = str(desc).strip()
            if desc_str:
                new_tags.append(f"description:{desc_str[:200]}")

    page_count = data.get("number_of_pages")
    if isinstance(page_count, int) and page_count > 0:
        new_tags.append(f"pages:{page_count}")

    subjects = data.get("subjects")
    if isinstance(subjects, list):
        for subject in subjects[:10]:
            if isinstance(subject, str):
                subject_clean = subject.strip()
                if subject_clean and subject_clean not in new_tags:
                    new_tags.append(subject_clean)

    identifiers = data.get("identifiers")
    if isinstance(identifiers, dict):

        def _first(value: Any) -> Any:
            if isinstance(value, list) and value:
                return value[0]
            return value

        for key, ns in (
            ("isbn_10", "isbn_10"),
            ("isbn_13", "isbn_13"),
            ("lccn", "lccn"),
            ("oclc_numbers", "oclc"),
            ("goodreads", "goodreads"),
            ("internet_archive", "internet_archive"),
        ):
            val = _first(identifiers.get(key))
            if val:
                new_tags.append(f"{ns}:{val}")

    ocaid = data.get("ocaid")
    if isinstance(ocaid, str) and ocaid.strip():
        new_tags.append(f"internet_archive:{ocaid.strip()}")

    debug(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup")
    return new_tags
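
# Illustrative normalization behavior (made-up OLIDs):
#   "OL123M" -> tag "openlibrary:OL123M", fetches /books/OL123M.json
#   "123"    -> tag "openlibrary:OL123M", fetches /books/OL123M.json
#   "OL123"  -> tag "openlibrary:OL123M", fetches /books/OL123.json
# In the last case the tag is normalized but the request URL keeps the input
# text, because the URL branch only rewrites IDs lacking the "OL" prefix.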


SAMPLE_ITEMS: List[Dict[str, Any]] = [
    {
        "title": "Sample OpenLibrary book",
        "path": "https://openlibrary.org/books/OL123M",
        "openlibrary_id": "OL123M",
        "archive_id": "samplearchive123",
        "availability": "borrow",
        "availability_reason": "sample",
        "direct_url": "https://archive.org/download/sample.pdf",
        "author_name": ["OpenLibrary Demo"],
        "first_publish_year": 2023,
        "ia": ["samplearchive123"],
    },
]


try:
    from typing import Iterable

    from SYS.result_table_api import ColumnSpec, ResultModel, metadata_column, title_column
    from SYS.result_table_adapters import register_provider

    def _ensure_search_result(item: Any) -> SearchResult:
        if isinstance(item, SearchResult):
            return item
        if isinstance(item, dict):
            data = dict(item)
            title = str(data.get("title") or data.get("name") or "OpenLibrary")
            path = str(data.get("path") or data.get("url") or "")
            detail = str(data.get("detail") or "")
            annotations = list(data.get("annotations") or [])
            media_kind = str(data.get("media_kind") or "book")
            return SearchResult(
                table="openlibrary",
                title=title,
                path=path,
                detail=detail,
                annotations=annotations,
                media_kind=media_kind,
                columns=data.get("columns") or [],
                full_metadata={**data, "raw": dict(item)},
            )
        return SearchResult(
            table="openlibrary",
            title=str(item or "OpenLibrary"),
            path="",
            detail="",
            annotations=[],
            media_kind="book",
            full_metadata={"raw": {}},
        )

    def _adapter(items: Iterable[Any]) -> Iterable[ResultModel]:
        for item in items:
            sr = _ensure_search_result(item)
            metadata = dict(getattr(sr, "full_metadata", {}) or {})
            raw = metadata.get("raw")
            if isinstance(raw, dict):
                normalized = normalize_record(raw)
                for key, val in normalized.items():
                    metadata.setdefault(key, val)

            def _make_url() -> str:
                candidate = (
                    metadata.get("selection_url") or
                    metadata.get("direct_url") or
                    metadata.get("url") or
                    metadata.get("path") or
                    sr.path or
                    ""
                )
                return str(candidate or "").strip()

            selection_url = _make_url()
            if selection_url:
                metadata["selection_url"] = selection_url
            authors_value = metadata.get("authors_display") or metadata.get("authors") or metadata.get("author_name") or ""
            if isinstance(authors_value, list):
                authors_value = ", ".join(str(v) for v in authors_value if v)
            authors_text = str(authors_value or "").strip()
            if authors_text:
                metadata["authors_display"] = authors_text
            year_value = metadata.get("year") or metadata.get("first_publish_year")
            if year_value and not isinstance(year_value, str):
                year_value = str(year_value)
            if year_value:
                metadata["year"] = str(year_value)
            metadata.setdefault("openlibrary_id", metadata.get("openlibrary_id") or metadata.get("olid"))
            metadata.setdefault("source", metadata.get("source") or "openlibrary")
            yield ResultModel(
                title=str(sr.title or metadata.get("title") or selection_url or "OpenLibrary"),
                path=selection_url or None,
                metadata=metadata,
                source="openlibrary",
            )
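
    # Illustrative (uses the SAMPLE_ITEMS entry above):
    #   row = next(iter(_adapter(SAMPLE_ITEMS)))
    #   row.title                       -> "Sample OpenLibrary book"
    #   row.path                        -> "https://archive.org/download/sample.pdf"
    #                                      (direct_url outranks the page URL in _make_url)
    #   row.metadata["authors_display"] -> "OpenLibrary Demo"
    #   row.metadata["year"]            -> "2023"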

    def _columns_factory(rows: List[ResultModel]) -> List[ColumnSpec]:
        cols: List[ColumnSpec] = [title_column()]
        def _has(key: str) -> bool:
            return any((row.metadata or {}).get(key) for row in rows)

        if _has("authors_display"):
            cols.append(
                ColumnSpec(
                    "authors_display",
                    "Author",
                    lambda r: (r.metadata or {}).get("authors_display") or "",
                )
            )
        if _has("year"):
            cols.append(metadata_column("year", "Year"))
        if _has("availability"):
            cols.append(metadata_column("availability", "Avail"))
        if _has("archive_id"):
            cols.append(metadata_column("archive_id", "Archive ID"))
        if _has("openlibrary_id"):
            cols.append(metadata_column("openlibrary_id", "OLID"))
        return cols

    def _selection_fn(row: ResultModel) -> List[str]:
        metadata = row.metadata or {}
        url = str(metadata.get("selection_url") or row.path or "").strip()
        if url:
            return ["-url", url]
        return ["-title", row.title or ""]

    register_provider(
        "openlibrary",
        _adapter,
        columns=_columns_factory,
        selection_fn=_selection_fn,
        metadata={"description": "OpenLibrary search provider (JSON result table template)"},
    )
except Exception:
    pass


# Registry ---------------------------------------------------------------

_METADATA_PROVIDERS: Dict[str,

@@ -11,18 +11,29 @@ import sys
import tempfile
import time
from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from urllib.parse import urlparse

import requests

-from API.HTTP import HTTPClient
+from API.HTTP import HTTPClient, get_requests_verify_value
from ProviderCore.base import Provider, SearchResult
from SYS.utils import sanitize_filename
from SYS.cli_syntax import get_field, get_free_text, parse_query
from SYS.logger import debug, log
from Provider.metadata_provider import (
    archive_item_metadata_to_tags,
    fetch_archive_item_metadata,
)
from SYS.utils import unique_path

_ARCHIVE_VERIFY_VALUE = get_requests_verify_value()

def _create_archive_session() -> requests.Session:
    session = requests.Session()
    session.verify = _ARCHIVE_VERIFY_VALUE
    return session
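
# Illustrative: every session created through the helper inherits the verify
# setting, so later calls need no per-request verify argument.
#   s = _create_archive_session()
#   s.get("https://archive.org/metadata/sampleitem")  # uses _ARCHIVE_VERIFY_VALUE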

try:
    from Crypto.Cipher import AES  # type: ignore
    from Crypto.Util import Counter  # type: ignore
@@ -262,182 +273,6 @@ def title_hint_from_url_slug(u: str) -> str:
    return slug or "OpenLibrary"


def _coerce_archive_field_list(value: Any) -> List[str]:
    """Coerce an Archive.org metadata field to a list of strings."""
    if value is None:
        return []
    if isinstance(value, list):
        out: List[str] = []
        for v in value:
            try:
                s = str(v).strip()
            except Exception:
                continue
            if s:
                out.append(s)
        return out
    if isinstance(value, (tuple, set)):
        out = []
        for v in value:
            try:
                s = str(v).strip()
            except Exception:
                continue
            if s:
                out.append(s)
        return out
    try:
        s = str(value).strip()
    except Exception:
        return []
    return [s] if s else []


def _archive_item_metadata_to_tags(archive_id: str,
                                   item_metadata: Dict[str,
                                                       Any]) -> List[str]:
    """Map Archive.org metadata JSON (the `metadata` object) to tag strings.

    This is intentionally best-effort and conservative: it focuses on stable,
    useful bibliographic fields (title/author/publisher/ISBN/identifier/topics).
    """
    archive_id_clean = str(archive_id or "").strip()
    meta = item_metadata if isinstance(item_metadata,
                                       dict) else {}

    tags: List[str] = []
    seen: set[str] = set()

    def _add(tag: str) -> None:
        try:
            t = str(tag).strip()
        except Exception:
            return
        if not t:
            return
        if t.lower() in seen:
            return
        seen.add(t.lower())
        tags.append(t)

    if archive_id_clean:
        _add(f"internet_archive:{archive_id_clean}")

    # Title
    for title in _coerce_archive_field_list(meta.get("title"))[:1]:
        _add(f"title:{title}")

    # Authors/creators
    creators: List[str] = []
    creators.extend(_coerce_archive_field_list(meta.get("creator")))
    creators.extend(_coerce_archive_field_list(meta.get("author")))
    for creator in creators[:3]:
        _add(f"author:{creator}")

    # Publisher
    for publisher in _coerce_archive_field_list(meta.get("publisher"))[:3]:
        _add(f"publisher:{publisher}")

    # Publish date/year
    for date_val in _coerce_archive_field_list(meta.get("date"))[:1]:
        _add(f"publish_date:{date_val}")
    for year_val in _coerce_archive_field_list(meta.get("year"))[:1]:
        _add(f"publish_date:{year_val}")

    # Language
    for lang in _coerce_archive_field_list(meta.get("language"))[:3]:
        _add(f"language:{lang}")

    # Topics/subjects: follow existing OpenLibrary behavior (un-namespaced tags)
    for subj in _coerce_archive_field_list(meta.get("subject"))[:15]:
        if len(subj) > 200:
            subj = subj[:200]
        _add(subj)

    # ISBNs and identifiers
    def _clean_isbn(raw: str) -> str:
        return str(raw or "").replace("-", "").strip()

    for isbn in _coerce_archive_field_list(meta.get("isbn"))[:10]:
        isbn_clean = _clean_isbn(isbn)
        if isbn_clean:
            _add(f"isbn:{isbn_clean}")

    identifiers: List[str] = []
    identifiers.extend(_coerce_archive_field_list(meta.get("identifier")))
    identifiers.extend(_coerce_archive_field_list(meta.get("external-identifier")))
    added_other = 0
    for ident in identifiers:
        ident_s = str(ident or "").strip()
        if not ident_s:
            continue
        low = ident_s.lower()

        if low.startswith("urn:isbn:"):
            val = _clean_isbn(ident_s.split(":", 2)[-1])
            if val:
                _add(f"isbn:{val}")
            continue
        if low.startswith("isbn:"):
            val = _clean_isbn(ident_s.split(":", 1)[-1])
            if val:
                _add(f"isbn:{val}")
            continue
        if low.startswith("urn:oclc:"):
            val = ident_s.split(":", 2)[-1].strip()
            if val:
                _add(f"oclc:{val}")
            continue
        if low.startswith("oclc:"):
            val = ident_s.split(":", 1)[-1].strip()
            if val:
                _add(f"oclc:{val}")
            continue
        if low.startswith("urn:lccn:"):
            val = ident_s.split(":", 2)[-1].strip()
            if val:
                _add(f"lccn:{val}")
            continue
        if low.startswith("lccn:"):
            val = ident_s.split(":", 1)[-1].strip()
            if val:
                _add(f"lccn:{val}")
            continue
        if low.startswith("doi:"):
            val = ident_s.split(":", 1)[-1].strip()
            if val:
                _add(f"doi:{val}")
            continue

        if archive_id_clean and low == archive_id_clean.lower():
            continue
        if added_other >= 5:
            continue
        if len(ident_s) > 200:
            ident_s = ident_s[:200]
        _add(f"identifier:{ident_s}")
        added_other += 1

    return tags


def _fetch_archive_item_metadata(archive_id: str,
                                 *,
                                 timeout: int = 8) -> Dict[str,
                                                           Any]:
    ident = str(archive_id or "").strip()
    if not ident:
        return {}
    resp = requests.get(f"https://archive.org/metadata/{ident}", timeout=int(timeout))
    resp.raise_for_status()
    data = resp.json() if resp is not None else {}
    if not isinstance(data, dict):
        return {}
    meta = data.get("metadata")
    return meta if isinstance(meta,
                              dict) else {}


class OpenLibrary(Provider):

    TABLE_AUTO_STAGES = {
@@ -466,7 +301,7 @@ class OpenLibrary(Provider):

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)
-        self._session = requests.Session()
+        self._session = _create_archive_session()

    class BookNotAvailableError(Exception):
        """Raised when a book is not available for borrowing (waitlisted/in use)."""
@@ -612,7 +447,7 @@ class OpenLibrary(Provider):
    @classmethod
    def _archive_login(cls, email: str, password: str) -> requests.Session:
        """Login to archive.org using the token-based services endpoint (matches test-login.py)."""
-        session = requests.Session()
+        session = _create_archive_session()

        token_resp = session.get(
            "https://archive.org/services/account/login/",
@@ -766,7 +601,11 @@ class OpenLibrary(Provider):
        if not ident:
            return False, "no-archive-id"
        try:
-            resp = requests.get(f"https://archive.org/metadata/{ident}", timeout=8)
+            resp = requests.get(
+                f"https://archive.org/metadata/{ident}",
+                timeout=8,
+                verify=_ARCHIVE_VERIFY_VALUE,
+            )
            resp.raise_for_status()
            data = resp.json() if resp is not None else {}
            meta = data.get("metadata",
@@ -976,7 +815,11 @@ class OpenLibrary(Provider):
        """Check for a directly downloadable original PDF in Archive.org metadata."""
        try:
            metadata_url = f"https://archive.org/metadata/{book_id}"
-            response = requests.get(metadata_url, timeout=6)
+            response = requests.get(
+                metadata_url,
+                timeout=6,
+                verify=_ARCHIVE_VERIFY_VALUE,
+            )
            response.raise_for_status()
            metadata = response.json()
            files = metadata.get("files") if isinstance(metadata, dict) else None
@@ -993,7 +836,8 @@ class OpenLibrary(Provider):
            check_response = requests.head(
                pdf_url,
                timeout=4,
-                allow_redirects=True
+                allow_redirects=True,
+                verify=_ARCHIVE_VERIFY_VALUE,
            )
            if check_response.status_code == 200:
                return True, pdf_url
@@ -1001,235 +845,6 @@ class OpenLibrary(Provider):
        except Exception:
            return False, ""

    @staticmethod
    def scrape_isbn_metadata(isbn: str) -> List[str]:
        """Scrape tags for an ISBN using Open Library API.

        Returns tags such as:
        - title:<...>, author:<...>, publish_date:<...>, publisher:<...>, description:<...>, pages:<...>
        - identifiers: openlibrary:<...>, lccn:<...>, oclc:<...>, goodreads:<...>, librarything:<...>, doi:<...>, internet_archive:<...>
        """
        new_tags: List[str] = []

        isbn_clean = str(isbn or "").replace("isbn:", "").replace("-", "").strip()
        if not isbn_clean:
            return []

        url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json"
        try:
            with HTTPClient() as client:
                response = client.get(url)
                response.raise_for_status()
                data = json_module.loads(response.content.decode("utf-8"))
        except Exception as exc:
            log(f"Failed to fetch ISBN metadata: {exc}", file=sys.stderr)
            return []

        if not data:
            log(f"No ISBN metadata found for: {isbn}")
            return []

        book_data = next(iter(data.values()), None)
        if not isinstance(book_data, dict):
            return []

        if "title" in book_data:
            new_tags.append(f"title:{book_data['title']}")

        authors = book_data.get("authors")
        if isinstance(authors, list):
            for author in authors[:3]:
                if isinstance(author, dict) and author.get("name"):
                    new_tags.append(f"author:{author['name']}")

        if book_data.get("publish_date"):
            new_tags.append(f"publish_date:{book_data['publish_date']}")

        publishers = book_data.get("publishers")
        if isinstance(publishers, list) and publishers:
            pub = publishers[0]
            if isinstance(pub, dict) and pub.get("name"):
                new_tags.append(f"publisher:{pub['name']}")

        if "description" in book_data:
            desc = book_data.get("description")
            if isinstance(desc, dict) and "value" in desc:
                desc = desc.get("value")
            if desc:
                desc_str = str(desc).strip()
                if desc_str:
                    new_tags.append(f"description:{desc_str[:200]}")

        page_count = book_data.get("number_of_pages")
        if isinstance(page_count, int) and page_count > 0:
            new_tags.append(f"pages:{page_count}")

        identifiers = book_data.get("identifiers")
        if isinstance(identifiers, dict):

            def _first(value: Any) -> Any:
                if isinstance(value, list) and value:
                    return value[0]
                return value

            for key, ns in (
                ("openlibrary", "openlibrary"),
                ("lccn", "lccn"),
                ("oclc", "oclc"),
                ("goodreads", "goodreads"),
                ("librarything", "librarything"),
                ("doi", "doi"),
                ("internet_archive", "internet_archive"),
            ):
                val = _first(identifiers.get(key))
                if val:
                    new_tags.append(f"{ns}:{val}")

        debug(f"Found {len(new_tags)} tag(s) from ISBN lookup")
        return new_tags

    @staticmethod
    def scrape_openlibrary_metadata(olid: str) -> List[str]:
        """Scrape tags for an OpenLibrary ID using the .json API endpoint."""
        new_tags: List[str] = []

        olid_text = str(olid or "").strip()
        if not olid_text:
            return []

        # Normalize OLID to the common "OL<digits>M" form when possible.
        olid_norm = olid_text
        try:
            if not olid_norm.startswith("OL"):
                olid_norm = f"OL{olid_norm}"
            if not olid_norm.endswith("M"):
                olid_norm = f"{olid_norm}M"
        except Exception:
            olid_norm = olid_text

        # Ensure we always include a scrapeable identifier tag.
        new_tags.append(f"openlibrary:{olid_norm}")

        # Accept OL9674499M, 9674499M, or just digits.
        olid_clean = olid_text.replace("OL", "").replace("M", "")
        if not olid_clean.isdigit():
            olid_clean = olid_text

        if not olid_text.startswith("OL"):
            url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
        else:
            url = f"https://openlibrary.org/books/{olid_text}.json"

        try:
            with HTTPClient() as client:
                response = client.get(url)
                response.raise_for_status()
                data = json_module.loads(response.content.decode("utf-8"))
        except Exception as exc:
            log(f"Failed to fetch OpenLibrary metadata: {exc}", file=sys.stderr)
            return []

        if not isinstance(data, dict) or not data:
            log(f"No OpenLibrary metadata found for: {olid_text}")
            return []

        if "title" in data:
            new_tags.append(f"title:{data['title']}")

        authors = data.get("authors")
        if isinstance(authors, list):
            for author in authors[:3]:
                if isinstance(author, dict) and author.get("name"):
                    new_tags.append(f"author:{author['name']}")
                    continue

                # Common OL shape: {"key": "/authors/OL...A"} or {"author": {"key": ...}}
                author_key = None
                if isinstance(author, dict):
                    if isinstance(author.get("author"), dict):
                        author_key = author.get("author",
                                                {}).get("key")
                    if not author_key:
                        author_key = author.get("key")

                if isinstance(author_key, str) and author_key.startswith("/"):
                    try:
                        author_url = f"https://openlibrary.org{author_key}.json"
                        with HTTPClient(timeout=10) as client:
                            author_resp = client.get(author_url)
                            author_resp.raise_for_status()
                            author_data = json_module.loads(
                                author_resp.content.decode("utf-8")
                            )
                        if isinstance(author_data, dict) and author_data.get("name"):
                            new_tags.append(f"author:{author_data['name']}")
                            continue
                    except Exception:
                        pass

                if isinstance(author, str) and author:
                    new_tags.append(f"author:{author}")

        if data.get("publish_date"):
            new_tags.append(f"publish_date:{data['publish_date']}")

        publishers = data.get("publishers")
        if isinstance(publishers, list) and publishers:
            pub = publishers[0]
            if isinstance(pub, dict) and pub.get("name"):
                new_tags.append(f"publisher:{pub['name']}")
            elif isinstance(pub, str) and pub:
                new_tags.append(f"publisher:{pub}")

        if "description" in data:
            desc = data.get("description")
            if isinstance(desc, dict) and "value" in desc:
                desc = desc.get("value")
            if desc:
                desc_str = str(desc).strip()
                if desc_str:
                    new_tags.append(f"description:{desc_str[:200]}")

        page_count = data.get("number_of_pages")
        if isinstance(page_count, int) and page_count > 0:
            new_tags.append(f"pages:{page_count}")

        subjects = data.get("subjects")
        if isinstance(subjects, list):
            for subject in subjects[:10]:
                if isinstance(subject, str):
                    subject_clean = subject.strip()
                    if subject_clean and subject_clean not in new_tags:
                        new_tags.append(subject_clean)

        identifiers = data.get("identifiers")
        if isinstance(identifiers, dict):

            def _first(value: Any) -> Any:
                if isinstance(value, list) and value:
                    return value[0]
                return value

            for key, ns in (
                ("isbn_10", "isbn_10"),
                ("isbn_13", "isbn_13"),
                ("lccn", "lccn"),
                ("oclc_numbers", "oclc"),
                ("goodreads", "goodreads"),
                ("internet_archive", "internet_archive"),
            ):
                val = _first(identifiers.get(key))
                if val:
                    new_tags.append(f"{ns}:{val}")

        # Some editions expose a direct Archive.org identifier as "ocaid".
        ocaid = data.get("ocaid")
        if isinstance(ocaid, str) and ocaid.strip():
            new_tags.append(f"internet_archive:{ocaid.strip()}")

        debug(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup")
        return new_tags

    def search(
        self,
        query: str,
@@ -1293,7 +908,7 @@ class OpenLibrary(Provider):
                ia_val_local = []
            ia_ids_local = [str(x) for x in ia_val_local if x]

-            session_local = requests.Session()
+            session_local = _create_archive_session()

            try:
                archive_id_local = _resolve_archive_id(
@@ -1423,19 +1038,38 @@ class OpenLibrary(Provider):
                    "borrow"}:
                annotations.append(availability)

            book_path = (
                f"https://openlibrary.org/books/{edition_id}" if edition_id else
                (
                    f"https://openlibrary.org{work_key}"
                    if isinstance(work_key, str) and work_key.startswith("/") else
                    "https://openlibrary.org"
                )
            )
            metadata = {
                "openlibrary_id": edition_id,
                "openlibrary_key": work_key,
                "authors": authors_list,
                "year": year,
                "isbn_10": isbn_10,
                "isbn_13": isbn_13,
                "ia": ia_ids,
                "availability": availability,
                "availability_reason": availability_reason,
                "archive_id": archive_id,
                "direct_url": direct_url,
                "raw": doc,
            }
            if book_path:
                metadata["selection_url"] = book_path
                metadata["_selection_args"] = ["-url", book_path]
                metadata["_selection_action"] = ["download-file", "-url", book_path]

            results.append(
                SearchResult(
                    table="openlibrary",
                    title=book_title,
-                    path=(
-                        f"https://openlibrary.org/books/{edition_id}" if edition_id else
-                        (
-                            f"https://openlibrary.org{work_key}"
-                            if isinstance(work_key,
-                                          str) and work_key.startswith("/") else
-                            "https://openlibrary.org"
-                        )
-                    ),
+                    path=book_path,
                    detail=(
                        (f"By: {', '.join(authors_list)}" if authors_list else "") +
                        (f" ({year})" if year else "")
@@ -1443,20 +1077,7 @@ class OpenLibrary(Provider):
                    annotations=annotations,
                    media_kind="book",
                    columns=columns,
-                    full_metadata={
-                        "openlibrary_id": edition_id,
-                        "openlibrary_key": work_key,
-                        "authors": authors_list,
-                        "year": year,
-                        "isbn_10": isbn_10,
-                        "isbn_13": isbn_13,
-                        "ia": ia_ids,
-                        "availability": availability,
-                        "availability_reason": availability_reason,
-                        "archive_id": archive_id,
-                        "direct_url": direct_url,
-                        "raw": doc,
-                    },
+                    full_metadata=metadata,
                )
            )

@@ -1507,8 +1128,8 @@ class OpenLibrary(Provider):

            # Best-effort metadata scrape to attach bibliographic tags for downstream cmdlets.
            try:
-                archive_meta = _fetch_archive_item_metadata(archive_id)
-                tags = _archive_item_metadata_to_tags(archive_id, archive_meta)
+                archive_meta = fetch_archive_item_metadata(archive_id)
+                tags = archive_item_metadata_to_tags(archive_id, archive_meta)
                if tags:
                    try:
                        result.tag.update(tags)