This commit is contained in:
2026-01-06 16:19:29 -08:00
parent 41c11d39fd
commit edc33f4528
10 changed files with 1192 additions and 881 deletions

View File

@@ -11,18 +11,29 @@ import sys
import tempfile
import time
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from urllib.parse import urlparse
import requests
from API.HTTP import HTTPClient
from API.HTTP import HTTPClient, get_requests_verify_value
from ProviderCore.base import Provider, SearchResult
from SYS.utils import sanitize_filename
from SYS.cli_syntax import get_field, get_free_text, parse_query
from SYS.logger import debug, log
from Provider.metadata_provider import (
archive_item_metadata_to_tags,
fetch_archive_item_metadata,
)
from SYS.utils import unique_path
_ARCHIVE_VERIFY_VALUE = get_requests_verify_value()
def _create_archive_session() -> requests.Session:
    """Return a requests Session preconfigured for archive.org access.

    The session's ``verify`` attribute is set to the module-wide
    ``_ARCHIVE_VERIFY_VALUE`` (from ``get_requests_verify_value()``) so every
    request made through it uses the project's TLS verification policy.
    """
    session = requests.Session()
    session.verify = _ARCHIVE_VERIFY_VALUE
    return session
try:
from Crypto.Cipher import AES # type: ignore
from Crypto.Util import Counter # type: ignore
@@ -262,182 +273,6 @@ def title_hint_from_url_slug(u: str) -> str:
return slug or "OpenLibrary"
def _coerce_archive_field_list(value: Any) -> List[str]:
"""Coerce an Archive.org metadata field to a list of strings."""
if value is None:
return []
if isinstance(value, list):
out: List[str] = []
for v in value:
try:
s = str(v).strip()
except Exception:
continue
if s:
out.append(s)
return out
if isinstance(value, (tuple, set)):
out = []
for v in value:
try:
s = str(v).strip()
except Exception:
continue
if s:
out.append(s)
return out
try:
s = str(value).strip()
except Exception:
return []
return [s] if s else []
def _archive_item_metadata_to_tags(archive_id: str,
item_metadata: Dict[str,
Any]) -> List[str]:
"""Map Archive.org metadata JSON (the `metadata` object) to tag strings.
This is intentionally best-effort and conservative: it focuses on stable,
useful bibliographic fields (title/author/publisher/ISBN/identifier/topics).
"""
archive_id_clean = str(archive_id or "").strip()
meta = item_metadata if isinstance(item_metadata,
dict) else {}
tags: List[str] = []
seen: set[str] = set()
def _add(tag: str) -> None:
try:
t = str(tag).strip()
except Exception:
return
if not t:
return
if t.lower() in seen:
return
seen.add(t.lower())
tags.append(t)
if archive_id_clean:
_add(f"internet_archive:{archive_id_clean}")
# Title
for title in _coerce_archive_field_list(meta.get("title"))[:1]:
_add(f"title:{title}")
# Authors/creators
creators: List[str] = []
creators.extend(_coerce_archive_field_list(meta.get("creator")))
creators.extend(_coerce_archive_field_list(meta.get("author")))
for creator in creators[:3]:
_add(f"author:{creator}")
# Publisher
for publisher in _coerce_archive_field_list(meta.get("publisher"))[:3]:
_add(f"publisher:{publisher}")
# Publish date/year
for date_val in _coerce_archive_field_list(meta.get("date"))[:1]:
_add(f"publish_date:{date_val}")
for year_val in _coerce_archive_field_list(meta.get("year"))[:1]:
_add(f"publish_date:{year_val}")
# Language
for lang in _coerce_archive_field_list(meta.get("language"))[:3]:
_add(f"language:{lang}")
# Topics/subjects: follow existing OpenLibrary behavior (un-namespaced tags)
for subj in _coerce_archive_field_list(meta.get("subject"))[:15]:
if len(subj) > 200:
subj = subj[:200]
_add(subj)
# ISBNs and identifiers
def _clean_isbn(raw: str) -> str:
return str(raw or "").replace("-", "").strip()
for isbn in _coerce_archive_field_list(meta.get("isbn"))[:10]:
isbn_clean = _clean_isbn(isbn)
if isbn_clean:
_add(f"isbn:{isbn_clean}")
identifiers: List[str] = []
identifiers.extend(_coerce_archive_field_list(meta.get("identifier")))
identifiers.extend(_coerce_archive_field_list(meta.get("external-identifier")))
added_other = 0
for ident in identifiers:
ident_s = str(ident or "").strip()
if not ident_s:
continue
low = ident_s.lower()
if low.startswith("urn:isbn:"):
val = _clean_isbn(ident_s.split(":", 2)[-1])
if val:
_add(f"isbn:{val}")
continue
if low.startswith("isbn:"):
val = _clean_isbn(ident_s.split(":", 1)[-1])
if val:
_add(f"isbn:{val}")
continue
if low.startswith("urn:oclc:"):
val = ident_s.split(":", 2)[-1].strip()
if val:
_add(f"oclc:{val}")
continue
if low.startswith("oclc:"):
val = ident_s.split(":", 1)[-1].strip()
if val:
_add(f"oclc:{val}")
continue
if low.startswith("urn:lccn:"):
val = ident_s.split(":", 2)[-1].strip()
if val:
_add(f"lccn:{val}")
continue
if low.startswith("lccn:"):
val = ident_s.split(":", 1)[-1].strip()
if val:
_add(f"lccn:{val}")
continue
if low.startswith("doi:"):
val = ident_s.split(":", 1)[-1].strip()
if val:
_add(f"doi:{val}")
continue
if archive_id_clean and low == archive_id_clean.lower():
continue
if added_other >= 5:
continue
if len(ident_s) > 200:
ident_s = ident_s[:200]
_add(f"identifier:{ident_s}")
added_other += 1
return tags
def _fetch_archive_item_metadata(archive_id: str,
                                 *,
                                 timeout: int = 8) -> Dict[str, Any]:
    """Fetch the ``metadata`` object for an Archive.org item.

    Queries https://archive.org/metadata/<id> and returns the nested
    ``metadata`` dict, or ``{}`` when the id is blank or the response does
    not have the expected shape. Raises ``requests`` HTTP errors for
    non-2xx responses (``raise_for_status``).
    """
    ident = str(archive_id or "").strip()
    if not ident:
        return {}
    # Every other archive.org request in this module passes the shared TLS
    # verification setting; this one must too for consistent behavior.
    resp = requests.get(
        f"https://archive.org/metadata/{ident}",
        timeout=int(timeout),
        verify=_ARCHIVE_VERIFY_VALUE,
    )
    resp.raise_for_status()
    # requests.get never returns None, so the previous `if resp is not None`
    # guard was dead; decode the JSON body directly.
    data = resp.json()
    if not isinstance(data, dict):
        return {}
    meta = data.get("metadata")
    return meta if isinstance(meta, dict) else {}
class OpenLibrary(Provider):
TABLE_AUTO_STAGES = {
@@ -466,7 +301,7 @@ class OpenLibrary(Provider):
def __init__(self, config: Optional[Dict[str, Any]] = None):
    """Initialize the provider with a shared archive.org HTTP session.

    The session is built by ``_create_archive_session()`` so it carries the
    module-wide TLS verification setting.
    """
    super().__init__(config)
    # Previously a bare requests.Session() was constructed and immediately
    # discarded before being replaced; build the configured session once.
    self._session = _create_archive_session()
class BookNotAvailableError(Exception):
    """Raised when a book is not available for borrowing (waitlisted/in use).

    Carries no extra state beyond the message. NOTE(review): the raising
    site is not visible in this chunk - presumably the borrow flow; confirm
    callers catch this type explicitly.
    """
@@ -612,7 +447,7 @@ class OpenLibrary(Provider):
@classmethod
def _archive_login(cls, email: str, password: str) -> requests.Session:
"""Login to archive.org using the token-based services endpoint (matches test-login.py)."""
session = requests.Session()
session = _create_archive_session()
token_resp = session.get(
"https://archive.org/services/account/login/",
@@ -766,7 +601,11 @@ class OpenLibrary(Provider):
if not ident:
return False, "no-archive-id"
try:
resp = requests.get(f"https://archive.org/metadata/{ident}", timeout=8)
resp = requests.get(
f"https://archive.org/metadata/{ident}",
timeout=8,
verify=_ARCHIVE_VERIFY_VALUE,
)
resp.raise_for_status()
data = resp.json() if resp is not None else {}
meta = data.get("metadata",
@@ -976,7 +815,11 @@ class OpenLibrary(Provider):
"""Check for a directly downloadable original PDF in Archive.org metadata."""
try:
metadata_url = f"https://archive.org/metadata/{book_id}"
response = requests.get(metadata_url, timeout=6)
response = requests.get(
metadata_url,
timeout=6,
verify=_ARCHIVE_VERIFY_VALUE,
)
response.raise_for_status()
metadata = response.json()
files = metadata.get("files") if isinstance(metadata, dict) else None
@@ -993,7 +836,8 @@ class OpenLibrary(Provider):
check_response = requests.head(
pdf_url,
timeout=4,
allow_redirects=True
allow_redirects=True,
verify=_ARCHIVE_VERIFY_VALUE,
)
if check_response.status_code == 200:
return True, pdf_url
@@ -1001,235 +845,6 @@ class OpenLibrary(Provider):
except Exception:
return False, ""
@staticmethod
def scrape_isbn_metadata(isbn: str) -> List[str]:
    """Scrape tags for an ISBN using Open Library API.

    Returns tags such as:
    - title:<...>, author:<...>, publish_date:<...>, publisher:<...>, description:<...>, pages:<...>
    - identifiers: openlibrary:<...>, lccn:<...>, oclc:<...>, goodreads:<...>, librarything:<...>, doi:<...>, internet_archive:<...>

    Returns an empty list when the ISBN is blank, the fetch fails, or the
    response has an unexpected shape.
    """
    isbn_clean = str(isbn or "").replace("isbn:", "").replace("-", "").strip()
    if not isbn_clean:
        return []

    url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json"
    try:
        with HTTPClient() as client:
            response = client.get(url)
            response.raise_for_status()
            data = json_module.loads(response.content.decode("utf-8"))
    except Exception as exc:
        log(f"Failed to fetch ISBN metadata: {exc}", file=sys.stderr)
        return []

    if not data:
        log(f"No ISBN metadata found for: {isbn}")
        return []

    # The API keys results by "ISBN:<n>"; we only asked for one, so take the
    # first (and only) record.
    book_data = next(iter(data.values()), None)
    if not isinstance(book_data, dict):
        return []

    tags: List[str] = []

    if "title" in book_data:
        tags.append(f"title:{book_data['title']}")

    authors = book_data.get("authors")
    if isinstance(authors, list):
        tags.extend(
            f"author:{entry['name']}"
            for entry in authors[:3]
            if isinstance(entry, dict) and entry.get("name")
        )

    if book_data.get("publish_date"):
        tags.append(f"publish_date:{book_data['publish_date']}")

    publishers = book_data.get("publishers")
    if isinstance(publishers, list) and publishers:
        first_publisher = publishers[0]
        if isinstance(first_publisher, dict) and first_publisher.get("name"):
            tags.append(f"publisher:{first_publisher['name']}")

    if "description" in book_data:
        description = book_data.get("description")
        # Descriptions may be either a plain string or {"type": ..., "value": ...}.
        if isinstance(description, dict) and "value" in description:
            description = description.get("value")
        description_text = str(description).strip() if description else ""
        if description_text:
            tags.append(f"description:{description_text[:200]}")

    pages = book_data.get("number_of_pages")
    if isinstance(pages, int) and pages > 0:
        tags.append(f"pages:{pages}")

    identifiers = book_data.get("identifiers")
    if isinstance(identifiers, dict):
        # Identifier values may be lists; only the first entry is tagged.
        for key, namespace in (
            ("openlibrary", "openlibrary"),
            ("lccn", "lccn"),
            ("oclc", "oclc"),
            ("goodreads", "goodreads"),
            ("librarything", "librarything"),
            ("doi", "doi"),
            ("internet_archive", "internet_archive"),
        ):
            value = identifiers.get(key)
            if isinstance(value, list):
                value = value[0] if value else None
            if value:
                tags.append(f"{namespace}:{value}")

    debug(f"Found {len(tags)} tag(s) from ISBN lookup")
    return tags
@staticmethod
def scrape_openlibrary_metadata(olid: str) -> List[str]:
    """Scrape tags for an OpenLibrary ID using the .json API endpoint.

    Fetches https://openlibrary.org/books/<OLID>.json and maps the edition
    record to tag strings (title:, author:, publisher:, publish_date:,
    description:, pages:, plus identifier namespaces and raw subjects).
    Always includes an "openlibrary:<OLID>" tag; returns an empty list when
    the OLID is blank or the lookup fails.
    """
    new_tags: List[str] = []
    olid_text = str(olid or "").strip()
    if not olid_text:
        return []
    # Normalize OLID to the common "OL<digits>M" form when possible.
    olid_norm = olid_text
    try:
        if not olid_norm.startswith("OL"):
            olid_norm = f"OL{olid_norm}"
        if not olid_norm.endswith("M"):
            olid_norm = f"{olid_norm}M"
    except Exception:
        olid_norm = olid_text
    # Ensure we always include a scrapeable identifier tag.
    new_tags.append(f"openlibrary:{olid_norm}")
    # Accept OL9674499M, 9674499M, or just digits.
    olid_clean = olid_text.replace("OL", "").replace("M", "")
    if not olid_clean.isdigit():
        olid_clean = olid_text
    if not olid_text.startswith("OL"):
        url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
    else:
        url = f"https://openlibrary.org/books/{olid_text}.json"
    try:
        with HTTPClient() as client:
            response = client.get(url)
            response.raise_for_status()
            data = json_module.loads(response.content.decode("utf-8"))
    except Exception as exc:
        log(f"Failed to fetch OpenLibrary metadata: {exc}", file=sys.stderr)
        return []
    if not isinstance(data, dict) or not data:
        log(f"No OpenLibrary metadata found for: {olid_text}")
        return []
    if "title" in data:
        new_tags.append(f"title:{data['title']}")
    authors = data.get("authors")
    if isinstance(authors, list):
        for author in authors[:3]:
            # An inline author name wins outright; otherwise fall back to
            # resolving the author key, then to a plain-string author.
            if isinstance(author, dict) and author.get("name"):
                new_tags.append(f"author:{author['name']}")
                continue
            # Common OL shape: {"key": "/authors/OL...A"} or {"author": {"key": ...}}
            author_key = None
            if isinstance(author, dict):
                if isinstance(author.get("author"), dict):
                    author_key = author.get("author", {}).get("key")
                if not author_key:
                    author_key = author.get("key")
            if isinstance(author_key, str) and author_key.startswith("/"):
                # Best-effort secondary fetch of the author record; failures
                # are swallowed and we fall through to the string case below.
                try:
                    author_url = f"https://openlibrary.org{author_key}.json"
                    with HTTPClient(timeout=10) as client:
                        author_resp = client.get(author_url)
                        author_resp.raise_for_status()
                        author_data = json_module.loads(
                            author_resp.content.decode("utf-8")
                        )
                    if isinstance(author_data, dict) and author_data.get("name"):
                        new_tags.append(f"author:{author_data['name']}")
                        continue
                except Exception:
                    pass
            if isinstance(author, str) and author:
                new_tags.append(f"author:{author}")
    if data.get("publish_date"):
        new_tags.append(f"publish_date:{data['publish_date']}")
    publishers = data.get("publishers")
    if isinstance(publishers, list) and publishers:
        # Only the first publisher is tagged; it may be a dict or a string.
        pub = publishers[0]
        if isinstance(pub, dict) and pub.get("name"):
            new_tags.append(f"publisher:{pub['name']}")
        elif isinstance(pub, str) and pub:
            new_tags.append(f"publisher:{pub}")
    if "description" in data:
        desc = data.get("description")
        # Descriptions may be either a plain string or {"value": ...}.
        if isinstance(desc, dict) and "value" in desc:
            desc = desc.get("value")
        if desc:
            desc_str = str(desc).strip()
            if desc_str:
                new_tags.append(f"description:{desc_str[:200]}")
    page_count = data.get("number_of_pages")
    if isinstance(page_count, int) and page_count > 0:
        new_tags.append(f"pages:{page_count}")
    subjects = data.get("subjects")
    if isinstance(subjects, list):
        # Subjects are added un-namespaced, capped at 10, skipping duplicates.
        for subject in subjects[:10]:
            if isinstance(subject, str):
                subject_clean = subject.strip()
                if subject_clean and subject_clean not in new_tags:
                    new_tags.append(subject_clean)
    identifiers = data.get("identifiers")
    if isinstance(identifiers, dict):
        def _first(value: Any) -> Any:
            # Identifier values may be lists; take the first entry.
            if isinstance(value, list) and value:
                return value[0]
            return value
        for key, ns in (
            ("isbn_10", "isbn_10"),
            ("isbn_13", "isbn_13"),
            ("lccn", "lccn"),
            ("oclc_numbers", "oclc"),
            ("goodreads", "goodreads"),
            ("internet_archive", "internet_archive"),
        ):
            val = _first(identifiers.get(key))
            if val:
                new_tags.append(f"{ns}:{val}")
    # Some editions expose a direct Archive.org identifier as "ocaid".
    ocaid = data.get("ocaid")
    if isinstance(ocaid, str) and ocaid.strip():
        new_tags.append(f"internet_archive:{ocaid.strip()}")
    debug(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup")
    return new_tags
def search(
self,
query: str,
@@ -1293,7 +908,7 @@ class OpenLibrary(Provider):
ia_val_local = []
ia_ids_local = [str(x) for x in ia_val_local if x]
session_local = requests.Session()
session_local = _create_archive_session()
try:
archive_id_local = _resolve_archive_id(
@@ -1423,19 +1038,38 @@ class OpenLibrary(Provider):
"borrow"}:
annotations.append(availability)
book_path = (
f"https://openlibrary.org/books/{edition_id}" if edition_id else
(
f"https://openlibrary.org{work_key}"
if isinstance(work_key, str) and work_key.startswith("/") else
"https://openlibrary.org"
)
)
metadata = {
"openlibrary_id": edition_id,
"openlibrary_key": work_key,
"authors": authors_list,
"year": year,
"isbn_10": isbn_10,
"isbn_13": isbn_13,
"ia": ia_ids,
"availability": availability,
"availability_reason": availability_reason,
"archive_id": archive_id,
"direct_url": direct_url,
"raw": doc,
}
if book_path:
metadata["selection_url"] = book_path
metadata["_selection_args"] = ["-url", book_path]
metadata["_selection_action"] = ["download-file", "-url", book_path]
results.append(
SearchResult(
table="openlibrary",
title=book_title,
path=(
f"https://openlibrary.org/books/{edition_id}" if edition_id else
(
f"https://openlibrary.org{work_key}"
if isinstance(work_key,
str) and work_key.startswith("/") else
"https://openlibrary.org"
)
),
path=book_path,
detail=(
(f"By: {', '.join(authors_list)}" if authors_list else "") +
(f" ({year})" if year else "")
@@ -1443,20 +1077,7 @@ class OpenLibrary(Provider):
annotations=annotations,
media_kind="book",
columns=columns,
full_metadata={
"openlibrary_id": edition_id,
"openlibrary_key": work_key,
"authors": authors_list,
"year": year,
"isbn_10": isbn_10,
"isbn_13": isbn_13,
"ia": ia_ids,
"availability": availability,
"availability_reason": availability_reason,
"archive_id": archive_id,
"direct_url": direct_url,
"raw": doc,
},
full_metadata=metadata,
)
)
@@ -1507,8 +1128,8 @@ class OpenLibrary(Provider):
# Best-effort metadata scrape to attach bibliographic tags for downstream cmdlets.
try:
archive_meta = _fetch_archive_item_metadata(archive_id)
tags = _archive_item_metadata_to_tags(archive_id, archive_meta)
archive_meta = fetch_archive_item_metadata(archive_id)
tags = archive_item_metadata_to_tags(archive_id, archive_meta)
if tags:
try:
result.tag.update(tags)