Add YAPF style + ignore, and format tracked Python files

This commit is contained in:
2025-12-29 18:42:02 -08:00
parent c019c00aed
commit 507946a3e4
108 changed files with 11664 additions and 6494 deletions

View File

@@ -19,7 +19,7 @@ import requests
from API.HTTP import HTTPClient
from ProviderCore.base import Provider, SearchResult
from ProviderCore.download import download_file, sanitize_filename
from cli_syntax import get_field, get_free_text, parse_query
from SYS.cli_syntax import get_field, get_free_text, parse_query
from SYS.logger import debug, log
from SYS.utils import unique_path
@@ -52,7 +52,9 @@ def _image_paths_to_pdf_bytes(images: List[str]) -> Optional[bytes]:
continue
with Image.open(img_path) as im: # type: ignore[attr-defined]
# Ensure PDF-compatible mode.
if im.mode in {"RGBA", "LA", "P"}:
if im.mode in {"RGBA",
"LA",
"P"}:
im = im.convert("RGB")
else:
im = im.convert("RGB")
@@ -125,7 +127,8 @@ def _resolve_edition_id(doc: Dict[str, Any]) -> str:
def _check_lendable(session: requests.Session, edition_id: str) -> Tuple[bool, str]:
"""Return (lendable, status_text) using OpenLibrary volumes API."""
try:
if not edition_id or not edition_id.startswith("OL") or not edition_id.endswith("M"):
if not edition_id or not edition_id.startswith("OL") or not edition_id.endswith(
"M"):
return False, "not-an-edition"
url = f"https://openlibrary.org/api/volumes/brief/json/OLID:{edition_id}"
@@ -155,7 +158,9 @@ def _check_lendable(session: requests.Session, edition_id: str) -> Tuple[bool, s
def _resolve_archive_id(
session: requests.Session, edition_id: str, ia_candidates: List[str]
session: requests.Session,
edition_id: str,
ia_candidates: List[str]
) -> str:
# Prefer IA identifiers already present in search results.
if ia_candidates:
@@ -165,7 +170,10 @@ def _resolve_archive_id(
# Otherwise query the edition JSON.
try:
resp = session.get(f"https://openlibrary.org/books/{edition_id}.json", timeout=6)
resp = session.get(
f"https://openlibrary.org/books/{edition_id}.json",
timeout=6
)
resp.raise_for_status()
data = resp.json() or {}
@@ -206,13 +214,19 @@ def _archive_id_from_url(url: str) -> str:
# - /details/<id>/...
# - /borrow/<id>
# - /download/<id>/...
if len(parts) >= 2 and parts[0].lower() in {"details", "borrow", "download", "stream"}:
if len(parts) >= 2 and parts[0].lower() in {"details",
"borrow",
"download",
"stream"}:
return str(parts[1]).strip()
# Sometimes the identifier is the first segment.
if len(parts) >= 1:
first = str(parts[0]).strip()
if first and first.lower() not in {"account", "services", "search", "advancedsearch.php"}:
if first and first.lower() not in {"account",
"services",
"search",
"advancedsearch.php"}:
return first
return ""
@@ -249,14 +263,17 @@ def _coerce_archive_field_list(value: Any) -> List[str]:
return [s] if s else []
def _archive_item_metadata_to_tags(archive_id: str, item_metadata: Dict[str, Any]) -> List[str]:
def _archive_item_metadata_to_tags(archive_id: str,
item_metadata: Dict[str,
Any]) -> List[str]:
"""Map Archive.org metadata JSON (the `metadata` object) to tag strings.
This is intentionally best-effort and conservative: it focuses on stable,
useful bibliographic fields (title/author/publisher/ISBN/identifier/topics).
"""
archive_id_clean = str(archive_id or "").strip()
meta = item_metadata if isinstance(item_metadata, dict) else {}
meta = item_metadata if isinstance(item_metadata,
dict) else {}
tags: List[str] = []
seen: set[str] = set()
@@ -374,7 +391,10 @@ def _archive_item_metadata_to_tags(archive_id: str, item_metadata: Dict[str, Any
return tags
def _fetch_archive_item_metadata(archive_id: str, *, timeout: int = 8) -> Dict[str, Any]:
def _fetch_archive_item_metadata(archive_id: str,
*,
timeout: int = 8) -> Dict[str,
Any]:
ident = str(archive_id or "").strip()
if not ident:
return {}
@@ -384,7 +404,8 @@ def _fetch_archive_item_metadata(archive_id: str, *, timeout: int = 8) -> Dict[s
if not isinstance(data, dict):
return {}
meta = data.get("metadata")
return meta if isinstance(meta, dict) else {}
return meta if isinstance(meta,
dict) else {}
class OpenLibrary(Provider):
@@ -404,7 +425,9 @@ class OpenLibrary(Provider):
"""Raised when a book is not available for borrowing (waitlisted/in use)."""
@staticmethod
def _credential_archive(config: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
def _credential_archive(config: Dict[str,
Any]) -> Tuple[Optional[str],
Optional[str]]:
"""Get Archive.org email/password from config.
Supports:
@@ -415,9 +438,11 @@ class OpenLibrary(Provider):
if not isinstance(config, dict):
return None, None
provider_config = config.get("provider", {})
provider_config = config.get("provider",
{})
if isinstance(provider_config, dict):
openlibrary_config = provider_config.get("openlibrary", {})
openlibrary_config = provider_config.get("openlibrary",
{})
if isinstance(openlibrary_config, dict):
email = openlibrary_config.get("email")
password = openlibrary_config.get("password")
@@ -456,7 +481,10 @@ class OpenLibrary(Provider):
"""Login to archive.org using the token-based services endpoint (matches test-login.py)."""
session = requests.Session()
token_resp = session.get("https://archive.org/services/account/login/", timeout=30)
token_resp = session.get(
"https://archive.org/services/account/login/",
timeout=30
)
try:
token_json = token_resp.json()
except Exception as exc:
@@ -473,8 +501,14 @@ class OpenLibrary(Provider):
if not token:
raise RuntimeError("Archive login token missing")
headers = {"Content-Type": "application/x-www-form-urlencoded"}
payload = {"username": email, "password": password, "t": token}
headers = {
"Content-Type": "application/x-www-form-urlencoded"
}
payload = {
"username": email,
"password": password,
"t": token
}
login_resp = session.post(
"https://archive.org/services/account/login/",
@@ -499,22 +533,34 @@ class OpenLibrary(Provider):
@classmethod
def _archive_loan(
cls, session: requests.Session, book_id: str, *, verbose: bool = True
cls,
session: requests.Session,
book_id: str,
*,
verbose: bool = True
) -> requests.Session:
data = {"action": "grant_access", "identifier": book_id}
data = {
"action": "grant_access",
"identifier": book_id
}
session.post(
"https://archive.org/services/loans/loan/searchInside.php", data=data, timeout=30
"https://archive.org/services/loans/loan/searchInside.php",
data=data,
timeout=30
)
data["action"] = "browse_book"
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
response = session.post(
"https://archive.org/services/loans/loan/",
data=data,
timeout=30
)
if response.status_code == 400:
try:
err = (response.json() or {}).get("error")
if (
err
== "This book is not available to borrow at this time. Please try again later."
):
if (err ==
"This book is not available to borrow at this time. Please try again later."
):
raise cls.BookNotAvailableError("Book is waitlisted or in use")
raise RuntimeError(f"Borrow failed: {err or response.text}")
except cls.BookNotAvailableError:
@@ -523,15 +569,26 @@ class OpenLibrary(Provider):
raise RuntimeError("The book cannot be borrowed")
data["action"] = "create_token"
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
response = session.post(
"https://archive.org/services/loans/loan/",
data=data,
timeout=30
)
if "token" in (response.text or ""):
return session
raise RuntimeError("Something went wrong when trying to borrow the book")
@staticmethod
def _archive_return_loan(session: requests.Session, book_id: str) -> None:
data = {"action": "return_loan", "identifier": book_id}
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
data = {
"action": "return_loan",
"identifier": book_id
}
response = session.post(
"https://archive.org/services/loans/loan/",
data=data,
timeout=30
)
if response.status_code == 200:
try:
if (response.json() or {}).get("success"):
@@ -551,8 +608,8 @@ class OpenLibrary(Provider):
if session is None:
return
for url in (
"https://archive.org/account/logout",
"https://archive.org/account/logout.php",
"https://archive.org/account/logout",
"https://archive.org/account/logout.php",
):
try:
resp = session.get(url, timeout=15, allow_redirects=True)
@@ -579,7 +636,9 @@ class OpenLibrary(Provider):
resp = requests.get(f"https://archive.org/metadata/{ident}", timeout=8)
resp.raise_for_status()
data = resp.json() if resp is not None else {}
meta = data.get("metadata", {}) if isinstance(data, dict) else {}
meta = data.get("metadata",
{}) if isinstance(data,
dict) else {}
collection = meta.get("collection") if isinstance(meta, dict) else None
values: List[str] = []
@@ -588,16 +647,20 @@ class OpenLibrary(Provider):
elif isinstance(collection, str):
values = [collection.strip().lower()]
if any(v in {"inlibrary", "printdisabled", "lendinglibrary"} for v in values):
if any(v in {"inlibrary",
"printdisabled",
"lendinglibrary"} for v in values):
return True, "archive-collection"
return False, "archive-not-lendable"
except Exception:
return False, "archive-metadata-error"
@staticmethod
def _archive_get_book_infos(
session: requests.Session, url: str
) -> Tuple[str, List[str], Dict[str, Any]]:
def _archive_get_book_infos(session: requests.Session,
url: str) -> Tuple[str,
List[str],
Dict[str,
Any]]:
"""Extract page links from Archive.org book reader."""
r = session.get(url, timeout=30).text
@@ -620,7 +683,8 @@ class OpenLibrary(Provider):
metadata = data.get("metadata") or {}
links: List[str] = []
br_data = (data.get("brOptions") or {}).get("data", [])
br_data = (data.get("brOptions") or {}).get("data",
[])
if isinstance(br_data, list):
for item in br_data:
if isinstance(item, list):
@@ -639,7 +703,11 @@ class OpenLibrary(Provider):
return f"{directory}/{(len(str(pages)) - len(str(page))) * '0'}{page}.jpg"
@staticmethod
def _archive_deobfuscate_image(image_data: bytes, link: str, obf_header: str) -> bytes:
def _archive_deobfuscate_image(
image_data: bytes,
link: str,
obf_header: str
) -> bytes:
if not AES or not Counter:
raise RuntimeError("Crypto library not available")
@@ -657,11 +725,18 @@ class OpenLibrary(Provider):
counter_bytes = base64.b64decode(counter_b64)
if len(counter_bytes) != 16:
raise ValueError(f"Expected counter to be 16 bytes, got {len(counter_bytes)}")
raise ValueError(
f"Expected counter to be 16 bytes, got {len(counter_bytes)}"
)
prefix = counter_bytes[:8]
initial_value = int.from_bytes(counter_bytes[8:], byteorder="big")
ctr = Counter.new(64, prefix=prefix, initial_value=initial_value, little_endian=False) # type: ignore
ctr = Counter.new(
64,
prefix=prefix,
initial_value=initial_value,
little_endian=False
) # type: ignore
cipher = AES.new(key, AES.MODE_CTR, counter=ctr) # type: ignore
decrypted_part = cipher.decrypt(image_data[:1024])
@@ -699,7 +774,11 @@ class OpenLibrary(Provider):
image = cls._archive_image_name(pages, i, directory)
obf_header = response.headers.get("X-Obfuscate")
if obf_header:
image_content = cls._archive_deobfuscate_image(response.content, link, obf_header)
image_content = cls._archive_deobfuscate_image(
response.content,
link,
obf_header
)
else:
image_content = response.content
@@ -715,7 +794,9 @@ class OpenLibrary(Provider):
links: List[str],
scale: int,
book_id: str,
progress_callback: Optional[Callable[[int, int], None]] = None,
progress_callback: Optional[Callable[[int,
int],
None]] = None,
) -> List[str]:
links_scaled = [f"{link}&rotate=0&scale={scale}" for link in links]
pages = len(links_scaled)
@@ -748,7 +829,8 @@ class OpenLibrary(Provider):
except Exception:
pass
elif tqdm:
for _ in tqdm(futures.as_completed(tasks), total=len(tasks)): # type: ignore
for _ in tqdm(futures.as_completed(tasks),
total=len(tasks)): # type: ignore
pass
else:
for _ in futures.as_completed(tasks):
@@ -770,11 +852,16 @@ class OpenLibrary(Provider):
if not isinstance(file_info, dict):
continue
filename = str(file_info.get("name", ""))
if filename.endswith(".pdf") and file_info.get("source") == "original":
if filename.endswith(".pdf") and file_info.get("source"
) == "original":
pdf_url = (
f"https://archive.org/download/{book_id}/{filename.replace(' ', '%20')}"
)
check_response = requests.head(pdf_url, timeout=4, allow_redirects=True)
check_response = requests.head(
pdf_url,
timeout=4,
allow_redirects=True
)
if check_response.status_code == 200:
return True, pdf_url
return False, ""
@@ -927,7 +1014,8 @@ class OpenLibrary(Provider):
author_key = None
if isinstance(author, dict):
if isinstance(author.get("author"), dict):
author_key = author.get("author", {}).get("key")
author_key = author.get("author",
{}).get("key")
if not author_key:
author_key = author.get("key")
@@ -937,7 +1025,9 @@ class OpenLibrary(Provider):
with HTTPClient(timeout=10) as client:
author_resp = client.get(author_url)
author_resp.raise_for_status()
author_data = json_module.loads(author_resp.content.decode("utf-8"))
author_data = json_module.loads(
author_resp.content.decode("utf-8")
)
if isinstance(author_data, dict) and author_data.get("name"):
new_tags.append(f"author:{author_data['name']}")
continue
@@ -1011,7 +1101,8 @@ class OpenLibrary(Provider):
self,
query: str,
limit: int = 50,
filters: Optional[Dict[str, Any]] = None,
filters: Optional[Dict[str,
Any]] = None,
**kwargs: Any,
) -> List[SearchResult]:
filters = filters or {}
@@ -1032,7 +1123,10 @@ class OpenLibrary(Provider):
try:
resp = self._session.get(
"https://openlibrary.org/search.json",
params={"q": q, "limit": int(limit)},
params={
"q": q,
"limit": int(limit)
},
timeout=10,
)
resp.raise_for_status()
@@ -1048,9 +1142,13 @@ class OpenLibrary(Provider):
# Availability enrichment can be slow if done sequentially (it may require multiple
# network calls per row). Do it concurrently to keep the pipeline responsive.
docs = docs[: int(limit)]
docs = docs[:int(limit)]
def _compute_availability(doc_dict: Dict[str, Any]) -> Tuple[str, str, str, str]:
def _compute_availability(doc_dict: Dict[str,
Any]) -> Tuple[str,
str,
str,
str]:
edition_id_local = _resolve_edition_id(doc_dict)
if not edition_id_local:
return "no-olid", "", "", ""
@@ -1066,7 +1164,9 @@ class OpenLibrary(Provider):
try:
archive_id_local = _resolve_archive_id(
session_local, edition_id_local, ia_ids_local
session_local,
edition_id_local,
ia_ids_local
)
except Exception:
archive_id_local = ""
@@ -1089,17 +1189,23 @@ class OpenLibrary(Provider):
return "unavailable", reason_local, archive_id_local, ""
availability_rows: List[Tuple[str, str, str, str]] = [
("unknown", "", "", "") for _ in range(len(docs))
]
availability_rows: List[Tuple[str,
str,
str,
str]] = [
("unknown",
"",
"",
"") for _ in range(len(docs))
]
if docs:
max_workers = min(8, max(1, len(docs)))
done = 0
with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
future_to_index = {
executor.submit(_compute_availability, doc_dict): i
for i, doc_dict in enumerate(docs)
if isinstance(doc_dict, dict)
executor.submit(_compute_availability,
doc_dict): i
for i, doc_dict in enumerate(docs) if isinstance(doc_dict, dict)
}
for fut in futures.as_completed(list(future_to_index.keys())):
i = future_to_index[fut]
@@ -1145,11 +1251,16 @@ class OpenLibrary(Provider):
isbn_10 = next((str(i) for i in isbn_list if len(str(i)) == 10), "")
columns = [
("Title", book_title),
("Author", ", ".join(authors_list)),
("Year", year),
("Avail", ""),
("OLID", edition_id),
("Title",
book_title),
("Author",
", ".join(authors_list)),
("Year",
year),
("Avail",
""),
("OLID",
edition_id),
]
# Determine availability using the concurrently computed enrichment.
@@ -1170,7 +1281,8 @@ class OpenLibrary(Provider):
annotations.append(f"isbn_10:{isbn_10}")
if ia_ids:
annotations.append("archive")
if availability in {"download", "borrow"}:
if availability in {"download",
"borrow"}:
annotations.append(availability)
results.append(
@@ -1178,17 +1290,17 @@ class OpenLibrary(Provider):
table="openlibrary",
title=book_title,
path=(
f"https://openlibrary.org/books/{edition_id}"
if edition_id
else (
f"https://openlibrary.org/books/{edition_id}" if edition_id else
(
f"https://openlibrary.org{work_key}"
if isinstance(work_key, str) and work_key.startswith("/")
else "https://openlibrary.org"
if isinstance(work_key,
str) and work_key.startswith("/") else
"https://openlibrary.org"
)
),
detail=(
(f"By: {', '.join(authors_list)}" if authors_list else "")
+ (f" ({year})" if year else "")
(f"By: {', '.join(authors_list)}" if authors_list else "") +
(f" ({year})" if year else "")
).strip(),
annotations=annotations,
media_kind="book",
@@ -1216,7 +1328,11 @@ class OpenLibrary(Provider):
self,
result: SearchResult,
output_dir: Path,
progress_callback: Optional[Callable[[str, int, Optional[int], str], None]] = None,
progress_callback: Optional[Callable[[str,
int,
Optional[int],
str],
None]] = None,
) -> Optional[Path]:
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
@@ -1245,7 +1361,10 @@ class OpenLibrary(Provider):
archive_id = _archive_id_from_url(str(getattr(result, "path", "") or ""))
if not archive_id:
log("[openlibrary] No archive identifier available; cannot download", file=sys.stderr)
log(
"[openlibrary] No archive identifier available; cannot download",
file=sys.stderr
)
return None
# Best-effort metadata scrape to attach bibliographic tags for downstream cmdlets.
@@ -1290,12 +1409,9 @@ class OpenLibrary(Provider):
session=self._session,
progress_callback=(
(
lambda downloaded, total, label: progress_callback(
"bytes", downloaded, total, label
)
)
if progress_callback is not None
else None
lambda downloaded, total, label:
progress_callback("bytes", downloaded, total, label)
) if progress_callback is not None else None
),
)
if ok:
@@ -1307,7 +1423,10 @@ class OpenLibrary(Provider):
try:
email, password = self._credential_archive(self.config or {})
if not email or not password:
log("[openlibrary] Archive credentials missing; cannot borrow", file=sys.stderr)
log(
"[openlibrary] Archive credentials missing; cannot borrow",
file=sys.stderr
)
return None
lendable = True
@@ -1369,7 +1488,10 @@ class OpenLibrary(Provider):
continue
if not links:
log(f"[openlibrary] Failed to extract pages: {last_exc}", file=sys.stderr)
log(
f"[openlibrary] Failed to extract pages: {last_exc}",
file=sys.stderr
)
return None
try:
@@ -1388,9 +1510,10 @@ class OpenLibrary(Provider):
scale=3,
book_id=archive_id,
progress_callback=(
(lambda done, total: progress_callback("pages", done, total, "pages"))
if progress_callback is not None
else None
(
lambda done, total:
progress_callback("pages", done, total, "pages")
) if progress_callback is not None else None
),
)
@@ -1436,7 +1559,10 @@ class OpenLibrary(Provider):
try:
self._archive_return_loan(session, archive_id)
except Exception as exc:
log(f"[openlibrary] Warning: failed to return loan: {exc}", file=sys.stderr)
log(
f"[openlibrary] Warning: failed to return loan: {exc}",
file=sys.stderr
)
try:
self._archive_logout(session)
except Exception: