This commit is contained in:
2026-01-01 20:37:27 -08:00
parent f3c79609d8
commit deb05c0d44
35 changed files with 5030 additions and 4879 deletions

View File

@@ -229,6 +229,30 @@ def _archive_id_from_url(url: str) -> str:
"advancedsearch.php"}:
return first
def edition_id_from_url(u: str) -> str:
    """Return the path segment that follows ``books`` in an OpenLibrary URL.

    The URL path is split on ``/`` and empty segments are dropped. When the
    first remaining segment is ``books`` (compared case-insensitively) the
    second segment, stripped of surrounding whitespace, is returned; this is
    where OpenLibrary places the edition id (``OL...M``).

    Args:
        u: URL to inspect; it is coerced with ``str()`` before parsing.

    Returns:
        The segment after ``books``, or ``""`` when the path does not start
        with ``books``, has fewer than two segments, or cannot be parsed.
    """
    try:
        url_path = urlparse(str(u)).path or ""
        segments = list(filter(None, url_path.split("/")))
    except Exception:
        # Best-effort: an unparsable URL simply has no edition id.
        return ""

    if len(segments) < 2:
        return ""

    section, candidate = segments[0], segments[1]
    if str(section).lower() != "books":
        return ""
    return str(candidate).strip()
def title_hint_from_url_slug(u: str) -> str:
    """Derive a human-friendly title hint from the last path segment of a URL.

    The final non-empty path segment is percent-decoded, its underscores are
    replaced with spaces, and surrounding whitespace is trimmed.

    Args:
        u: URL to inspect; it is coerced with ``str()`` before parsing.

    Returns:
        The cleaned-up slug, or ``"OpenLibrary"`` when the URL has no usable
        path segment or cannot be parsed.
    """
    # Function-scope import so this block does not rely on the file's
    # top-level import list already providing ``unquote``.
    from urllib.parse import unquote

    try:
        segments = [seg for seg in (urlparse(str(u)).path or "").split("/") if seg]
        # Decode %XX escapes so non-ASCII or punctuated titles read naturally
        # (e.g. "Les_Mis%C3%A9rables" -> "Les_Misérables").
        slug = unquote(segments[-1]) if segments else ""
    except Exception:
        # Best-effort helper: an unparsable URL just yields the fallback.
        slug = ""

    # Replace underscores *before* stripping; otherwise a slug such as
    # "_Dune_" keeps stray spaces and "_" becomes a whitespace-only title
    # instead of falling back to the default.
    slug = slug.replace("_", " ").strip()
    return slug or "OpenLibrary"
return ""
@@ -415,6 +439,7 @@ class OpenLibrary(Provider):
"openlibrary.org",
"archive.org",
)
URL = URL_DOMAINS
"""Search provider for OpenLibrary books + Archive.org direct/borrow download."""
def __init__(self, config: Optional[Dict[str, Any]] = None):
@@ -1419,6 +1444,64 @@ class OpenLibrary(Provider):
log("[openlibrary] Direct download failed", file=sys.stderr)
return None
# --- Convenience helpers for URL-driven downloads (used by download-file) ---
def search_result_from_url(self, url: str) -> Optional[SearchResult]:
    """Wrap a bare OpenLibrary URL in a minimal ``SearchResult``.

    The result uses the ``"openlibrary"`` table and ``"book"`` media kind,
    takes its title from ``title_hint_from_url_slug(url)`` and stores
    ``str(url)`` as its path. When ``edition_id_from_url(url)`` yields a
    non-empty id it is recorded under ``"openlibrary_id"`` in
    ``full_metadata``; otherwise ``full_metadata`` is an empty dict.

    Args:
        url: The URL to wrap.

    Returns:
        The constructed ``SearchResult``.
    """
    metadata = {}
    ol_id = edition_id_from_url(url)
    if ol_id:
        metadata["openlibrary_id"] = ol_id

    return SearchResult(
        table="openlibrary",
        title=title_hint_from_url_slug(url),
        path=str(url),
        media_kind="book",
        full_metadata=metadata,
    )
def download_url(
    self,
    url: str,
    output_dir: Path,
    progress_callback: Optional[Callable[[str, int, Optional[int], str], None]] = None,
) -> Optional[Dict[str, Any]]:
    """Download the book behind *url* into *output_dir*.

    A ``SearchResult`` is built with ``self.search_result_from_url`` and
    handed to ``self.download`` together with the output directory and the
    progress callback.

    Args:
        url: URL of the book to fetch.
        output_dir: Directory passed through to ``self.download``.
        progress_callback: Optional callback passed through unchanged to
            ``self.download``.

    Returns:
        ``{"path": Path(...), "search_result": ...}`` on success, or ``None``
        when no search result could be built or ``self.download`` returned a
        falsy value.
    """
    result = self.search_result_from_url(url)
    saved = (
        None
        if result is None
        else self.download(result, output_dir, progress_callback)
    )
    if not saved:
        return None
    return dict(path=Path(saved), search_result=result)
try:
if progress_callback is not None:
progress_callback("step", 0, None, "direct download")
except Exception:
pass
out_path = unique_path(output_dir / f"{safe_title}.pdf")
ok = download_file(
pdf_url,
out_path,
session=self._session,
progress_callback=(
(
lambda downloaded, total, label:
progress_callback("bytes", downloaded, total, label)
) if progress_callback is not None else None
),
)
if ok:
return out_path
log("[openlibrary] Direct download failed", file=sys.stderr)
return None
# 2) Borrow flow (credentials required).
try:
email, password = self._credential_archive(self.config or {})