dfd

2025-12-22 02:11:53 -08:00
parent d0b821b5dd
commit 16316bb3fd
20 changed files with 4218 additions and 2422 deletions
--- a/Provider/openlibrary.py
+++ b/Provider/openlibrary.py
@@ -11,7 +11,8 @@ import sys
 import tempfile
 import time
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple
+from urllib.parse import urlparse

 import requests

@@ -183,7 +184,44 @@ def _resolve_archive_id(session: requests.Session, edition_id: str, ia_candidate
    return ""


+def _archive_id_from_url(url: str) -> str:
+    """Best-effort extraction of an Archive.org item identifier from a URL.
+
+    Recognised path shapes (the host must be archive.org or a subdomain of it):
+    - /details/<id>/...
+    - /borrow/<id>
+    - /download/<id>/...
+    - /stream/<id>/...
+    - /<id>  (identifier as the first path segment)
+
+    Args:
+        url: Candidate URL. ``None`` and empty values are tolerated.
+
+    Returns:
+        The identifier, or "" when the URL is empty, cannot be parsed, is not
+        hosted on archive.org, or has no identifier-looking path segment.
+    """
+
+    u = str(url or "").strip()
+    if not u:
+        return ""
+    try:
+        p = urlparse(u)
+        host = (p.hostname or "").lower().strip()
+    except ValueError:
+        # urlparse rejects malformed netlocs (e.g. an unterminated IPv6 literal).
+        return ""
+
+    # Accept the exact host or a true subdomain only. A bare
+    # endswith("archive.org") would also match look-alike hosts such as
+    # "notarchive.org".
+    if host != "archive.org" and not host.endswith(".archive.org"):
+        return ""
+
+    parts = [x for x in (p.path or "").split("/") if x]
+    if not parts:
+        return ""
+
+    route_prefixes = {"details", "borrow", "download", "stream"}
+    if parts[0].lower() in route_prefixes:
+        # A route with no following segment (e.g. "/details") carries no
+        # identifier; do not report the route name itself as one.
+        return str(parts[1]).strip() if len(parts) >= 2 else ""
+
+    # Sometimes the identifier is the first segment.
+    first = str(parts[0]).strip()
+    if first and first.lower() not in {"account", "services", "search", "advancedsearch.php"}:
+        return first
+
+    return ""
+
+
 class OpenLibrary(Provider):
+    # Domains that should be routed to this provider when the user supplies a URL.
+    # (Used by ProviderCore.registry.match_provider_name_for_url)
+    URL_DOMAINS = (
+        "openlibrary.org",
+        "archive.org",
+    )
    """Search provider for OpenLibrary books + Archive.org direct/borrow download."""

    def __init__(self, config: Optional[Dict[str, Any]] = None):
@@ -311,6 +349,60 @@ class OpenLibrary(Provider):
                pass
        raise RuntimeError("Something went wrong when trying to return the book")

+    @staticmethod
+    def _archive_logout(session: requests.Session) -> None:
+        """Sign the given session out of archive.org, ignoring any failure.
+
+        Two logout endpoints are tried in order. The method stops at the first
+        one that answers with a non-zero status code below 500; an exception or
+        any other status moves on to the next endpoint. Nothing is raised and
+        nothing is returned, so a failed logout never aborts the caller.
+
+        Args:
+            session: The session to log out. ``None`` is accepted and is a no-op.
+        """
+
+        if session is None:
+            return
+
+        logout_endpoints = (
+            "https://archive.org/account/logout",
+            "https://archive.org/account/logout.php",
+        )
+        for endpoint in logout_endpoints:
+            try:
+                response = session.get(endpoint, timeout=15, allow_redirects=True)
+                status = int(getattr(response, "status_code", 0) or 0)
+            except Exception:
+                # Request or status parsing failed; try the next endpoint.
+                continue
+            if status and status < 500:
+                return
+
+    @staticmethod
+    def _archive_is_lendable(book_id: str) -> tuple[bool, str]:
+        """Guess whether an Archive.org item can be borrowed, from its metadata.
+
+        Fetches ``https://archive.org/metadata/<id>`` and inspects the
+        ``metadata.collection`` field (a list or a single string). The item is
+        treated as lendable when any collection name, compared case-insensitively,
+        is one of:
+        - inlibrary
+        - printdisabled
+        - lendinglibrary
+
+        Args:
+            book_id: Archive.org item identifier; blank/None yields a negative result.
+
+        Returns:
+            ``(lendable, reason)`` where reason is one of "no-archive-id",
+            "archive-collection", "archive-not-lendable" or
+            "archive-metadata-error" (any exception while fetching or parsing).
+        """
+
+        ident = str(book_id or "").strip()
+        if not ident:
+            return False, "no-archive-id"
+
+        lendable_markers = {"inlibrary", "printdisabled", "lendinglibrary"}
+        try:
+            resp = requests.get(f"https://archive.org/metadata/{ident}", timeout=8)
+            resp.raise_for_status()
+            payload = resp.json()
+
+            metadata = payload.get("metadata", {}) if isinstance(payload, dict) else {}
+            raw_collection = metadata.get("collection") if isinstance(metadata, dict) else None
+
+            if isinstance(raw_collection, list):
+                stripped = (str(entry).strip() for entry in raw_collection)
+                names: List[str] = [name.lower() for name in stripped if name]
+            elif isinstance(raw_collection, str):
+                names = [raw_collection.strip().lower()]
+            else:
+                names = []
+
+            if lendable_markers.isdisjoint(names):
+                return False, "archive-not-lendable"
+            return True, "archive-collection"
+        except Exception:
+            return False, "archive-metadata-error"
+
    @staticmethod
    def _archive_get_book_infos(session: requests.Session, url: str) -> Tuple[str, List[str], Dict[str, Any]]:
        """Extract page links from Archive.org book reader."""
@@ -430,6 +522,7 @@ class OpenLibrary(Provider):
        links: List[str],
        scale: int,
        book_id: str,
+        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> List[str]:
        links_scaled = [f"{link}&rotate=0&scale={scale}" for link in links]
        pages = len(links_scaled)
@@ -448,7 +541,20 @@ class OpenLibrary(Provider):
                        pages=pages,
                    )
                )
-            if tqdm:
+            if progress_callback is not None:
+                done = 0
+                total = len(tasks)
+                for fut in futures.as_completed(tasks):
+                    try:
+                        _ = fut.result()
+                    except Exception:
+                        pass
+                    done += 1
+                    try:
+                        progress_callback(done, total)
+                    except Exception:
+                        pass
+            elif tqdm:
                for _ in tqdm(futures.as_completed(tasks), total=len(tasks)):  # type: ignore
                    pass
            else:
@@ -904,15 +1010,20 @@ class OpenLibrary(Provider):

        return results

-    def download(self, result: SearchResult, output_dir: Path) -> Optional[Path]:
+    def download(
+        self,
+        result: SearchResult,
+        output_dir: Path,
+        progress_callback: Optional[Callable[[str, int, Optional[int], str], None]] = None,
+    ) -> Optional[Path]:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        meta = result.full_metadata or {}
        edition_id = str(meta.get("openlibrary_id") or "").strip()
-        if not edition_id:
-            log("[openlibrary] Missing openlibrary_id; cannot download", file=sys.stderr)
-            return None
+
+        # Accept direct Archive.org URLs too (details/borrow/download) even when no OL edition id is known.
+        archive_id = str(meta.get("archive_id") or "").strip()

        ia_ids = meta.get("ia") or []
        if isinstance(ia_ids, str):
@@ -921,12 +1032,23 @@ class OpenLibrary(Provider):
            ia_ids = []
        ia_candidates = [str(x) for x in ia_ids if x]

-        archive_id = _resolve_archive_id(self._session, edition_id, ia_candidates)
+        if not archive_id:
+            archive_id = _first_str(ia_candidates) or ""
+
+        if not archive_id and edition_id:
+            archive_id = _resolve_archive_id(self._session, edition_id, ia_candidates)
+
+        if not archive_id:
+            # Try to extract identifier from the SearchResult path (URL).
+            archive_id = _archive_id_from_url(str(getattr(result, "path", "") or ""))
+
        if not archive_id:
            log("[openlibrary] No archive identifier available; cannot download", file=sys.stderr)
            return None

        safe_title = sanitize_filename(result.title)
+        if not safe_title or "http" in safe_title.lower():
+            safe_title = sanitize_filename(archive_id) or "archive"

        # 1) Direct download if available.
        try:
@@ -935,8 +1057,22 @@ class OpenLibrary(Provider):
            can_direct, pdf_url = False, ""

        if can_direct and pdf_url:
+            try:
+                if progress_callback is not None:
+                    progress_callback("step", 0, None, "direct download")
+            except Exception:
+                pass
            out_path = unique_path(output_dir / f"{safe_title}.pdf")
-            ok = download_file(pdf_url, out_path, session=self._session)
+            ok = download_file(
+                pdf_url,
+                out_path,
+                session=self._session,
+                progress_callback=(
+                    (lambda downloaded, total, label: progress_callback("bytes", downloaded, total, label))
+                    if progress_callback is not None
+                    else None
+                ),
+            )
            if ok:
                return out_path
            log("[openlibrary] Direct download failed", file=sys.stderr)
@@ -949,65 +1085,131 @@ class OpenLibrary(Provider):
                log("[openlibrary] Archive credentials missing; cannot borrow", file=sys.stderr)
                return None

-            lendable, reason = _check_lendable(self._session, edition_id)
+            lendable = True
+            reason = ""
+            if edition_id:
+                lendable, reason = _check_lendable(self._session, edition_id)
+                if not lendable:
+                    # OpenLibrary API can be a false-negative; fall back to Archive metadata.
+                    lendable2, reason2 = self._archive_is_lendable(archive_id)
+                    if lendable2:
+                        lendable, reason = True, reason2
+            else:
+                lendable, reason = self._archive_is_lendable(archive_id)
+
            if not lendable:
                log(f"[openlibrary] Not lendable: {reason}", file=sys.stderr)
                return None

            session = self._archive_login(email, password)
+            loaned = False
            try:
-                session = self._archive_loan(session, archive_id, verbose=False)
-            except self.BookNotAvailableError:
-                log("[openlibrary] Book not available to borrow", file=sys.stderr)
-                return None
-            except Exception:
-                log("[openlibrary] Borrow failed", file=sys.stderr)
-                return None
-
-            urls = [f"https://archive.org/borrow/{archive_id}", f"https://archive.org/details/{archive_id}"]
-            title = safe_title
-            links: Optional[List[str]] = None
-            last_exc: Optional[Exception] = None
-            for u in urls:
                try:
-                    title_raw, links, _metadata = self._archive_get_book_infos(session, u)
-                    if title_raw:
-                        title = sanitize_filename(title_raw)
-                    break
-                except Exception as exc:
-                    last_exc = exc
-                    continue
-
-            if not links:
-                log(f"[openlibrary] Failed to extract pages: {last_exc}", file=sys.stderr)
-                return None
-
-            temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=str(output_dir))
-            try:
-                images = self._archive_download(session=session, n_threads=10, directory=temp_dir, links=links, scale=3, book_id=archive_id)
-
-                pdf_bytes = _image_paths_to_pdf_bytes(images)
-                if not pdf_bytes:
-                    # Keep images folder for manual conversion.
-                    log("[openlibrary] PDF conversion failed; keeping images folder", file=sys.stderr)
-                    return Path(temp_dir)
-
-                pdf_path = unique_path(output_dir / f"{title}.pdf")
-                with open(pdf_path, "wb") as f:
-                    f.write(pdf_bytes)
-
-                try:
-                    shutil.rmtree(temp_dir)
+                    if progress_callback is not None:
+                        progress_callback("step", 0, None, "login")
                except Exception:
                    pass
-                return pdf_path

-            except Exception:
                try:
-                    shutil.rmtree(temp_dir)
+                    session = self._archive_loan(session, archive_id, verbose=False)
+                    loaned = True
+                except self.BookNotAvailableError:
+                    log("[openlibrary] Book not available to borrow", file=sys.stderr)
+                    return None
+                except Exception:
+                    log("[openlibrary] Borrow failed", file=sys.stderr)
+                    return None
+
+                try:
+                    if progress_callback is not None:
+                        progress_callback("step", 0, None, "borrow")
+                except Exception:
+                    pass
+
+                urls = [f"https://archive.org/borrow/{archive_id}", f"https://archive.org/details/{archive_id}"]
+                title = safe_title
+                links: Optional[List[str]] = None
+                last_exc: Optional[Exception] = None
+                for u in urls:
+                    try:
+                        title_raw, links, _metadata = self._archive_get_book_infos(session, u)
+                        if title_raw:
+                            title = sanitize_filename(title_raw)
+                        break
+                    except Exception as exc:
+                        last_exc = exc
+                        continue
+
+                if not links:
+                    log(f"[openlibrary] Failed to extract pages: {last_exc}", file=sys.stderr)
+                    return None
+
+                try:
+                    if progress_callback is not None:
+                        progress_callback("step", 0, None, "download pages")
+                except Exception:
+                    pass
+
+                temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=str(output_dir))
+                try:
+                    images = self._archive_download(
+                        session=session,
+                        n_threads=10,
+                        directory=temp_dir,
+                        links=links,
+                        scale=3,
+                        book_id=archive_id,
+                        progress_callback=(
+                            (lambda done, total: progress_callback("pages", done, total, "pages"))
+                            if progress_callback is not None
+                            else None
+                        ),
+                    )
+
+                    pdf_bytes = _image_paths_to_pdf_bytes(images)
+                    if not pdf_bytes:
+                        # Keep images folder for manual conversion.
+                        log("[openlibrary] PDF conversion failed; keeping images folder", file=sys.stderr)
+                        return Path(temp_dir)
+
+                    try:
+                        if progress_callback is not None:
+                            progress_callback("step", 0, None, "stitch pdf")
+                    except Exception:
+                        pass
+
+                    pdf_path = unique_path(output_dir / f"{title}.pdf")
+                    with open(pdf_path, "wb") as f:
+                        f.write(pdf_bytes)
+
+                    try:
+                        shutil.rmtree(temp_dir)
+                    except Exception:
+                        pass
+                    return pdf_path
+
+                except Exception:
+                    try:
+                        shutil.rmtree(temp_dir)
+                    except Exception:
+                        pass
+                    raise
+            finally:
+                # Always return the loan after a successful borrow, even if download/stitch fails.
+                if loaned:
+                    try:
+                        if progress_callback is not None:
+                            progress_callback("step", 0, None, "return book")
+                    except Exception:
+                        pass
+                    try:
+                        self._archive_return_loan(session, archive_id)
+                    except Exception as exc:
+                        log(f"[openlibrary] Warning: failed to return loan: {exc}", file=sys.stderr)
+                try:
+                    self._archive_logout(session)
                except Exception:
                    pass
-                raise

        except Exception as exc:
            log(f"[openlibrary] Borrow workflow error: {exc}", file=sys.stderr)