nose
2025-12-22 02:11:53 -08:00
parent d0b821b5dd
commit 16316bb3fd
20 changed files with 4218 additions and 2422 deletions

View File

@@ -23,6 +23,15 @@ except ImportError:
class Libgen(Provider):
"""Search provider for Library Genesis books."""
# Domains that should be routed to this provider when the user supplies a URL.
# (Used by ProviderCore.registry.match_provider_name_for_url)
URL_DOMAINS = (
"libgen.gl",
"libgen.li",
"libgen.is",
"libgen.rs",
"libgen.st",
)
def search(
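For reference, a minimal sketch of how a registry helper like match_provider_name_for_url might consult URL_DOMAINS. The iteration and naming conventions here are assumptions for illustration, not the project's actual implementation:

from typing import Iterable, Optional
from urllib.parse import urlparse

def match_provider_name_for_url(url: str, providers: Iterable[type]) -> Optional[str]:
    # Hypothetical matcher: return the first provider whose URL_DOMAINS
    # covers the URL's host, either exactly or as a parent domain.
    host = (urlparse(url).hostname or "").lower()
    for cls in providers:
        for domain in getattr(cls, "URL_DOMAINS", ()):
            if host == domain or host.endswith("." + domain):
                return cls.__name__.lower()
    return None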

View File

@@ -1,9 +1,11 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Type
from typing import Any, Dict, List, Optional, Type, cast
import requests
import sys
import json
import subprocess
from SYS.logger import log, debug
@@ -13,6 +15,12 @@ except ImportError: # pragma: no cover - optional
musicbrainzngs = None
try: # Optional dependency
import yt_dlp # type: ignore
except ImportError: # pragma: no cover - optional
yt_dlp = None
class MetadataProvider(ABC):
"""Base class for metadata providers (music, movies, books, etc.)."""
@@ -351,6 +359,157 @@ class MusicBrainzMetadataProvider(MetadataProvider):
return tags
class YtdlpMetadataProvider(MetadataProvider):
"""Metadata provider that extracts tags from a supported URL using yt-dlp.
This does NOT download media; it only probes metadata.
"""
@property
def name(self) -> str: # type: ignore[override]
return "ytdlp"
def _extract_info(self, url: str) -> Optional[Dict[str, Any]]:
url = (url or "").strip()
if not url:
return None
# Prefer Python module when available.
if yt_dlp is not None:
try:
opts: Any = {
"quiet": True,
"no_warnings": True,
"skip_download": True,
"noprogress": True,
"socket_timeout": 15,
"retries": 1,
"playlist_items": "1-10",
}
with yt_dlp.YoutubeDL(opts) as ydl: # type: ignore[attr-defined]
info = ydl.extract_info(url, download=False)
return cast(Dict[str, Any], info) if isinstance(info, dict) else None
except Exception:
pass
# Fallback to CLI.
try:
cmd = [
"yt-dlp",
"-J",
"--no-warnings",
"--skip-download",
"--playlist-items",
"1-10",
url,
]
proc = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
if proc.returncode != 0:
return None
payload = (proc.stdout or "").strip()
if not payload:
return None
data = json.loads(payload)
return data if isinstance(data, dict) else None
except Exception:
return None
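The CLI fallback can be exercised on its own. A standalone sketch using the same flags as _extract_info, with error handling trimmed to the essentials:

import json
import subprocess

def probe_metadata(url: str) -> dict | None:
    # Probe metadata with the yt-dlp CLI; -J dumps a single JSON document
    # and --skip-download ensures no media is fetched.
    cmd = ["yt-dlp", "-J", "--no-warnings", "--skip-download", "--playlist-items", "1-10", url]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
    except (OSError, subprocess.TimeoutExpired):
        return None
    if proc.returncode != 0 or not proc.stdout.strip():
        return None
    try:
        data = json.loads(proc.stdout)
    except json.JSONDecodeError:
        return None
    return data if isinstance(data, dict) else None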
def search(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
url = (query or "").strip()
if not url.startswith(("http://", "https://")):
return []
info = self._extract_info(url)
if not isinstance(info, dict):
return []
upload_date = str(info.get("upload_date") or "")
release_date = str(info.get("release_date") or "")
year = (release_date or upload_date)[:4] if (release_date or upload_date) else ""
# Provide basic columns for the standard metadata selection table.
# NOTE: This is best-effort; many extractors don't provide artist/album.
artist = (
info.get("artist")
or info.get("uploader")
or info.get("channel")
or ""
)
album = info.get("album") or info.get("playlist_title") or ""
title = info.get("title") or ""
return [
{
"title": title,
"artist": str(artist or ""),
"album": str(album or ""),
"year": str(year or ""),
"provider": self.name,
"url": url,
"raw": info,
}
]
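Assuming the provider takes an optional config dict (as the registry's cls(config) call suggests), usage looks roughly like this; the URL is a placeholder:

provider = YtdlpMetadataProvider(None)  # config shape is an assumption
rows = provider.search("https://example.com/watch?v=abc123")
# Each row mirrors the dict built above:
# {"title": ..., "artist": ..., "album": ..., "year": ...,
#  "provider": "ytdlp", "url": ..., "raw": <full extractor payload>}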
def to_tags(self, item: Dict[str, Any]) -> List[str]:
raw = item.get("raw")
if not isinstance(raw, dict):
return super().to_tags(item)
tags: List[str] = []
try:
from metadata import extract_ytdlp_tags
except Exception:
extract_ytdlp_tags = None # type: ignore[assignment]
if extract_ytdlp_tags:
try:
tags.extend(extract_ytdlp_tags(raw))
except Exception:
pass
# Subtitle availability tags
def _langs(value: Any) -> List[str]:
if not isinstance(value, dict):
return []
out: List[str] = []
for k in value.keys():
if isinstance(k, str) and k.strip():
out.append(k.strip().lower())
return sorted(set(out))
# If this is a playlist container, subtitles/captions are usually attached per-entry.
info_for_subs: Dict[str, Any] = raw
entries = raw.get("entries")
if isinstance(entries, list) and entries:
first = entries[0]
if isinstance(first, dict):
info_for_subs = first
for lang in _langs(info_for_subs.get("subtitles")):
tags.append(f"subs:{lang}")
for lang in _langs(info_for_subs.get("automatic_captions")):
tags.append(f"subs_auto:{lang}")
# Always include source tag for parity with other providers.
tags.append(f"source:{self.name}")
# Dedup case-insensitively, preserve order.
seen = set()
out: List[str] = []
for t in tags:
if not isinstance(t, str):
continue
s = t.strip()
if not s:
continue
k = s.lower()
if k in seen:
continue
seen.add(k)
out.append(s)
return out
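To illustrate the subtitle tagging and dedup above with a hypothetical playlist payload:

item = {
    "raw": {
        "entries": [
            {"subtitles": {"en": [], "ES": []}, "automatic_captions": {"en": []}},
        ],
    },
}
tags = YtdlpMetadataProvider(None).to_tags(item)
# _langs lowercases and sorts keys, so (ignoring any extract_ytdlp_tags output):
# ["subs:en", "subs:es", "subs_auto:en", "source:ytdlp"]
# A later duplicate such as "Subs:EN" would be dropped by the lowercased seen set.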
# Registry ---------------------------------------------------------------
_METADATA_PROVIDERS: Dict[str, Type[MetadataProvider]] = {
@@ -359,6 +518,7 @@ _METADATA_PROVIDERS: Dict[str, Type[MetadataProvider]] = {
"googlebooks": GoogleBooksMetadataProvider,
"google": GoogleBooksMetadataProvider,
"musicbrainz": MusicBrainzMetadataProvider,
"ytdlp": YtdlpMetadataProvider,
}
@@ -370,7 +530,7 @@ def list_metadata_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str
availability: Dict[str, bool] = {}
for name, cls in _METADATA_PROVIDERS.items():
try:
provider = cls(config)
_ = cls(config)
# Basic availability check: if construction succeeds, mark the provider available.
availability[name] = True
except Exception:

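A provider can then be resolved by name within this module. A minimal factory sketch (the project may expose its own; this one is illustrative):

def get_metadata_provider(name: str, config: Optional[Dict[str, Any]] = None) -> Optional[MetadataProvider]:
    # Look up the class in the registry and instantiate it, swallowing
    # construction errors the same way list_metadata_providers does.
    cls = _METADATA_PROVIDERS.get((name or "").strip().lower())
    if cls is None:
        return None
    try:
        return cls(config)
    except Exception:
        return None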
View File

@@ -11,7 +11,8 @@ import sys
import tempfile
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, Callable, Dict, List, Optional, Tuple
from urllib.parse import urlparse
import requests
@@ -183,7 +184,44 @@ def _resolve_archive_id(session: requests.Session, edition_id: str, ia_candidate
return ""
def _archive_id_from_url(url: str) -> str:
"""Best-effort extraction of an Archive.org item identifier from a URL."""
u = str(url or "").strip()
if not u:
return ""
try:
p = urlparse(u)
host = (p.hostname or "").lower().strip()
if not host.endswith("archive.org"):
return ""
parts = [x for x in (p.path or "").split("/") if x]
except Exception:
return ""
# Common patterns:
# - /details/<id>/...
# - /borrow/<id>
# - /download/<id>/...
if len(parts) >= 2 and parts[0].lower() in {"details", "borrow", "download", "stream"}:
return str(parts[1]).strip()
# Sometimes the identifier is the first segment.
if len(parts) >= 1:
first = str(parts[0]).strip()
if first and first.lower() not in {"account", "services", "search", "advancedsearch.php"}:
return first
return ""
class OpenLibrary(Provider):
"""Search provider for OpenLibrary books + Archive.org direct/borrow download."""
# Domains that should be routed to this provider when the user supplies a URL.
# (Used by ProviderCore.registry.match_provider_name_for_url)
URL_DOMAINS = (
"openlibrary.org",
"archive.org",
)
def __init__(self, config: Optional[Dict[str, Any]] = None):
@@ -311,6 +349,60 @@ class OpenLibrary(Provider):
pass
raise RuntimeError("Failed to return the book loan")
@staticmethod
def _archive_logout(session: requests.Session) -> None:
"""Best-effort logout from archive.org.
Archive sessions are cookie-based; returning the loan is the critical step.
Logout is attempted for cleanliness but failures should not abort the workflow.
"""
if session is None:
return
for url in (
"https://archive.org/account/logout",
"https://archive.org/account/logout.php",
):
try:
resp = session.get(url, timeout=15, allow_redirects=True)
code = int(getattr(resp, "status_code", 0) or 0)
if code and code < 500:
return
except Exception:
continue
@staticmethod
def _archive_is_lendable(book_id: str) -> tuple[bool, str]:
"""Heuristic lendable check using Archive.org item metadata.
Some lendable items do not map cleanly to an OpenLibrary edition id.
In practice, Archive metadata collections often include markers like:
- inlibrary
- printdisabled
"""
ident = str(book_id or "").strip()
if not ident:
return False, "no-archive-id"
try:
resp = requests.get(f"https://archive.org/metadata/{ident}", timeout=8)
resp.raise_for_status()
data = resp.json() if resp is not None else {}
meta = data.get("metadata", {}) if isinstance(data, dict) else {}
collection = meta.get("collection") if isinstance(meta, dict) else None
values: List[str] = []
if isinstance(collection, list):
values = [str(x).strip().lower() for x in collection if str(x).strip()]
elif isinstance(collection, str):
values = [collection.strip().lower()]
if any(v in {"inlibrary", "printdisabled", "lendinglibrary"} for v in values):
return True, "archive-collection"
return False, "archive-not-lendable"
except Exception:
return False, "archive-metadata-error"
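The metadata endpoint used here is public, so the collection check can be verified in isolation:

import requests

def collections_for(identifier: str) -> list:
    # Fetch an item's collection markers from the public metadata API.
    resp = requests.get(f"https://archive.org/metadata/{identifier}", timeout=8)
    resp.raise_for_status()
    meta = resp.json().get("metadata", {})
    collection = meta.get("collection", [])
    if isinstance(collection, str):
        collection = [collection]
    return [str(c).strip().lower() for c in collection]

# "exampleitem" is a placeholder identifier:
# lendable = bool({"inlibrary", "printdisabled", "lendinglibrary"} & set(collections_for("exampleitem")))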
@staticmethod
def _archive_get_book_infos(session: requests.Session, url: str) -> Tuple[str, List[str], Dict[str, Any]]:
"""Extract page links from Archive.org book reader."""
@@ -430,6 +522,7 @@ class OpenLibrary(Provider):
links: List[str],
scale: int,
book_id: str,
progress_callback: Optional[Callable[[int, int], None]] = None,
) -> List[str]:
links_scaled = [f"{link}&rotate=0&scale={scale}" for link in links]
pages = len(links_scaled)
@@ -448,7 +541,20 @@ class OpenLibrary(Provider):
pages=pages,
)
)
if tqdm:
if progress_callback is not None:
done = 0
total = len(tasks)
for fut in futures.as_completed(tasks):
try:
_ = fut.result()
except Exception:
pass
done += 1
try:
progress_callback(done, total)
except Exception:
pass
elif tqdm:
for _ in tqdm(futures.as_completed(tasks), total=len(tasks)): # type: ignore
pass
else:
@@ -904,15 +1010,20 @@ class OpenLibrary(Provider):
return results
def download(self, result: SearchResult, output_dir: Path) -> Optional[Path]:
def download(
self,
result: SearchResult,
output_dir: Path,
progress_callback: Optional[Callable[[str, int, Optional[int], str], None]] = None,
) -> Optional[Path]:
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
meta = result.full_metadata or {}
edition_id = str(meta.get("openlibrary_id") or "").strip()
if not edition_id:
log("[openlibrary] Missing openlibrary_id; cannot download", file=sys.stderr)
return None
# Accept direct Archive.org URLs too (details/borrow/download) even when no OL edition id is known.
archive_id = str(meta.get("archive_id") or "").strip()
ia_ids = meta.get("ia") or []
if isinstance(ia_ids, str):
@@ -921,12 +1032,23 @@ class OpenLibrary(Provider):
ia_ids = []
ia_candidates = [str(x) for x in ia_ids if x]
archive_id = _resolve_archive_id(self._session, edition_id, ia_candidates)
if not archive_id:
archive_id = _first_str(ia_candidates) or ""
if not archive_id and edition_id:
archive_id = _resolve_archive_id(self._session, edition_id, ia_candidates)
if not archive_id:
# Try to extract identifier from the SearchResult path (URL).
archive_id = _archive_id_from_url(str(getattr(result, "path", "") or ""))
if not archive_id:
log("[openlibrary] No archive identifier available; cannot download", file=sys.stderr)
return None
safe_title = sanitize_filename(result.title)
if not safe_title or "http" in safe_title.lower():
safe_title = sanitize_filename(archive_id) or "archive"
# 1) Direct download if available.
try:
@@ -935,8 +1057,22 @@ class OpenLibrary(Provider):
can_direct, pdf_url = False, ""
if can_direct and pdf_url:
try:
if progress_callback is not None:
progress_callback("step", 0, None, "direct download")
except Exception:
pass
out_path = unique_path(output_dir / f"{safe_title}.pdf")
ok = download_file(pdf_url, out_path, session=self._session)
ok = download_file(
pdf_url,
out_path,
session=self._session,
progress_callback=(
(lambda downloaded, total, label: progress_callback("bytes", downloaded, total, label))
if progress_callback is not None
else None
),
)
if ok:
return out_path
log("[openlibrary] Direct download failed", file=sys.stderr)
@@ -949,65 +1085,131 @@ class OpenLibrary(Provider):
log("[openlibrary] Archive credentials missing; cannot borrow", file=sys.stderr)
return None
lendable, reason = _check_lendable(self._session, edition_id)
lendable = True
reason = ""
if edition_id:
lendable, reason = _check_lendable(self._session, edition_id)
if not lendable:
# The OpenLibrary API can return a false negative; fall back to Archive metadata.
lendable2, reason2 = self._archive_is_lendable(archive_id)
if lendable2:
lendable, reason = True, reason2
else:
lendable, reason = self._archive_is_lendable(archive_id)
if not lendable:
log(f"[openlibrary] Not lendable: {reason}", file=sys.stderr)
return None
session = self._archive_login(email, password)
loaned = False
try:
session = self._archive_loan(session, archive_id, verbose=False)
except self.BookNotAvailableError:
log("[openlibrary] Book not available to borrow", file=sys.stderr)
return None
except Exception:
log("[openlibrary] Borrow failed", file=sys.stderr)
return None
urls = [f"https://archive.org/borrow/{archive_id}", f"https://archive.org/details/{archive_id}"]
title = safe_title
links: Optional[List[str]] = None
last_exc: Optional[Exception] = None
for u in urls:
try:
title_raw, links, _metadata = self._archive_get_book_infos(session, u)
if title_raw:
title = sanitize_filename(title_raw)
break
except Exception as exc:
last_exc = exc
continue
if not links:
log(f"[openlibrary] Failed to extract pages: {last_exc}", file=sys.stderr)
return None
temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=str(output_dir))
try:
images = self._archive_download(session=session, n_threads=10, directory=temp_dir, links=links, scale=3, book_id=archive_id)
pdf_bytes = _image_paths_to_pdf_bytes(images)
if not pdf_bytes:
# Keep images folder for manual conversion.
log("[openlibrary] PDF conversion failed; keeping images folder", file=sys.stderr)
return Path(temp_dir)
pdf_path = unique_path(output_dir / f"{title}.pdf")
with open(pdf_path, "wb") as f:
f.write(pdf_bytes)
try:
shutil.rmtree(temp_dir)
except Exception:
pass
return pdf_path
except Exception:
try:
shutil.rmtree(temp_dir)
except Exception:
pass
raise
try:
if progress_callback is not None:
progress_callback("step", 0, None, "login")
except Exception:
pass
try:
session = self._archive_loan(session, archive_id, verbose=False)
loaned = True
except self.BookNotAvailableError:
log("[openlibrary] Book not available to borrow", file=sys.stderr)
return None
except Exception:
log("[openlibrary] Borrow failed", file=sys.stderr)
return None
try:
if progress_callback is not None:
progress_callback("step", 0, None, "borrow")
except Exception:
pass
urls = [f"https://archive.org/borrow/{archive_id}", f"https://archive.org/details/{archive_id}"]
title = safe_title
links: Optional[List[str]] = None
last_exc: Optional[Exception] = None
for u in urls:
try:
title_raw, links, _metadata = self._archive_get_book_infos(session, u)
if title_raw:
title = sanitize_filename(title_raw)
break
except Exception as exc:
last_exc = exc
continue
if not links:
log(f"[openlibrary] Failed to extract pages: {last_exc}", file=sys.stderr)
return None
try:
if progress_callback is not None:
progress_callback("step", 0, None, "download pages")
except Exception:
pass
temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=str(output_dir))
try:
images = self._archive_download(
session=session,
n_threads=10,
directory=temp_dir,
links=links,
scale=3,
book_id=archive_id,
progress_callback=(
(lambda done, total: progress_callback("pages", done, total, "pages"))
if progress_callback is not None
else None
),
)
pdf_bytes = _image_paths_to_pdf_bytes(images)
if not pdf_bytes:
# Keep images folder for manual conversion.
log("[openlibrary] PDF conversion failed; keeping images folder", file=sys.stderr)
return Path(temp_dir)
try:
if progress_callback is not None:
progress_callback("step", 0, None, "stitch pdf")
except Exception:
pass
pdf_path = unique_path(output_dir / f"{title}.pdf")
with open(pdf_path, "wb") as f:
f.write(pdf_bytes)
try:
shutil.rmtree(temp_dir)
except Exception:
pass
return pdf_path
except Exception:
try:
shutil.rmtree(temp_dir)
except Exception:
pass
raise
finally:
# Always return the loan after a successful borrow, even if download/stitch fails.
if loaned:
try:
if progress_callback is not None:
progress_callback("step", 0, None, "return book")
except Exception:
pass
try:
self._archive_return_loan(session, archive_id)
except Exception as exc:
log(f"[openlibrary] Warning: failed to return loan: {exc}", file=sys.stderr)
try:
self._archive_logout(session)
except Exception:
pass
raise
except Exception as exc:
log(f"[openlibrary] Borrow workflow error: {exc}", file=sys.stderr)