nose
2025-12-22 02:11:53 -08:00
parent d0b821b5dd
commit 16316bb3fd
20 changed files with 4218 additions and 2422 deletions

View File

@@ -23,6 +23,15 @@ except ImportError:
class Libgen(Provider):
"""Search provider for Library Genesis books."""
# Domains that should be routed to this provider when the user supplies a URL.
# (Used by ProviderCore.registry.match_provider_name_for_url)
URL_DOMAINS = (
"libgen.gl",
"libgen.li",
"libgen.is",
"libgen.rs",
"libgen.st",
)
def search(
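For reference, a minimal sketch of how a registry helper like match_provider_name_for_url might consult URL_DOMAINS. The iteration and naming conventions here are assumptions for illustration, not the project's actual implementation:

from typing import Iterable, Optional
from urllib.parse import urlparse

def match_provider_name_for_url(url: str, providers: Iterable[type]) -> Optional[str]:
    # Hypothetical matcher: return the first provider whose URL_DOMAINS
    # covers the URL's host, either exactly or as a parent domain.
    host = (urlparse(url).hostname or "").lower()
    for cls in providers:
        for domain in getattr(cls, "URL_DOMAINS", ()):
            if host == domain or host.endswith("." + domain):
                return cls.__name__.lower()
    return None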

View File

@@ -1,9 +1,11 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Type
from typing import Any, Dict, List, Optional, Type, cast
import requests
import sys
import json
import subprocess
from SYS.logger import log, debug
@@ -13,6 +15,12 @@ except ImportError: # pragma: no cover - optional
musicbrainzngs = None
try: # Optional dependency
import yt_dlp # type: ignore
except ImportError: # pragma: no cover - optional
yt_dlp = None
class MetadataProvider(ABC):
"""Base class for metadata providers (music, movies, books, etc.)."""
@@ -351,6 +359,157 @@ class MusicBrainzMetadataProvider(MetadataProvider):
return tags
class YtdlpMetadataProvider(MetadataProvider):
"""Metadata provider that extracts tags from a supported URL using yt-dlp.
This does NOT download media; it only probes metadata.
"""
@property
def name(self) -> str: # type: ignore[override]
return "ytdlp"
def _extract_info(self, url: str) -> Optional[Dict[str, Any]]:
url = (url or "").strip()
if not url:
return None
# Prefer Python module when available.
if yt_dlp is not None:
try:
opts: Any = {
"quiet": True,
"no_warnings": True,
"skip_download": True,
"noprogress": True,
"socket_timeout": 15,
"retries": 1,
"playlist_items": "1-10",
}
with yt_dlp.YoutubeDL(opts) as ydl: # type: ignore[attr-defined]
info = ydl.extract_info(url, download=False)
return cast(Dict[str, Any], info) if isinstance(info, dict) else None
except Exception:
pass
# Fallback to CLI.
try:
cmd = [
"yt-dlp",
"-J",
"--no-warnings",
"--skip-download",
"--playlist-items",
"1-10",
url,
]
proc = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
if proc.returncode != 0:
return None
payload = (proc.stdout or "").strip()
if not payload:
return None
data = json.loads(payload)
return data if isinstance(data, dict) else None
except Exception:
return None
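The CLI fallback can be exercised on its own. A standalone sketch using the same flags as _extract_info, with error handling trimmed to the essentials:

import json
import subprocess

def probe_metadata(url: str) -> dict | None:
    # Probe metadata with the yt-dlp CLI; -J dumps a single JSON document
    # and --skip-download ensures no media is fetched.
    cmd = ["yt-dlp", "-J", "--no-warnings", "--skip-download", "--playlist-items", "1-10", url]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
    except (OSError, subprocess.TimeoutExpired):
        return None
    if proc.returncode != 0 or not proc.stdout.strip():
        return None
    try:
        data = json.loads(proc.stdout)
    except json.JSONDecodeError:
        return None
    return data if isinstance(data, dict) else None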
def search(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
url = (query or "").strip()
if not url.startswith(("http://", "https://")):
return []
info = self._extract_info(url)
if not isinstance(info, dict):
return []
upload_date = str(info.get("upload_date") or "")
release_date = str(info.get("release_date") or "")
year = (release_date or upload_date)[:4] if (release_date or upload_date) else ""
# Provide basic columns for the standard metadata selection table.
# NOTE: This is best-effort; many extractors don't provide artist/album.
artist = (
info.get("artist")
or info.get("uploader")
or info.get("channel")
or ""
)
album = info.get("album") or info.get("playlist_title") or ""
title = info.get("title") or ""
return [
{
"title": title,
"artist": str(artist or ""),
"album": str(album or ""),
"year": str(year or ""),
"provider": self.name,
"url": url,
"raw": info,
}
]
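Assuming the provider takes an optional config dict (as the registry's cls(config) call suggests), usage looks roughly like this; the URL is a placeholder:

provider = YtdlpMetadataProvider(None)  # config shape is an assumption
rows = provider.search("https://example.com/watch?v=abc123")
# Each row mirrors the dict built above:
# {"title": ..., "artist": ..., "album": ..., "year": ...,
#  "provider": "ytdlp", "url": ..., "raw": <full extractor payload>}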
def to_tags(self, item: Dict[str, Any]) -> List[str]:
raw = item.get("raw")
if not isinstance(raw, dict):
return super().to_tags(item)
tags: List[str] = []
try:
from metadata import extract_ytdlp_tags
except Exception:
extract_ytdlp_tags = None # type: ignore[assignment]
if extract_ytdlp_tags:
try:
tags.extend(extract_ytdlp_tags(raw))
except Exception:
pass
# Subtitle availability tags
def _langs(value: Any) -> List[str]:
if not isinstance(value, dict):
return []
out: List[str] = []
for k in value.keys():
if isinstance(k, str) and k.strip():
out.append(k.strip().lower())
return sorted(set(out))
# If this is a playlist container, subtitles/captions are usually attached per-entry.
info_for_subs: Dict[str, Any] = raw
entries = raw.get("entries")
if isinstance(entries, list) and entries:
first = entries[0]
if isinstance(first, dict):
info_for_subs = first
for lang in _langs(info_for_subs.get("subtitles")):
tags.append(f"subs:{lang}")
for lang in _langs(info_for_subs.get("automatic_captions")):
tags.append(f"subs_auto:{lang}")
# Always include source tag for parity with other providers.
tags.append(f"source:{self.name}")
# Dedup case-insensitively, preserve order.
seen = set()
out: List[str] = []
for t in tags:
if not isinstance(t, str):
continue
s = t.strip()
if not s:
continue
k = s.lower()
if k in seen:
continue
seen.add(k)
out.append(s)
return out
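To illustrate the subtitle tagging and dedup above with a hypothetical playlist payload:

item = {
    "raw": {
        "entries": [
            {"subtitles": {"en": [], "ES": []}, "automatic_captions": {"en": []}},
        ],
    },
}
tags = YtdlpMetadataProvider(None).to_tags(item)
# _langs lowercases and sorts keys, so (ignoring any extract_ytdlp_tags output):
# ["subs:en", "subs:es", "subs_auto:en", "source:ytdlp"]
# A later duplicate such as "Subs:EN" would be dropped by the lowercased seen set.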
# Registry ---------------------------------------------------------------
_METADATA_PROVIDERS: Dict[str, Type[MetadataProvider]] = {
@@ -359,6 +518,7 @@ _METADATA_PROVIDERS: Dict[str, Type[MetadataProvider]] = {
"googlebooks": GoogleBooksMetadataProvider,
"google": GoogleBooksMetadataProvider,
"musicbrainz": MusicBrainzMetadataProvider,
"ytdlp": YtdlpMetadataProvider,
}
@@ -370,7 +530,7 @@ def list_metadata_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str
availability: Dict[str, bool] = {}
for name, cls in _METADATA_PROVIDERS.items():
try:
provider = cls(config)
_ = cls(config)
# Basic availability check: if construction succeeds, mark the provider available.
availability[name] = True
except Exception:

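A provider can then be resolved by name within this module. A minimal factory sketch (the project may expose its own; this one is illustrative):

def get_metadata_provider(name: str, config: Optional[Dict[str, Any]] = None) -> Optional[MetadataProvider]:
    # Look up the class in the registry and instantiate it, swallowing
    # construction errors the same way list_metadata_providers does.
    cls = _METADATA_PROVIDERS.get((name or "").strip().lower())
    if cls is None:
        return None
    try:
        return cls(config)
    except Exception:
        return None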
View File

@@ -11,7 +11,8 @@ import sys
import tempfile
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, Callable, Dict, List, Optional, Tuple
from urllib.parse import urlparse
import requests
@@ -183,7 +184,44 @@ def _resolve_archive_id(session: requests.Session, edition_id: str, ia_candidate
return ""
def _archive_id_from_url(url: str) -> str:
"""Best-effort extraction of an Archive.org item identifier from a URL."""
u = str(url or "").strip()
if not u:
return ""
try:
p = urlparse(u)
host = (p.hostname or "").lower().strip()
if not host.endswith("archive.org"):
return ""
parts = [x for x in (p.path or "").split("/") if x]
except Exception:
return ""
# Common patterns:
# - /details/<id>/...
# - /borrow/<id>
# - /download/<id>/...
if len(parts) >= 2 and parts[0].lower() in {"details", "borrow", "download", "stream"}:
return str(parts[1]).strip()
# Sometimes the identifier is the first segment.
if len(parts) >= 1:
first = str(parts[0]).strip()
if first and first.lower() not in {"account", "services", "search", "advancedsearch.php"}:
return first
return ""
class OpenLibrary(Provider):
"""Search provider for OpenLibrary books + Archive.org direct/borrow download."""
# Domains that should be routed to this provider when the user supplies a URL.
# (Used by ProviderCore.registry.match_provider_name_for_url)
URL_DOMAINS = (
"openlibrary.org",
"archive.org",
)
def __init__(self, config: Optional[Dict[str, Any]] = None):
@@ -311,6 +349,60 @@ class OpenLibrary(Provider):
pass
raise RuntimeError("Failed to return the book loan")
@staticmethod
def _archive_logout(session: requests.Session) -> None:
"""Best-effort logout from archive.org.
Archive sessions are cookie-based; returning the loan is the critical step.
Logout is attempted for cleanliness but failures should not abort the workflow.
"""
if session is None:
return
for url in (
"https://archive.org/account/logout",
"https://archive.org/account/logout.php",
):
try:
resp = session.get(url, timeout=15, allow_redirects=True)
code = int(getattr(resp, "status_code", 0) or 0)
if code and code < 500:
return
except Exception:
continue
@staticmethod
def _archive_is_lendable(book_id: str) -> tuple[bool, str]:
"""Heuristic lendable check using Archive.org item metadata.
Some lendable items do not map cleanly to an OpenLibrary edition id.
In practice, Archive metadata collections often include markers like:
- inlibrary
- printdisabled
"""
ident = str(book_id or "").strip()
if not ident:
return False, "no-archive-id"
try:
resp = requests.get(f"https://archive.org/metadata/{ident}", timeout=8)
resp.raise_for_status()
data = resp.json() if resp is not None else {}
meta = data.get("metadata", {}) if isinstance(data, dict) else {}
collection = meta.get("collection") if isinstance(meta, dict) else None
values: List[str] = []
if isinstance(collection, list):
values = [str(x).strip().lower() for x in collection if str(x).strip()]
elif isinstance(collection, str):
values = [collection.strip().lower()]
if any(v in {"inlibrary", "printdisabled", "lendinglibrary"} for v in values):
return True, "archive-collection"
return False, "archive-not-lendable"
except Exception:
return False, "archive-metadata-error"
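The metadata endpoint used here is public, so the collection check can be verified in isolation:

import requests

def collections_for(identifier: str) -> list:
    # Fetch an item's collection markers from the public metadata API.
    resp = requests.get(f"https://archive.org/metadata/{identifier}", timeout=8)
    resp.raise_for_status()
    meta = resp.json().get("metadata", {})
    collection = meta.get("collection", [])
    if isinstance(collection, str):
        collection = [collection]
    return [str(c).strip().lower() for c in collection]

# "exampleitem" is a placeholder identifier:
# lendable = bool({"inlibrary", "printdisabled", "lendinglibrary"} & set(collections_for("exampleitem")))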
@staticmethod
def _archive_get_book_infos(session: requests.Session, url: str) -> Tuple[str, List[str], Dict[str, Any]]:
"""Extract page links from Archive.org book reader."""
@@ -430,6 +522,7 @@ class OpenLibrary(Provider):
links: List[str],
scale: int,
book_id: str,
progress_callback: Optional[Callable[[int, int], None]] = None,
) -> List[str]:
links_scaled = [f"{link}&rotate=0&scale={scale}" for link in links]
pages = len(links_scaled)
@@ -448,7 +541,20 @@ class OpenLibrary(Provider):
pages=pages,
)
)
if tqdm:
if progress_callback is not None:
done = 0
total = len(tasks)
for fut in futures.as_completed(tasks):
try:
_ = fut.result()
except Exception:
pass
done += 1
try:
progress_callback(done, total)
except Exception:
pass
elif tqdm:
for _ in tqdm(futures.as_completed(tasks), total=len(tasks)): # type: ignore
pass
else:
@@ -904,15 +1010,20 @@ class OpenLibrary(Provider):
return results
def download(self, result: SearchResult, output_dir: Path) -> Optional[Path]:
def download(
self,
result: SearchResult,
output_dir: Path,
progress_callback: Optional[Callable[[str, int, Optional[int], str], None]] = None,
) -> Optional[Path]:
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
meta = result.full_metadata or {}
edition_id = str(meta.get("openlibrary_id") or "").strip()
if not edition_id:
log("[openlibrary] Missing openlibrary_id; cannot download", file=sys.stderr)
return None
# Accept direct Archive.org URLs too (details/borrow/download) even when no OL edition id is known.
archive_id = str(meta.get("archive_id") or "").strip()
ia_ids = meta.get("ia") or []
if isinstance(ia_ids, str):
@@ -921,12 +1032,23 @@ class OpenLibrary(Provider):
ia_ids = []
ia_candidates = [str(x) for x in ia_ids if x]
archive_id = _resolve_archive_id(self._session, edition_id, ia_candidates)
if not archive_id:
archive_id = _first_str(ia_candidates) or ""
if not archive_id and edition_id:
archive_id = _resolve_archive_id(self._session, edition_id, ia_candidates)
if not archive_id:
# Try to extract identifier from the SearchResult path (URL).
archive_id = _archive_id_from_url(str(getattr(result, "path", "") or ""))
if not archive_id:
log("[openlibrary] No archive identifier available; cannot download", file=sys.stderr)
return None
safe_title = sanitize_filename(result.title)
if not safe_title or "http" in safe_title.lower():
safe_title = sanitize_filename(archive_id) or "archive"
# 1) Direct download if available.
try:
@@ -935,8 +1057,22 @@ class OpenLibrary(Provider):
can_direct, pdf_url = False, ""
if can_direct and pdf_url:
try:
if progress_callback is not None:
progress_callback("step", 0, None, "direct download")
except Exception:
pass
out_path = unique_path(output_dir / f"{safe_title}.pdf")
ok = download_file(pdf_url, out_path, session=self._session)
ok = download_file(
pdf_url,
out_path,
session=self._session,
progress_callback=(
(lambda downloaded, total, label: progress_callback("bytes", downloaded, total, label))
if progress_callback is not None
else None
),
)
if ok:
return out_path
log("[openlibrary] Direct download failed", file=sys.stderr)
@@ -949,65 +1085,131 @@ class OpenLibrary(Provider):
log("[openlibrary] Archive credentials missing; cannot borrow", file=sys.stderr)
return None
lendable, reason = _check_lendable(self._session, edition_id)
lendable = True
reason = ""
if edition_id:
lendable, reason = _check_lendable(self._session, edition_id)
if not lendable:
# The OpenLibrary API can return a false negative; fall back to Archive metadata.
lendable2, reason2 = self._archive_is_lendable(archive_id)
if lendable2:
lendable, reason = True, reason2
else:
lendable, reason = self._archive_is_lendable(archive_id)
if not lendable:
log(f"[openlibrary] Not lendable: {reason}", file=sys.stderr)
return None
session = self._archive_login(email, password)
loaned = False
try:
session = self._archive_loan(session, archive_id, verbose=False)
except self.BookNotAvailableError:
log("[openlibrary] Book not available to borrow", file=sys.stderr)
return None
except Exception:
log("[openlibrary] Borrow failed", file=sys.stderr)
return None
urls = [f"https://archive.org/borrow/{archive_id}", f"https://archive.org/details/{archive_id}"]
title = safe_title
links: Optional[List[str]] = None
last_exc: Optional[Exception] = None
for u in urls:
try:
title_raw, links, _metadata = self._archive_get_book_infos(session, u)
if title_raw:
title = sanitize_filename(title_raw)
break
except Exception as exc:
last_exc = exc
continue
if not links:
log(f"[openlibrary] Failed to extract pages: {last_exc}", file=sys.stderr)
return None
temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=str(output_dir))
try:
images = self._archive_download(session=session, n_threads=10, directory=temp_dir, links=links, scale=3, book_id=archive_id)
pdf_bytes = _image_paths_to_pdf_bytes(images)
if not pdf_bytes:
# Keep images folder for manual conversion.
log("[openlibrary] PDF conversion failed; keeping images folder", file=sys.stderr)
return Path(temp_dir)
pdf_path = unique_path(output_dir / f"{title}.pdf")
with open(pdf_path, "wb") as f:
f.write(pdf_bytes)
try:
shutil.rmtree(temp_dir)
except Exception:
pass
return pdf_path
except Exception:
try:
shutil.rmtree(temp_dir)
except Exception:
pass
raise
try:
if progress_callback is not None:
progress_callback("step", 0, None, "login")
except Exception:
pass
try:
session = self._archive_loan(session, archive_id, verbose=False)
loaned = True
except self.BookNotAvailableError:
log("[openlibrary] Book not available to borrow", file=sys.stderr)
return None
except Exception:
log("[openlibrary] Borrow failed", file=sys.stderr)
return None
try:
if progress_callback is not None:
progress_callback("step", 0, None, "borrow")
except Exception:
pass
urls = [f"https://archive.org/borrow/{archive_id}", f"https://archive.org/details/{archive_id}"]
title = safe_title
links: Optional[List[str]] = None
last_exc: Optional[Exception] = None
for u in urls:
try:
title_raw, links, _metadata = self._archive_get_book_infos(session, u)
if title_raw:
title = sanitize_filename(title_raw)
break
except Exception as exc:
last_exc = exc
continue
if not links:
log(f"[openlibrary] Failed to extract pages: {last_exc}", file=sys.stderr)
return None
try:
if progress_callback is not None:
progress_callback("step", 0, None, "download pages")
except Exception:
pass
temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=str(output_dir))
try:
images = self._archive_download(
session=session,
n_threads=10,
directory=temp_dir,
links=links,
scale=3,
book_id=archive_id,
progress_callback=(
(lambda done, total: progress_callback("pages", done, total, "pages"))
if progress_callback is not None
else None
),
)
pdf_bytes = _image_paths_to_pdf_bytes(images)
if not pdf_bytes:
# Keep images folder for manual conversion.
log("[openlibrary] PDF conversion failed; keeping images folder", file=sys.stderr)
return Path(temp_dir)
try:
if progress_callback is not None:
progress_callback("step", 0, None, "stitch pdf")
except Exception:
pass
pdf_path = unique_path(output_dir / f"{title}.pdf")
with open(pdf_path, "wb") as f:
f.write(pdf_bytes)
try:
shutil.rmtree(temp_dir)
except Exception:
pass
return pdf_path
except Exception:
try:
shutil.rmtree(temp_dir)
except Exception:
pass
raise
finally:
# Always return the loan after a successful borrow, even if download/stitch fails.
if loaned:
try:
if progress_callback is not None:
progress_callback("step", 0, None, "return book")
except Exception:
pass
try:
self._archive_return_loan(session, archive_id)
except Exception as exc:
log(f"[openlibrary] Warning: failed to return loan: {exc}", file=sys.stderr)
try:
self._archive_logout(session)
except Exception:
pass
raise
except Exception as exc:
log(f"[openlibrary] Borrow workflow error: {exc}", file=sys.stderr)