Add yt-dlp metadata provider, URL-domain routing for providers, and OpenLibrary borrow/progress improvements
@@ -23,6 +23,15 @@ except ImportError:
 
 
 class Libgen(Provider):
+    # Domains that should be routed to this provider when the user supplies a URL.
+    # (Used by ProviderCore.registry.match_provider_name_for_url)
+    URL_DOMAINS = (
+        "libgen.gl",
+        "libgen.li",
+        "libgen.is",
+        "libgen.rs",
+        "libgen.st",
+    )
     """Search provider for Library Genesis books."""
 
     def search(
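For orientation: `URL_DOMAINS` is consumed by `ProviderCore.registry.match_provider_name_for_url`, which this diff does not show. A minimal sketch of how such routing could work, assuming plain hostname-suffix matching (the function below is illustrative, not the real registry code):

```python
# Illustrative only: route a pasted URL to a provider class via URL_DOMAINS.
# The real matcher is ProviderCore.registry.match_provider_name_for_url.
from typing import Iterable, Optional, Type
from urllib.parse import urlparse


def match_provider_for_url(url: str, providers: Iterable[Type]) -> Optional[Type]:
    host = (urlparse(url).hostname or "").lower()
    for cls in providers:
        for domain in getattr(cls, "URL_DOMAINS", ()):
            if host == domain or host.endswith("." + domain):
                return cls
    return None
```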
@@ -1,9 +1,11 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from typing import Any, Dict, List, Optional, Type
+from typing import Any, Dict, List, Optional, Type, cast
 import requests
 import sys
+import json
+import subprocess
 
 from SYS.logger import log, debug
 
@@ -13,6 +15,12 @@ except ImportError:  # pragma: no cover - optional
     musicbrainzngs = None
 
 
+try:  # Optional dependency
+    import yt_dlp  # type: ignore
+except ImportError:  # pragma: no cover - optional
+    yt_dlp = None
+
+
 class MetadataProvider(ABC):
     """Base class for metadata providers (music, movies, books, etc.)."""
 
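The guard mirrors the existing `musicbrainzngs` import above it: the module name is bound to `None` when the package is absent, and call sites branch on that instead of re-importing at use time. Standalone, the pattern is:

```python
# Optional-dependency guard: yt_dlp is None when the package is not installed,
# so callers can test availability without a second import attempt.
try:
    import yt_dlp  # type: ignore
except ImportError:
    yt_dlp = None


def ytdlp_available() -> bool:
    return yt_dlp is not None
```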
@@ -351,6 +359,157 @@ class MusicBrainzMetadataProvider(MetadataProvider):
         return tags
 
 
+class YtdlpMetadataProvider(MetadataProvider):
+    """Metadata provider that extracts tags from a supported URL using yt-dlp.
+
+    This does NOT download media; it only probes metadata.
+    """
+
+    @property
+    def name(self) -> str:  # type: ignore[override]
+        return "ytdlp"
+
+    def _extract_info(self, url: str) -> Optional[Dict[str, Any]]:
+        url = (url or "").strip()
+        if not url:
+            return None
+
+        # Prefer Python module when available.
+        if yt_dlp is not None:
+            try:
+                opts: Any = {
+                    "quiet": True,
+                    "no_warnings": True,
+                    "skip_download": True,
+                    "noprogress": True,
+                    "socket_timeout": 15,
+                    "retries": 1,
+                    "playlist_items": "1-10",
+                }
+                with yt_dlp.YoutubeDL(opts) as ydl:  # type: ignore[attr-defined]
+                    info = ydl.extract_info(url, download=False)
+                return cast(Dict[str, Any], info) if isinstance(info, dict) else None
+            except Exception:
+                pass
+
+        # Fallback to CLI.
+        try:
+            cmd = [
+                "yt-dlp",
+                "-J",
+                "--no-warnings",
+                "--skip-download",
+                "--playlist-items",
+                "1-10",
+                url,
+            ]
+            proc = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
+            if proc.returncode != 0:
+                return None
+            payload = (proc.stdout or "").strip()
+            if not payload:
+                return None
+            data = json.loads(payload)
+            return data if isinstance(data, dict) else None
+        except Exception:
+            return None
+
+    def search(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
+        url = (query or "").strip()
+        if not url.startswith(("http://", "https://")):
+            return []
+
+        info = self._extract_info(url)
+        if not isinstance(info, dict):
+            return []
+
+        upload_date = str(info.get("upload_date") or "")
+        release_date = str(info.get("release_date") or "")
+        year = (release_date or upload_date)[:4] if (release_date or upload_date) else ""
+
+        # Provide basic columns for the standard metadata selection table.
+        # NOTE: This is best-effort; many extractors don't provide artist/album.
+        artist = (
+            info.get("artist")
+            or info.get("uploader")
+            or info.get("channel")
+            or ""
+        )
+        album = info.get("album") or info.get("playlist_title") or ""
+        title = info.get("title") or ""
+
+        return [
+            {
+                "title": title,
+                "artist": str(artist or ""),
+                "album": str(album or ""),
+                "year": str(year or ""),
+                "provider": self.name,
+                "url": url,
+                "raw": info,
+            }
+        ]
+
+    def to_tags(self, item: Dict[str, Any]) -> List[str]:
+        raw = item.get("raw")
+        if not isinstance(raw, dict):
+            return super().to_tags(item)
+
+        tags: List[str] = []
+        try:
+            from metadata import extract_ytdlp_tags
+        except Exception:
+            extract_ytdlp_tags = None  # type: ignore[assignment]
+
+        if extract_ytdlp_tags:
+            try:
+                tags.extend(extract_ytdlp_tags(raw))
+            except Exception:
+                pass
+
+        # Subtitle availability tags
+        def _langs(value: Any) -> List[str]:
+            if not isinstance(value, dict):
+                return []
+            out: List[str] = []
+            for k in value.keys():
+                if isinstance(k, str) and k.strip():
+                    out.append(k.strip().lower())
+            return sorted(set(out))
+
+        # If this is a playlist container, subtitles/captions are usually per-entry.
+        info_for_subs: Dict[str, Any] = raw
+        entries = raw.get("entries")
+        if isinstance(entries, list) and entries:
+            first = entries[0]
+            if isinstance(first, dict):
+                info_for_subs = first
+
+        for lang in _langs(info_for_subs.get("subtitles")):
+            tags.append(f"subs:{lang}")
+        for lang in _langs(info_for_subs.get("automatic_captions")):
+            tags.append(f"subs_auto:{lang}")
+
+        # Always include source tag for parity with other providers.
+        tags.append(f"source:{self.name}")
+
+        # Dedup case-insensitively, preserve order.
+        seen = set()
+        out: List[str] = []
+        for t in tags:
+            if not isinstance(t, str):
+                continue
+            s = t.strip()
+            if not s:
+                continue
+            k = s.lower()
+            if k in seen:
+                continue
+            seen.add(k)
+            out.append(s)
+        return out
+
+
 # Registry ---------------------------------------------------------------
 
 _METADATA_PROVIDERS: Dict[str, Type[MetadataProvider]] = {
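Stripped of the provider plumbing, the probe is ordinary yt-dlp usage: `YoutubeDL.extract_info(url, download=False)` returns the metadata dict without fetching media. A minimal standalone version (the URL is a placeholder; any extractor-supported URL works):

```python
# Metadata-only probe, no media download (assumes yt-dlp is installed).
import yt_dlp

opts = {"quiet": True, "no_warnings": True, "skip_download": True}
with yt_dlp.YoutubeDL(opts) as ydl:
    info = ydl.extract_info("https://www.youtube.com/watch?v=EXAMPLE", download=False)

if isinstance(info, dict):
    # upload_date/release_date are YYYYMMDD strings; the provider keeps the year.
    print(info.get("title"), info.get("uploader"), str(info.get("upload_date") or "")[:4])
```

The CLI fallback relies on the equivalent `yt-dlp -J --skip-download <url>`, which prints the same structure as a single JSON document.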
@@ -359,6 +518,7 @@ _METADATA_PROVIDERS: Dict[str, Type[MetadataProvider]] = {
     "googlebooks": GoogleBooksMetadataProvider,
     "google": GoogleBooksMetadataProvider,
     "musicbrainz": MusicBrainzMetadataProvider,
+    "ytdlp": YtdlpMetadataProvider,
 }
 
 
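Lookup is by plain dict key: a caller resolves a name, constructs the class with its config, and searches. A sketch of that flow (whether `None` is an acceptable config is an assumption; `list_metadata_providers` below passes the caller's config through the same way):

```python
# Hypothetical call flow for the registry: name -> class -> instance -> search.
provider_cls = _METADATA_PROVIDERS.get("ytdlp")
if provider_cls is not None:
    provider = provider_cls(None)  # assumes a config of None is acceptable
    rows = provider.search("https://www.youtube.com/watch?v=EXAMPLE")
    for row in rows:
        print(row["title"], row["year"], row["provider"])
```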
@@ -370,7 +530,7 @@ def list_metadata_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str
     availability: Dict[str, bool] = {}
     for name, cls in _METADATA_PROVIDERS.items():
         try:
-            provider = cls(config)
+            _ = cls(config)
             # Basic availability check: perform lightweight validation if defined
             availability[name] = True
         except Exception:
@@ -11,7 +11,8 @@ import sys
 import tempfile
 import time
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple
+from urllib.parse import urlparse
 
 import requests
 
@@ -183,7 +184,44 @@ def _resolve_archive_id(session: requests.Session, edition_id: str, ia_candidate
     return ""
 
 
+def _archive_id_from_url(url: str) -> str:
+    """Best-effort extraction of an Archive.org item identifier from a URL."""
+
+    u = str(url or "").strip()
+    if not u:
+        return ""
+    try:
+        p = urlparse(u)
+        host = (p.hostname or "").lower().strip()
+        if not host.endswith("archive.org"):
+            return ""
+        parts = [x for x in (p.path or "").split("/") if x]
+    except Exception:
+        return ""
+
+    # Common patterns:
+    # - /details/<id>/...
+    # - /borrow/<id>
+    # - /download/<id>/...
+    if len(parts) >= 2 and parts[0].lower() in {"details", "borrow", "download", "stream"}:
+        return str(parts[1]).strip()
+
+    # Sometimes the identifier is the first segment.
+    if len(parts) >= 1:
+        first = str(parts[0]).strip()
+        if first and first.lower() not in {"account", "services", "search", "advancedsearch.php"}:
+            return first
+
+    return ""
+
+
 class OpenLibrary(Provider):
+    # Domains that should be routed to this provider when the user supplies a URL.
+    # (Used by ProviderCore.registry.match_provider_name_for_url)
+    URL_DOMAINS = (
+        "openlibrary.org",
+        "archive.org",
+    )
     """Search provider for OpenLibrary books + Archive.org direct/borrow download."""
 
     def __init__(self, config: Optional[Dict[str, Any]] = None):
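Given the parsing rules above, the helper's expected behavior on common URL shapes looks like this (identifiers are made up):

```python
# Expected behavior of _archive_id_from_url on typical inputs.
assert _archive_id_from_url("https://archive.org/details/exampleitem/page/n5") == "exampleitem"
assert _archive_id_from_url("https://archive.org/borrow/exampleitem") == "exampleitem"
assert _archive_id_from_url("https://ia800300.us.archive.org/download/exampleitem/x.pdf") == "exampleitem"
assert _archive_id_from_url("https://archive.org/exampleitem") == "exampleitem"  # bare first segment
assert _archive_id_from_url("https://archive.org/account/login") == ""           # excluded segment
assert _archive_id_from_url("https://example.com/details/exampleitem") == ""     # non-archive host
```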
@@ -311,6 +349,60 @@ class OpenLibrary(Provider):
             pass
         raise RuntimeError("Something went wrong when trying to return the book")
 
+    @staticmethod
+    def _archive_logout(session: requests.Session) -> None:
+        """Best-effort logout from archive.org.
+
+        Archive sessions are cookie-based; returning the loan is the critical step.
+        Logout is attempted for cleanliness, but failures should not abort the workflow.
+        """
+
+        if session is None:
+            return
+        for url in (
+            "https://archive.org/account/logout",
+            "https://archive.org/account/logout.php",
+        ):
+            try:
+                resp = session.get(url, timeout=15, allow_redirects=True)
+                code = int(getattr(resp, "status_code", 0) or 0)
+                if code and code < 500:
+                    return
+            except Exception:
+                continue
+
+    @staticmethod
+    def _archive_is_lendable(book_id: str) -> tuple[bool, str]:
+        """Heuristic lendable check using Archive.org item metadata.
+
+        Some lendable items do not map cleanly to an OpenLibrary edition id.
+        In practice, Archive metadata collections often include markers like:
+        - inlibrary
+        - printdisabled
+        """
+
+        ident = str(book_id or "").strip()
+        if not ident:
+            return False, "no-archive-id"
+        try:
+            resp = requests.get(f"https://archive.org/metadata/{ident}", timeout=8)
+            resp.raise_for_status()
+            data = resp.json() if resp is not None else {}
+            meta = data.get("metadata", {}) if isinstance(data, dict) else {}
+            collection = meta.get("collection") if isinstance(meta, dict) else None
+
+            values: List[str] = []
+            if isinstance(collection, list):
+                values = [str(x).strip().lower() for x in collection if str(x).strip()]
+            elif isinstance(collection, str):
+                values = [collection.strip().lower()]
+
+            if any(v in {"inlibrary", "printdisabled", "lendinglibrary"} for v in values):
+                return True, "archive-collection"
+            return False, "archive-not-lendable"
+        except Exception:
+            return False, "archive-metadata-error"
+
     @staticmethod
     def _archive_get_book_infos(session: requests.Session, url: str) -> Tuple[str, List[str], Dict[str, Any]]:
         """Extract page links from Archive.org book reader."""
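The lendable check is just the public metadata endpoint plus a collection scan; outside the class it reduces to the following (the identifier is illustrative):

```python
# Collection heuristic against the public endpoint; "exampleitem" is made up.
import requests

resp = requests.get("https://archive.org/metadata/exampleitem", timeout=8)
resp.raise_for_status()
collections = (resp.json().get("metadata") or {}).get("collection") or []
if isinstance(collections, str):
    collections = [collections]
lendable = any(str(c).strip().lower() in {"inlibrary", "printdisabled", "lendinglibrary"}
               for c in collections)
```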
@@ -430,6 +522,7 @@
         links: List[str],
         scale: int,
         book_id: str,
+        progress_callback: Optional[Callable[[int, int], None]] = None,
     ) -> List[str]:
         links_scaled = [f"{link}&rotate=0&scale={scale}" for link in links]
         pages = len(links_scaled)
@@ -448,7 +541,20 @@
                         pages=pages,
                     )
                 )
-        if tqdm:
+        if progress_callback is not None:
+            done = 0
+            total = len(tasks)
+            for fut in futures.as_completed(tasks):
+                try:
+                    _ = fut.result()
+                except Exception:
+                    pass
+                done += 1
+                try:
+                    progress_callback(done, total)
+                except Exception:
+                    pass
+        elif tqdm:
             for _ in tqdm(futures.as_completed(tasks), total=len(tasks)):  # type: ignore
                 pass
         else:
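The new branch trades the tqdm bar for explicit accounting: drain `as_completed`, swallow per-page errors (the worker already handled them), and report a monotonically increasing count. In isolation:

```python
# Completion counting with concurrent.futures, as in the new branch above.
from concurrent import futures

def run_with_progress(jobs, progress):
    with futures.ThreadPoolExecutor(max_workers=4) as pool:
        tasks = [pool.submit(job) for job in jobs]
        total = len(tasks)
        for done, fut in enumerate(futures.as_completed(tasks), start=1):
            try:
                fut.result()  # failures were already logged by the worker
            except Exception:
                pass
            progress(done, total)

run_with_progress([lambda: None] * 5, lambda d, t: print(f"{d}/{t} pages"))
```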
@@ -904,15 +1010,20 @@
 
         return results
 
-    def download(self, result: SearchResult, output_dir: Path) -> Optional[Path]:
+    def download(
+        self,
+        result: SearchResult,
+        output_dir: Path,
+        progress_callback: Optional[Callable[[str, int, Optional[int], str], None]] = None,
+    ) -> Optional[Path]:
         output_dir = Path(output_dir)
         output_dir.mkdir(parents=True, exist_ok=True)
 
         meta = result.full_metadata or {}
         edition_id = str(meta.get("openlibrary_id") or "").strip()
-        if not edition_id:
-            log("[openlibrary] Missing openlibrary_id; cannot download", file=sys.stderr)
-            return None
+
+        # Accept direct Archive.org URLs too (details/borrow/download) even when no OL edition id is known.
+        archive_id = str(meta.get("archive_id") or "").strip()
 
         ia_ids = meta.get("ia") or []
         if isinstance(ia_ids, str):
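The widened signature gives `download()` a single side channel for three kinds of events, distinguished by the first argument: `"step"` (phase change, label only), `"bytes"` (direct-download byte counts), and `"pages"` (borrowed-book page counts). One way a caller might consume it (the rendering is up to the UI; this printer is only a sketch):

```python
from typing import Optional

# Sketch of a consumer for the new callback; the kinds come from this diff:
# "step" (phase label), "bytes" (downloaded/total), "pages" (done/total).
def on_progress(kind: str, value: int, total: Optional[int], label: str) -> None:
    if kind == "step":
        print(f"[openlibrary] step: {label}")
    elif kind == "bytes" and total:
        print(f"[openlibrary] {label}: {100 * value // total}%")
    elif kind == "pages":
        print(f"[openlibrary] {label}: {value}/{total}")

# provider.download(result, Path("books"), progress_callback=on_progress)
```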
@@ -921,12 +1032,23 @@
             ia_ids = []
         ia_candidates = [str(x) for x in ia_ids if x]
 
-        archive_id = _resolve_archive_id(self._session, edition_id, ia_candidates)
         if not archive_id:
             archive_id = _first_str(ia_candidates) or ""
 
+        if not archive_id and edition_id:
+            archive_id = _resolve_archive_id(self._session, edition_id, ia_candidates)
+
+        if not archive_id:
+            # Try to extract identifier from the SearchResult path (URL).
+            archive_id = _archive_id_from_url(str(getattr(result, "path", "") or ""))
+
         if not archive_id:
             log("[openlibrary] No archive identifier available; cannot download", file=sys.stderr)
             return None
 
         safe_title = sanitize_filename(result.title)
+        if not safe_title or "http" in safe_title.lower():
+            safe_title = sanitize_filename(archive_id) or "archive"
 
         # 1) Direct download if available.
         try:
@@ -935,8 +1057,22 @@
             can_direct, pdf_url = False, ""
 
         if can_direct and pdf_url:
+            try:
+                if progress_callback is not None:
+                    progress_callback("step", 0, None, "direct download")
+            except Exception:
+                pass
             out_path = unique_path(output_dir / f"{safe_title}.pdf")
-            ok = download_file(pdf_url, out_path, session=self._session)
+            ok = download_file(
+                pdf_url,
+                out_path,
+                session=self._session,
+                progress_callback=(
+                    (lambda downloaded, total, label: progress_callback("bytes", downloaded, total, label))
+                    if progress_callback is not None
+                    else None
+                ),
+            )
             if ok:
                 return out_path
             log("[openlibrary] Direct download failed", file=sys.stderr)
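The conditional expression keeps the callback optional end to end: `download_file` receives an adapter only when the caller supplied one, and `None` otherwise. The same guard in a runnable miniature (`fetch` stands in for `download_file`, whose real signature this diff only implies):

```python
from typing import Callable, Optional

BytesCB = Callable[[int, Optional[int], str], None]

def fetch(url: str, progress_callback: Optional[BytesCB] = None) -> bool:
    # Stand-in for download_file: emit byte updates only if a callback exists.
    if progress_callback is not None:
        progress_callback(512, 1024, url)
        progress_callback(1024, 1024, url)
    return True

ui = lambda kind, done, total, label: print(kind, done, total, label)
fetch("https://example.com/book.pdf",
      progress_callback=(lambda d, t, l: ui("bytes", d, t, l)) if ui is not None else None)
```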
@@ -949,65 +1085,131 @@
             log("[openlibrary] Archive credentials missing; cannot borrow", file=sys.stderr)
             return None
 
-        lendable, reason = _check_lendable(self._session, edition_id)
+        lendable = True
+        reason = ""
+        if edition_id:
+            lendable, reason = _check_lendable(self._session, edition_id)
+            if not lendable:
+                # OpenLibrary API can be a false-negative; fall back to Archive metadata.
+                lendable2, reason2 = self._archive_is_lendable(archive_id)
+                if lendable2:
+                    lendable, reason = True, reason2
+        else:
+            lendable, reason = self._archive_is_lendable(archive_id)
+
         if not lendable:
             log(f"[openlibrary] Not lendable: {reason}", file=sys.stderr)
             return None
 
         session = self._archive_login(email, password)
-        try:
-            session = self._archive_loan(session, archive_id, verbose=False)
-        except self.BookNotAvailableError:
-            log("[openlibrary] Book not available to borrow", file=sys.stderr)
-            return None
-        except Exception:
-            log("[openlibrary] Borrow failed", file=sys.stderr)
-            return None
-
-        urls = [f"https://archive.org/borrow/{archive_id}", f"https://archive.org/details/{archive_id}"]
-        title = safe_title
-        links: Optional[List[str]] = None
-        last_exc: Optional[Exception] = None
-        for u in urls:
-            try:
-                title_raw, links, _metadata = self._archive_get_book_infos(session, u)
-                if title_raw:
-                    title = sanitize_filename(title_raw)
-                break
-            except Exception as exc:
-                last_exc = exc
-                continue
-
-        if not links:
-            log(f"[openlibrary] Failed to extract pages: {last_exc}", file=sys.stderr)
-            return None
-
-        temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=str(output_dir))
-        try:
-            images = self._archive_download(session=session, n_threads=10, directory=temp_dir, links=links, scale=3, book_id=archive_id)
-
-            pdf_bytes = _image_paths_to_pdf_bytes(images)
-            if not pdf_bytes:
-                # Keep images folder for manual conversion.
-                log("[openlibrary] PDF conversion failed; keeping images folder", file=sys.stderr)
-                return Path(temp_dir)
-
-            pdf_path = unique_path(output_dir / f"{title}.pdf")
-            with open(pdf_path, "wb") as f:
-                f.write(pdf_bytes)
-
-            try:
-                shutil.rmtree(temp_dir)
-            except Exception:
-                pass
-            return pdf_path
-
-        except Exception:
-            try:
-                shutil.rmtree(temp_dir)
-            except Exception:
-                pass
-            raise
+        loaned = False
+        try:
+            try:
+                if progress_callback is not None:
+                    progress_callback("step", 0, None, "login")
+            except Exception:
+                pass
+
+            try:
+                session = self._archive_loan(session, archive_id, verbose=False)
+                loaned = True
+            except self.BookNotAvailableError:
+                log("[openlibrary] Book not available to borrow", file=sys.stderr)
+                return None
+            except Exception:
+                log("[openlibrary] Borrow failed", file=sys.stderr)
+                return None
+
+            try:
+                if progress_callback is not None:
+                    progress_callback("step", 0, None, "borrow")
+            except Exception:
+                pass
+
+            urls = [f"https://archive.org/borrow/{archive_id}", f"https://archive.org/details/{archive_id}"]
+            title = safe_title
+            links: Optional[List[str]] = None
+            last_exc: Optional[Exception] = None
+            for u in urls:
+                try:
+                    title_raw, links, _metadata = self._archive_get_book_infos(session, u)
+                    if title_raw:
+                        title = sanitize_filename(title_raw)
+                    break
+                except Exception as exc:
+                    last_exc = exc
+                    continue
+
+            if not links:
+                log(f"[openlibrary] Failed to extract pages: {last_exc}", file=sys.stderr)
+                return None
+
+            try:
+                if progress_callback is not None:
+                    progress_callback("step", 0, None, "download pages")
+            except Exception:
+                pass
+
+            temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=str(output_dir))
+            try:
+                images = self._archive_download(
+                    session=session,
+                    n_threads=10,
+                    directory=temp_dir,
+                    links=links,
+                    scale=3,
+                    book_id=archive_id,
+                    progress_callback=(
+                        (lambda done, total: progress_callback("pages", done, total, "pages"))
+                        if progress_callback is not None
+                        else None
+                    ),
+                )
+
+                pdf_bytes = _image_paths_to_pdf_bytes(images)
+                if not pdf_bytes:
+                    # Keep images folder for manual conversion.
+                    log("[openlibrary] PDF conversion failed; keeping images folder", file=sys.stderr)
+                    return Path(temp_dir)
+
+                try:
+                    if progress_callback is not None:
+                        progress_callback("step", 0, None, "stitch pdf")
+                except Exception:
+                    pass
+
+                pdf_path = unique_path(output_dir / f"{title}.pdf")
+                with open(pdf_path, "wb") as f:
+                    f.write(pdf_bytes)
+
+                try:
+                    shutil.rmtree(temp_dir)
+                except Exception:
+                    pass
+                return pdf_path
+
+            except Exception:
+                try:
+                    shutil.rmtree(temp_dir)
+                except Exception:
+                    pass
+                raise
+            finally:
+                # Always return the loan after a successful borrow, even if download/stitch fails.
+                if loaned:
+                    try:
+                        if progress_callback is not None:
+                            progress_callback("step", 0, None, "return book")
+                    except Exception:
+                        pass
+                    try:
+                        self._archive_return_loan(session, archive_id)
+                    except Exception as exc:
+                        log(f"[openlibrary] Warning: failed to return loan: {exc}", file=sys.stderr)
+                    try:
+                        self._archive_logout(session)
+                    except Exception:
+                        pass
+        except Exception as exc:
+            log(f"[openlibrary] Borrow workflow error: {exc}", file=sys.stderr)
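The restructuring's core guarantee: once the borrow succeeds (`loaned` flips to `True`), the loan is returned in a `finally`, whether the download returns normally or raises, and logout failures never mask the result. Reduced to its skeleton (the guard pattern, not the exact nesting above):

```python
# Skeleton of the borrow/return guarantee added in this commit.
def borrow_and_download(borrow, download, return_loan):
    loaned = False
    try:
        borrow()
        loaned = True
        return download()
    finally:
        if loaned:
            try:
                return_loan()
            except Exception as exc:
                print(f"warning: failed to return loan: {exc}")

print(borrow_and_download(lambda: None, lambda: "book.pdf", lambda: None))
```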