commit 85750247cc
parent e2ffcab030
Author: nose
Date: 2025-12-12 21:55:38 -08:00
78 changed files with 5726 additions and 6239 deletions

View File

@@ -1,5 +1,5 @@
"""Provider plugin modules.
Concrete provider implementations live in this package.
The public entrypoint/registry is Provider.registry.
The public entrypoint/registry is ProviderCore.registry.
"""

View File

@@ -1,84 +0,0 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
@dataclass
class SearchResult:
"""Unified search result format across all search providers."""
table: str # Provider name: "libgen", "soulseek", "bandcamp", "youtube", etc.
title: str # Display title/filename
path: str # Download target (URL, path, magnet, identifier)
detail: str = "" # Additional description
annotations: List[str] = field(default_factory=list) # Tags: ["120MB", "flac", "ready"]
media_kind: str = "other" # Type: "book", "audio", "video", "game", "magnet"
size_bytes: Optional[int] = None
tag: set[str] = field(default_factory=set) # Searchable tag values
columns: List[Tuple[str, str]] = field(default_factory=list) # Display columns
full_metadata: Dict[str, Any] = field(default_factory=dict) # Extra metadata
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for pipeline processing."""
return {
"table": self.table,
"title": self.title,
"path": self.path,
"detail": self.detail,
"annotations": self.annotations,
"media_kind": self.media_kind,
"size_bytes": self.size_bytes,
"tag": list(self.tag),
"columns": list(self.columns),
"full_metadata": self.full_metadata,
}
class SearchProvider(ABC):
"""Base class for search providers."""
def __init__(self, config: Optional[Dict[str, Any]] = None):
self.config = config or {}
self.name = self.__class__.__name__.lower()
@abstractmethod
def search(
self,
query: str,
limit: int = 50,
filters: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> List[SearchResult]:
"""Search for items matching the query."""
def download(self, result: SearchResult, output_dir: Path) -> Optional[Path]:
"""Download an item from a search result."""
return None
def validate(self) -> bool:
"""Check if provider is available and properly configured."""
return True
class FileProvider(ABC):
"""Base class for file upload providers."""
def __init__(self, config: Optional[Dict[str, Any]] = None):
self.config = config or {}
self.name = self.__class__.__name__.lower()
@abstractmethod
def upload(self, file_path: str, **kwargs: Any) -> str:
"""Upload a file and return the URL."""
def validate(self) -> bool:
"""Check if provider is available/configured."""
return True
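
A minimal sketch of a concrete provider built on these base classes, assuming they stay importable as SearchProvider/SearchResult from ProviderCore.base (the new location used by the import changes below); the provider name and the returned result are purely illustrative.

from pathlib import Path
from typing import Any, Dict, List, Optional

from ProviderCore.base import SearchProvider, SearchResult


class ExampleProvider(SearchProvider):
    """Hypothetical provider illustrating the base-class contract."""

    def search(
        self,
        query: str,
        limit: int = 50,
        filters: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[SearchResult]:
        # A real provider would query its backend here and map each hit
        # into a SearchResult; self.name defaults to "exampleprovider".
        return [
            SearchResult(
                table=self.name,
                title=f"Result for {query!r}",
                path="https://example.invalid/item/1",
                media_kind="other",
            )
        ][:limit]

    def download(self, result: SearchResult, output_dir: Path) -> Optional[Path]:
        # Providers without a download path keep the default behaviour.
        return None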

View File

@@ -3,7 +3,7 @@ from __future__ import annotations
import sys
from typing import Any, Dict, List, Optional
from Provider._base import SearchProvider, SearchResult
from ProviderCore.base import SearchProvider, SearchResult
from SYS.logger import log, debug
try:

View File

@@ -1,12 +1,24 @@
from __future__ import annotations
import logging
import re
import requests
import sys
from typing import Any, Dict, List, Optional
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple
from urllib.parse import quote, urljoin, urlparse, unquote
from Provider._base import SearchProvider, SearchResult
from ProviderCore.base import SearchProvider, SearchResult
from SYS.logger import log
# Optional dependencies
try:
from bs4 import BeautifulSoup
except ImportError:
BeautifulSoup = None
class Libgen(SearchProvider):
"""Search provider for Library Genesis books."""
@@ -20,8 +32,7 @@ class Libgen(SearchProvider):
filters = filters or {}
try:
from Provider.unified_book_downloader import UnifiedBookDownloader
from Provider.query_parser import parse_query, get_field, get_free_text
from cli_syntax import get_field, get_free_text, parse_query
parsed = parse_query(query)
isbn = get_field(parsed, "isbn")
@@ -31,8 +42,11 @@ class Libgen(SearchProvider):
search_query = isbn or title or author or free_text or query
downloader = UnifiedBookDownloader(config=self.config)
books = downloader.search_libgen(search_query, limit=limit)
books = search_libgen(
search_query,
limit=limit,
log_error=lambda msg: log(msg, file=sys.stderr),
)
results: List[SearchResult] = []
for idx, book in enumerate(books, 1):
@@ -91,8 +105,455 @@ class Libgen(SearchProvider):
def validate(self) -> bool:
try:
from Provider.unified_book_downloader import UnifiedBookDownloader # noqa: F401
return True
return BeautifulSoup is not None
except Exception:
return False
LogFn = Optional[Callable[[str], None]]
ErrorFn = Optional[Callable[[str], None]]
DEFAULT_TIMEOUT = 20.0
DEFAULT_LIMIT = 50
# Mirrors to try in order
MIRRORS = [
"https://libgen.is",
"https://libgen.rs",
"https://libgen.st",
"http://libgen.is",
"http://libgen.rs",
"http://libgen.st",
"https://libgen.li", # Different structure, fallback
"http://libgen.li",
"https://libgen.gl", # Different structure, fallback
"http://libgen.gl",
]
logging.getLogger(__name__).setLevel(logging.INFO)
def _call(logger: LogFn, message: str) -> None:
if logger:
logger(message)
class LibgenSearch:
"""Robust LibGen searcher."""
def __init__(self, session: Optional[requests.Session] = None):
self.session = session or requests.Session()
self.session.headers.update({
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
})
def search(self, query: str, limit: int = DEFAULT_LIMIT) -> List[Dict[str, Any]]:
"""Search LibGen mirrors."""
if not BeautifulSoup:
logging.error("BeautifulSoup not installed. Cannot search LibGen.")
return []
for mirror in MIRRORS:
try:
if "libgen.li" in mirror or "libgen.gl" in mirror:
results = self._search_libgen_li(mirror, query, limit)
else:
results = self._search_libgen_rs(mirror, query, limit)
if results:
return results
except Exception as e:
logging.debug(f"Mirror {mirror} failed: {e}")
continue
return []
def _search_libgen_rs(self, mirror: str, query: str, limit: int) -> List[Dict[str, Any]]:
"""Search libgen.rs/is/st style mirrors."""
url = f"{mirror}/search.php"
params = {
"req": query,
"res": 100,
"column": "def",
"open": 0,
"view": "simple",
"phrase": 1,
}
resp = self.session.get(url, params=params, timeout=DEFAULT_TIMEOUT)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")
table = soup.find("table", {"class": "c"})
if not table:
tables = soup.find_all("table")
for t in tables:
if len(t.find_all("tr")) > 5:
table = t
break
if not table:
return []
results: List[Dict[str, Any]] = []
rows = table.find_all("tr")[1:]
for row in rows:
cols = row.find_all("td")
if len(cols) < 9:
continue
try:
libgen_id = cols[0].get_text(strip=True)
authors = [a.get_text(strip=True) for a in cols[1].find_all("a")]
if not authors:
authors = [cols[1].get_text(strip=True)]
title_tag = cols[2].find("a")
title = title_tag.get_text(strip=True) if title_tag else cols[2].get_text(strip=True)
md5 = ""
if title_tag and title_tag.has_attr("href"):
href = title_tag["href"]
match = re.search(r"md5=([a-fA-F0-9]{32})", href)
if match:
md5 = match.group(1)
publisher = cols[3].get_text(strip=True)
year = cols[4].get_text(strip=True)
pages = cols[5].get_text(strip=True)
language = cols[6].get_text(strip=True)
size = cols[7].get_text(strip=True)
extension = cols[8].get_text(strip=True)
mirror_links = []
for i in range(9, len(cols)):
a = cols[i].find("a")
if a and a.has_attr("href"):
mirror_links.append(a["href"])
if md5:
download_link = f"http://library.lol/main/{md5}"
elif mirror_links:
download_link = mirror_links[0]
else:
download_link = ""
results.append({
"id": libgen_id,
"title": title,
"author": ", ".join(authors),
"publisher": publisher,
"year": year,
"pages": pages,
"language": language,
"filesize_str": size,
"extension": extension,
"md5": md5,
"mirror_url": download_link,
"cover": "",
})
if len(results) >= limit:
break
except Exception as e:
logging.debug(f"Error parsing row: {e}")
continue
return results
def _search_libgen_li(self, mirror: str, query: str, limit: int) -> List[Dict[str, Any]]:
"""Search libgen.li/gl style mirrors."""
url = f"{mirror}/index.php"
params = {
"req": query,
"res": 100,
"covers": "on",
"filesuns": "all",
}
resp = self.session.get(url, params=params, timeout=DEFAULT_TIMEOUT)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")
table = soup.find("table", {"id": "tablelibgen"})
if not table:
table = soup.find("table", {"class": "table table-striped"})
if not table:
return []
results: List[Dict[str, Any]] = []
rows = table.find_all("tr")[1:]
for row in rows:
cols = row.find_all("td")
if len(cols) < 9:
continue
try:
title_col = cols[1]
title_link = title_col.find("a")
title = title_link.get_text(strip=True) if title_link else title_col.get_text(strip=True)
libgen_id = ""
if title_link and title_link.has_attr("href"):
href = title_link["href"]
match = re.search(r"id=(\d+)", href)
if match:
libgen_id = match.group(1)
authors = cols[2].get_text(strip=True)
publisher = cols[3].get_text(strip=True)
year = cols[4].get_text(strip=True)
language = cols[5].get_text(strip=True)
pages = cols[6].get_text(strip=True)
size = cols[7].get_text(strip=True)
extension = cols[8].get_text(strip=True)
mirror_url = ""
if title_link:
href = title_link["href"]
if href.startswith("/"):
mirror_url = mirror + href
else:
mirror_url = urljoin(mirror, href)
results.append({
"id": libgen_id,
"title": title,
"author": authors,
"publisher": publisher,
"year": year,
"pages": pages,
"language": language,
"filesize_str": size,
"extension": extension,
"md5": "",
"mirror_url": mirror_url,
})
if len(results) >= limit:
break
except Exception:
continue
return results
def search_libgen(
query: str,
limit: int = DEFAULT_LIMIT,
*,
log_info: LogFn = None,
log_error: ErrorFn = None,
session: Optional[requests.Session] = None,
) -> List[Dict[str, Any]]:
"""Search Libgen using the robust scraper."""
searcher = LibgenSearch(session=session)
try:
results = searcher.search(query, limit=limit)
_call(log_info, f"[libgen] Found {len(results)} results")
return results
except Exception as e:
_call(log_error, f"[libgen] Search failed: {e}")
return []
def _resolve_download_url(
session: requests.Session,
url: str,
log_info: LogFn = None,
) -> Optional[str]:
"""Resolve the final download URL by following the LibGen chain."""
current_url = url
visited = set()
for _ in range(6):
if current_url in visited:
break
visited.add(current_url)
_call(log_info, f"[resolve] Checking: {current_url}")
if current_url.lower().endswith((".pdf", ".epub", ".mobi", ".djvu", ".azw3", ".cbz", ".cbr")):
return current_url
try:
with session.get(current_url, stream=True, timeout=30) as resp:
resp.raise_for_status()
ct = resp.headers.get("Content-Type", "").lower()
if "text/html" not in ct:
return current_url
content = resp.text
except Exception as e:
_call(log_info, f"[resolve] Failed to fetch {current_url}: {e}")
return None
soup = BeautifulSoup(content, "html.parser")
get_link = soup.find("a", string=re.compile(r"^GET$", re.IGNORECASE))
if not get_link:
h2_get = soup.find("h2", string=re.compile(r"^GET$", re.IGNORECASE))
if h2_get and h2_get.parent.name == "a":
get_link = h2_get.parent
if get_link and get_link.has_attr("href"):
return urljoin(current_url, get_link["href"])
if "series.php" in current_url:
edition_link = soup.find("a", href=re.compile(r"edition\.php"))
if edition_link:
current_url = urljoin(current_url, edition_link["href"])
continue
if "edition.php" in current_url:
file_link = soup.find("a", href=re.compile(r"file\.php"))
if file_link:
current_url = urljoin(current_url, file_link["href"])
continue
if "file.php" in current_url:
libgen_link = soup.find("a", title="libgen")
if not libgen_link:
libgen_link = soup.find("a", string=re.compile(r"Libgen", re.IGNORECASE))
if libgen_link and libgen_link.has_attr("href"):
current_url = urljoin(current_url, libgen_link["href"])
continue
if "ads.php" in current_url:
get_php_link = soup.find("a", href=re.compile(r"get\.php"))
if get_php_link:
return urljoin(current_url, get_php_link["href"])
for text in ["Cloudflare", "IPFS.io", "Infura"]:
link = soup.find("a", string=re.compile(text, re.IGNORECASE))
if link and link.has_attr("href"):
return urljoin(current_url, link["href"])
break
return None
def _guess_filename_extension(download_url: str, headers: Dict[str, str]) -> Optional[str]:
"""Guess the file extension from headers or the download URL."""
content_disposition = headers.get("content-disposition", "")
if content_disposition:
match = re.search(r"filename\*?=(?:UTF-8\'\'|\"?)([^\";]+)", content_disposition, flags=re.IGNORECASE)
if match:
filename = unquote(match.group(1).strip('"'))
suffix = Path(filename).suffix
if suffix:
return suffix.lstrip(".")
parsed = urlparse(download_url)
suffix = Path(parsed.path).suffix
if suffix:
return suffix.lstrip(".")
content_type = headers.get("content-type", "").lower()
mime_map = {
"application/pdf": "pdf",
"application/epub+zip": "epub",
"application/x-mobipocket-ebook": "mobi",
"application/x-cbr": "cbr",
"application/x-cbz": "cbz",
"application/zip": "zip",
}
for mime, ext in mime_map.items():
if mime in content_type:
return ext
return None
def _apply_extension(path: Path, extension: Optional[str]) -> Path:
"""Rename the path to match the detected extension, if needed."""
if not extension:
return path
suffix = extension if extension.startswith(".") else f".{extension}"
if path.suffix.lower() == suffix.lower():
return path
candidate = path.with_suffix(suffix)
base_stem = path.stem
counter = 1
while candidate.exists() and counter < 100:
candidate = path.with_name(f"{base_stem}({counter}){suffix}")
counter += 1
try:
path.replace(candidate)
return candidate
except Exception:
return path
def download_from_mirror(
mirror_url: str,
output_path: Path,
*,
log_info: LogFn = None,
log_error: ErrorFn = None,
session: Optional[requests.Session] = None,
progress_callback: Optional[Callable[[int, int], None]] = None,
) -> Tuple[bool, Optional[Path]]:
"""Download file from a LibGen mirror URL with optional progress tracking."""
session = session or requests.Session()
output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
try:
_call(log_info, f"[download] Resolving download link from: {mirror_url}")
download_url = _resolve_download_url(session, mirror_url, log_info)
if not download_url:
_call(log_error, "[download] Could not find direct download link")
return False, None
_call(log_info, f"[download] Downloading from: {download_url}")
downloaded = 0
total_size = 0
headers: Dict[str, str] = {}
with session.get(download_url, stream=True, timeout=60) as r:
r.raise_for_status()
headers = dict(r.headers)
ct = headers.get("content-type", "").lower()
if "text/html" in ct:
_call(log_error, "[download] Final URL returned HTML, not a file.")
return False, None
total_size = int(headers.get("content-length", 0) or 0)
with open(output_path, "wb") as f:
for chunk in r.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
downloaded += len(chunk)
if progress_callback:
progress_callback(downloaded, total_size)
final_extension = _guess_filename_extension(download_url, headers)
final_path = _apply_extension(output_path, final_extension)
if progress_callback and total_size > 0:
progress_callback(downloaded, total_size)
_call(log_info, f"[download] Saved to {final_path}")
return True, final_path
except Exception as e:
_call(log_error, f"[download] Download failed: {e}")
return False, None
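
A short usage sketch of the module-level helpers added above, assuming they are imported from Provider/libgen.py; the query, output directory and progress hook are placeholders.

from pathlib import Path

from Provider.libgen import download_from_mirror, search_libgen

results = search_libgen("Morals and Dogma", limit=5, log_info=print, log_error=print)
if results:
    best = results[0]
    ext = best.get("extension") or "pdf"
    target = Path("downloads") / f"{best['title'][:80]}.{ext}"
    ok, final_path = download_from_mirror(
        best["mirror_url"],
        target,
        log_info=print,
        log_error=print,
        progress_callback=lambda done, total: None,  # hook up a progress bar here
    )
    print("success:", ok, "saved to:", final_path)

Note that download_from_mirror resolves the mirror page through the series/edition/file/ads chain before streaming the file, so the returned path may carry a different extension than the one requested.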

View File

@@ -1,523 +0,0 @@
"""Shared Library Genesis search and download helpers.
Replaces the old libgen backend with a robust scraper based on libgen-api-enhanced logic.
Targets libgen.is/rs/st mirrors and parses the results table directly.
"""
from __future__ import annotations
import logging
import re
import requests
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple
from urllib.parse import quote, urljoin, urlparse, unquote
# Optional dependencies
try:
from bs4 import BeautifulSoup
except ImportError:
BeautifulSoup = None
LogFn = Optional[Callable[[str], None]]
ErrorFn = Optional[Callable[[str], None]]
DEFAULT_TIMEOUT = 20.0
DEFAULT_LIMIT = 50
# Mirrors to try in order
MIRRORS = [
"https://libgen.is",
"https://libgen.rs",
"https://libgen.st",
"http://libgen.is",
"http://libgen.rs",
"http://libgen.st",
"https://libgen.li", # Different structure, fallback
"http://libgen.li",
"https://libgen.gl", # Different structure, fallback
"http://libgen.gl",
]
logging.getLogger(__name__).setLevel(logging.INFO)
def _call(logger: LogFn, message: str) -> None:
if logger:
logger(message)
class LibgenSearch:
"""Robust LibGen searcher."""
def __init__(self, session: Optional[requests.Session] = None):
self.session = session or requests.Session()
self.session.headers.update({
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
})
def search(self, query: str, limit: int = DEFAULT_LIMIT) -> List[Dict[str, Any]]:
"""Search LibGen mirrors."""
if not BeautifulSoup:
logging.error("BeautifulSoup not installed. Cannot search LibGen.")
return []
for mirror in MIRRORS:
try:
if "libgen.li" in mirror or "libgen.gl" in mirror:
results = self._search_libgen_li(mirror, query, limit)
else:
results = self._search_libgen_rs(mirror, query, limit)
if results:
return results
except Exception as e:
logging.debug(f"Mirror {mirror} failed: {e}")
continue
return []
def _search_libgen_rs(self, mirror: str, query: str, limit: int) -> List[Dict[str, Any]]:
"""Search libgen.rs/is/st style mirrors."""
# Search URL: /search.php?req=QUERY&res=100&column=def
url = f"{mirror}/search.php"
params = {
"req": query,
"res": 100, # Request more to filter later
"column": "def",
"open": 0,
"view": "simple",
"phrase": 1,
}
resp = self.session.get(url, params=params, timeout=DEFAULT_TIMEOUT)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")
# Find the table with results; usually class 'c'
table = soup.find("table", {"class": "c"})
if not table:
# Try finding by structure (table with many rows)
tables = soup.find_all("table")
for t in tables:
if len(t.find_all("tr")) > 5:
table = t
break
if not table:
return []
results = []
# Skip header row
rows = table.find_all("tr")[1:]
for row in rows:
cols = row.find_all("td")
if len(cols) < 9:
continue
# Columns:
# 0: ID
# 1: Author(s)
# 2: Title
# 3: Publisher
# 4: Year
# 5: Pages
# 6: Language
# 7: Size
# 8: Extension
# 9+: Mirrors
try:
libgen_id = cols[0].get_text(strip=True)
authors = [a.get_text(strip=True) for a in cols[1].find_all("a")]
if not authors:
authors = [cols[1].get_text(strip=True)]
title_tag = cols[2].find("a")
title = title_tag.get_text(strip=True) if title_tag else cols[2].get_text(strip=True)
# Extract MD5 from title link if possible (often in href)
# href='book/index.php?md5=...'
md5 = ""
if title_tag and title_tag.has_attr("href"):
href = title_tag["href"]
match = re.search(r"md5=([a-fA-F0-9]{32})", href)
if match:
md5 = match.group(1)
publisher = cols[3].get_text(strip=True)
year = cols[4].get_text(strip=True)
pages = cols[5].get_text(strip=True)
language = cols[6].get_text(strip=True)
size = cols[7].get_text(strip=True)
extension = cols[8].get_text(strip=True)
# Mirrors
# Usually col 9 is http://library.lol/main/MD5
mirror_links = []
for i in range(9, len(cols)):
a = cols[i].find("a")
if a and a.has_attr("href"):
mirror_links.append(a["href"])
# Construct direct download page link (library.lol)
# If we have MD5, we can guess it: http://library.lol/main/{md5}
if md5:
download_link = f"http://library.lol/main/{md5}"
elif mirror_links:
download_link = mirror_links[0]
else:
download_link = ""
results.append({
"id": libgen_id,
"title": title,
"author": ", ".join(authors),
"publisher": publisher,
"year": year,
"pages": pages,
"language": language,
"filesize_str": size,
"extension": extension,
"md5": md5,
"mirror_url": download_link,
"cover": "", # Could extract from hover if needed
})
if len(results) >= limit:
break
except Exception as e:
logging.debug(f"Error parsing row: {e}")
continue
return results
def _search_libgen_li(self, mirror: str, query: str, limit: int) -> List[Dict[str, Any]]:
"""Search libgen.li/gl style mirrors."""
# Search URL: /index.php?req=QUERY&columns[]=t&columns[]=a...
url = f"{mirror}/index.php"
params = {
"req": query,
"res": 100,
"covers": "on",
"filesuns": "all",
}
resp = self.session.get(url, params=params, timeout=DEFAULT_TIMEOUT)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")
table = soup.find("table", {"id": "tablelibgen"})
if not table:
table = soup.find("table", {"class": "table table-striped"})
if not table:
return []
results = []
rows = table.find_all("tr")[1:]
for row in rows:
cols = row.find_all("td")
if len(cols) < 9:
continue
try:
# Structure is different
# 0: Cover
# 1: Title (with link to file.php?id=...)
# 2: Author
# 3: Publisher
# 4: Year
# 5: Language
# 6: Pages
# 7: Size
# 8: Extension
# 9: Mirrors
title_col = cols[1]
title_link = title_col.find("a")
title = title_link.get_text(strip=True) if title_link else title_col.get_text(strip=True)
# Extract ID from link
libgen_id = ""
if title_link and title_link.has_attr("href"):
href = title_link["href"]
# href is usually "file.php?id=..." or "edition.php?id=..."
match = re.search(r"id=(\d+)", href)
if match:
libgen_id = match.group(1)
authors = cols[2].get_text(strip=True)
publisher = cols[3].get_text(strip=True)
year = cols[4].get_text(strip=True)
language = cols[5].get_text(strip=True)
pages = cols[6].get_text(strip=True)
size = cols[7].get_text(strip=True)
extension = cols[8].get_text(strip=True)
# Mirror link
# Usually in col 9 or title link
mirror_url = ""
if title_link:
href = title_link["href"]
if href.startswith("/"):
mirror_url = mirror + href
else:
mirror_url = urljoin(mirror, href)
results.append({
"id": libgen_id,
"title": title,
"author": authors,
"publisher": publisher,
"year": year,
"pages": pages,
"language": language,
"filesize_str": size,
"extension": extension,
"md5": "", # .li doesn't show MD5 easily in table
"mirror_url": mirror_url,
})
if len(results) >= limit:
break
except Exception:
continue
return results
def search_libgen(
query: str,
limit: int = DEFAULT_LIMIT,
*,
log_info: LogFn = None,
log_error: ErrorFn = None,
session: Optional[requests.Session] = None,
) -> List[Dict[str, Any]]:
"""Search Libgen using the robust scraper."""
searcher = LibgenSearch(session=session)
try:
results = searcher.search(query, limit=limit)
_call(log_info, f"[libgen] Found {len(results)} results")
return results
except Exception as e:
_call(log_error, f"[libgen] Search failed: {e}")
return []
def _resolve_download_url(
session: requests.Session,
url: str,
log_info: LogFn = None
) -> Optional[str]:
"""Resolve the final download URL by following the LibGen chain."""
current_url = url
visited = set()
# Max hops to prevent infinite loops
for _ in range(6):
if current_url in visited:
break
visited.add(current_url)
_call(log_info, f"[resolve] Checking: {current_url}")
# Simple heuristic: if it looks like a file, return it
if current_url.lower().endswith(('.pdf', '.epub', '.mobi', '.djvu', '.azw3', '.cbz', '.cbr')):
return current_url
try:
# Use HEAD first to check content type if possible, but some mirrors block HEAD or return 405
# So we'll just GET with stream=True to peek headers/content without downloading everything
with session.get(current_url, stream=True, timeout=30) as resp:
resp.raise_for_status()
ct = resp.headers.get("Content-Type", "").lower()
if "text/html" not in ct:
# It's a binary file
return current_url
# It's HTML, read content
content = resp.text
except Exception as e:
_call(log_info, f"[resolve] Failed to fetch {current_url}: {e}")
return None
soup = BeautifulSoup(content, "html.parser")
# 1. Check for "GET" link (library.lol / ads.php style)
# Usually <h2>GET</h2> inside <a> or just text "GET"
get_link = soup.find("a", string=re.compile(r"^GET$", re.IGNORECASE))
if not get_link:
# Try finding <a> containing <h2>GET</h2>
h2_get = soup.find("h2", string=re.compile(r"^GET$", re.IGNORECASE))
if h2_get and h2_get.parent.name == "a":
get_link = h2_get.parent
if get_link and get_link.has_attr("href"):
return urljoin(current_url, get_link["href"])
# 2. Check for "series.php" -> "edition.php"
if "series.php" in current_url:
# Find first edition link
edition_link = soup.find("a", href=re.compile(r"edition\.php"))
if edition_link:
current_url = urljoin(current_url, edition_link["href"])
continue
# 3. Check for "edition.php" -> "file.php"
if "edition.php" in current_url:
file_link = soup.find("a", href=re.compile(r"file\.php"))
if file_link:
current_url = urljoin(current_url, file_link["href"])
continue
# 4. Check for "file.php" -> "ads.php" (Libgen badge)
if "file.php" in current_url:
# Look for link with title="libgen" or text "Libgen"
libgen_link = soup.find("a", title="libgen")
if not libgen_link:
libgen_link = soup.find("a", string=re.compile(r"Libgen", re.IGNORECASE))
if libgen_link and libgen_link.has_attr("href"):
current_url = urljoin(current_url, libgen_link["href"])
continue
# 5. Check for "ads.php" -> "get.php" (Fallback if GET link logic above failed)
if "ads.php" in current_url:
get_php_link = soup.find("a", href=re.compile(r"get\.php"))
if get_php_link:
return urljoin(current_url, get_php_link["href"])
# 6. Library.lol / generic fallback
for text in ["Cloudflare", "IPFS.io", "Infura"]:
link = soup.find("a", string=re.compile(text, re.IGNORECASE))
if link and link.has_attr("href"):
return urljoin(current_url, link["href"])
# If we found nothing new, stop
break
return None
def _guess_filename_extension(download_url: str, headers: Dict[str, str]) -> Optional[str]:
"""Guess the file extension from headers or the download URL."""
content_disposition = headers.get("content-disposition", "")
if content_disposition:
match = re.search(r'filename\*?=(?:UTF-8\'\'|"?)([^";]+)', content_disposition, flags=re.IGNORECASE)
if match:
filename = unquote(match.group(1).strip('"'))
suffix = Path(filename).suffix
if suffix:
return suffix.lstrip('.')
parsed = urlparse(download_url)
suffix = Path(parsed.path).suffix
if suffix:
return suffix.lstrip('.')
content_type = headers.get('content-type', '').lower()
mime_map = {
'application/pdf': 'pdf',
'application/epub+zip': 'epub',
'application/x-mobipocket-ebook': 'mobi',
'application/x-cbr': 'cbr',
'application/x-cbz': 'cbz',
'application/zip': 'zip',
}
for mime, ext in mime_map.items():
if mime in content_type:
return ext
return None
def _apply_extension(path: Path, extension: Optional[str]) -> Path:
"""Rename the path to match the detected extension, if needed."""
if not extension:
return path
suffix = extension if extension.startswith('.') else f'.{extension}'
if path.suffix.lower() == suffix.lower():
return path
candidate = path.with_suffix(suffix)
base_stem = path.stem
counter = 1
while candidate.exists() and counter < 100:
candidate = path.with_name(f"{base_stem}({counter}){suffix}")
counter += 1
try:
path.replace(candidate)
return candidate
except Exception:
return path
def download_from_mirror(
mirror_url: str,
output_path: Path,
*,
log_info: LogFn = None,
log_error: ErrorFn = None,
session: Optional[requests.Session] = None,
progress_callback: Optional[Callable[[int, int], None]] = None,
) -> Tuple[bool, Optional[Path]]:
"""Download file from a LibGen mirror URL with optional progress tracking."""
session = session or requests.Session()
output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
try:
_call(log_info, f"[download] Resolving download link from: {mirror_url}")
download_url = _resolve_download_url(session, mirror_url, log_info)
if not download_url:
_call(log_error, "[download] Could not find direct download link")
return False, None
_call(log_info, f"[download] Downloading from: {download_url}")
downloaded = 0
total_size = 0
headers: Dict[str, str] = {}
with session.get(download_url, stream=True, timeout=60) as r:
r.raise_for_status()
headers = dict(r.headers)
# Verify it's not HTML (error page)
ct = headers.get("content-type", "").lower()
if "text/html" in ct:
_call(log_error, "[download] Final URL returned HTML, not a file.")
return False, None
total_size = int(headers.get("content-length", 0) or 0)
with open(output_path, "wb") as f:
for chunk in r.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
downloaded += len(chunk)
if progress_callback:
progress_callback(downloaded, total_size)
final_extension = _guess_filename_extension(download_url, headers)
final_path = _apply_extension(output_path, final_extension)
if progress_callback and total_size > 0:
progress_callback(downloaded, total_size)
_call(log_info, f"[download] Saved to {final_path}")
return True, final_path
except Exception as e:
_call(log_error, f"[download] Download failed: {e}")
return False, None

View File

@@ -6,7 +6,7 @@ from typing import Any
import requests
from Provider._base import FileProvider
from ProviderCore.base import FileProvider
class Matrix(FileProvider):

Provider/openlibrary.py (new file, 358 lines)
View File

@@ -0,0 +1,358 @@
from __future__ import annotations
import shutil
import sys
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import requests
from ProviderCore.base import SearchProvider, SearchResult
from ProviderCore.download import download_file, sanitize_filename
from cli_syntax import get_field, get_free_text, parse_query
from SYS.logger import log
from SYS.utils import unique_path
def _looks_like_isbn(text: str) -> bool:
t = (text or "").replace("-", "").strip()
return t.isdigit() and len(t) in (10, 13)
def _first_str(value: Any) -> Optional[str]:
if isinstance(value, str):
v = value.strip()
return v if v else None
if isinstance(value, list) and value:
first = value[0]
if isinstance(first, str):
v = first.strip()
return v if v else None
return str(first) if first is not None else None
return None
def _resolve_edition_id(doc: Dict[str, Any]) -> str:
# OpenLibrary Search API typically provides edition_key: ["OL...M", ...]
edition_key = doc.get("edition_key")
if isinstance(edition_key, list) and edition_key:
return str(edition_key[0]).strip()
# Fallback: sometimes key can be /books/OL...M
key = doc.get("key")
if isinstance(key, str) and key.startswith("/books/"):
return key.split("/books/", 1)[1].strip("/")
return ""
def _check_lendable(session: requests.Session, edition_id: str) -> Tuple[bool, str]:
"""Return (lendable, status_text) using OpenLibrary volumes API."""
try:
if not edition_id or not edition_id.startswith("OL") or not edition_id.endswith("M"):
return False, "not-an-edition"
url = f"https://openlibrary.org/api/volumes/brief/json/OLID:{edition_id}"
resp = session.get(url, timeout=10)
resp.raise_for_status()
data = resp.json() or {}
wrapped = data.get(f"OLID:{edition_id}")
if not isinstance(wrapped, dict):
return False, "no-availability"
items = wrapped.get("items")
if not isinstance(items, list) or not items:
return False, "no-items"
first = items[0]
status_val = ""
if isinstance(first, dict):
status_val = str(first.get("status", ""))
else:
status_val = str(first)
return ("lendable" in status_val.lower()), status_val
except requests.exceptions.Timeout:
return False, "api-timeout"
except Exception:
return False, "api-error"
def _resolve_archive_id(session: requests.Session, edition_id: str, ia_candidates: List[str]) -> str:
# Prefer IA identifiers already present in search results.
if ia_candidates:
first = ia_candidates[0].strip()
if first:
return first
# Otherwise query the edition JSON.
try:
resp = session.get(f"https://openlibrary.org/books/{edition_id}.json", timeout=10)
resp.raise_for_status()
data = resp.json() or {}
ocaid = data.get("ocaid")
if isinstance(ocaid, str) and ocaid.strip():
return ocaid.strip()
identifiers = data.get("identifiers")
if isinstance(identifiers, dict):
ia = identifiers.get("internet_archive")
ia_id = _first_str(ia)
if ia_id:
return ia_id
except Exception:
pass
return ""
class OpenLibrary(SearchProvider):
"""Search provider for OpenLibrary books + Archive.org direct/borrow download."""
def __init__(self, config: Optional[Dict[str, Any]] = None):
super().__init__(config)
self._session = requests.Session()
def search(
self,
query: str,
limit: int = 50,
filters: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> List[SearchResult]:
filters = filters or {}
parsed = parse_query(query)
isbn = get_field(parsed, "isbn")
author = get_field(parsed, "author")
title = get_field(parsed, "title")
free_text = get_free_text(parsed)
q = (isbn or title or author or free_text or query or "").strip()
if not q:
return []
if _looks_like_isbn(q):
q = f"isbn:{q.replace('-', '')}"
try:
resp = self._session.get(
"https://openlibrary.org/search.json",
params={"q": q, "limit": int(limit)},
timeout=10,
)
resp.raise_for_status()
data = resp.json() or {}
except Exception as exc:
log(f"[openlibrary] Search failed: {exc}", file=sys.stderr)
return []
results: List[SearchResult] = []
docs = data.get("docs") or []
if not isinstance(docs, list):
return []
for doc in docs[: int(limit)]:
if not isinstance(doc, dict):
continue
book_title = str(doc.get("title") or "").strip() or "Unknown"
authors = doc.get("author_name") or []
if isinstance(authors, str):
authors = [authors]
if not isinstance(authors, list):
authors = []
authors_list = [str(a) for a in authors if a]
year_val = doc.get("first_publish_year")
year = str(year_val) if year_val is not None else ""
edition_id = _resolve_edition_id(doc)
ia_val = doc.get("ia") or []
if isinstance(ia_val, str):
ia_val = [ia_val]
if not isinstance(ia_val, list):
ia_val = []
ia_ids = [str(x) for x in ia_val if x]
isbn_list = doc.get("isbn") or []
if isinstance(isbn_list, str):
isbn_list = [isbn_list]
if not isinstance(isbn_list, list):
isbn_list = []
isbn_13 = next((str(i) for i in isbn_list if len(str(i)) == 13), "")
isbn_10 = next((str(i) for i in isbn_list if len(str(i)) == 10), "")
columns = [
("Title", book_title),
("Author", ", ".join(authors_list)),
("Year", year),
("OLID", edition_id),
]
annotations: List[str] = []
if isbn_13:
annotations.append(f"isbn_13:{isbn_13}")
elif isbn_10:
annotations.append(f"isbn_10:{isbn_10}")
if ia_ids:
annotations.append("archive")
results.append(
SearchResult(
table="openlibrary",
title=book_title,
path=(f"https://openlibrary.org/books/{edition_id}" if edition_id else "https://openlibrary.org"),
detail=(
(f"By: {', '.join(authors_list)}" if authors_list else "")
+ (f" ({year})" if year else "")
).strip(),
annotations=annotations,
media_kind="book",
columns=columns,
full_metadata={
"openlibrary_id": edition_id,
"authors": authors_list,
"year": year,
"isbn_10": isbn_10,
"isbn_13": isbn_13,
"ia": ia_ids,
"raw": doc,
},
)
)
return results
def download(self, result: SearchResult, output_dir: Path) -> Optional[Path]:
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
meta = result.full_metadata or {}
edition_id = str(meta.get("openlibrary_id") or "").strip()
if not edition_id:
log("[openlibrary] Missing openlibrary_id; cannot download", file=sys.stderr)
return None
ia_ids = meta.get("ia") or []
if isinstance(ia_ids, str):
ia_ids = [ia_ids]
if not isinstance(ia_ids, list):
ia_ids = []
ia_candidates = [str(x) for x in ia_ids if x]
archive_id = _resolve_archive_id(self._session, edition_id, ia_candidates)
if not archive_id:
log("[openlibrary] No archive identifier available; cannot download", file=sys.stderr)
return None
safe_title = sanitize_filename(result.title)
# 1) Direct download if available.
try:
from API.archive_client import check_direct_download
can_direct, pdf_url = check_direct_download(archive_id)
except Exception:
can_direct, pdf_url = False, ""
if can_direct and pdf_url:
out_path = unique_path(output_dir / f"{safe_title}.pdf")
ok = download_file(pdf_url, out_path, session=self._session)
if ok:
return out_path
log("[openlibrary] Direct download failed", file=sys.stderr)
return None
# 2) Borrow flow (credentials required).
try:
from API.archive_client import BookNotAvailableError, credential_openlibrary, download as archive_download
from API.archive_client import get_book_infos, loan, login
email, password = credential_openlibrary(self.config or {})
if not email or not password:
log("[openlibrary] Archive credentials missing; cannot borrow", file=sys.stderr)
return None
lendable, reason = _check_lendable(self._session, edition_id)
if not lendable:
log(f"[openlibrary] Not lendable: {reason}", file=sys.stderr)
return None
session = login(email, password)
try:
session = loan(session, archive_id, verbose=False)
except BookNotAvailableError:
log("[openlibrary] Book not available to borrow", file=sys.stderr)
return None
except SystemExit:
log("[openlibrary] Borrow failed", file=sys.stderr)
return None
urls = [f"https://archive.org/borrow/{archive_id}", f"https://archive.org/details/{archive_id}"]
title = safe_title
links: Optional[List[str]] = None
last_exc: Optional[Exception] = None
for u in urls:
try:
title_raw, links, _metadata = get_book_infos(session, u)
if title_raw:
title = sanitize_filename(title_raw)
break
except Exception as exc:
last_exc = exc
continue
if not links:
log(f"[openlibrary] Failed to extract pages: {last_exc}", file=sys.stderr)
return None
temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=str(output_dir))
try:
images = archive_download(session=session, n_threads=10, directory=temp_dir, links=links, scale=3, book_id=archive_id)
try:
import img2pdf # type: ignore
pdf_bytes = img2pdf.convert(images) if images else None
if not pdf_bytes:
log("[openlibrary] PDF conversion failed", file=sys.stderr)
try:
shutil.rmtree(temp_dir)
except Exception:
pass
return None
pdf_path = unique_path(output_dir / f"{title}.pdf")
with open(pdf_path, "wb") as f:
f.write(pdf_bytes)
try:
shutil.rmtree(temp_dir)
except Exception:
pass
return pdf_path
except ImportError:
# Keep images folder.
return Path(temp_dir)
except Exception:
try:
shutil.rmtree(temp_dir)
except Exception:
pass
raise
except Exception as exc:
log(f"[openlibrary] Borrow workflow error: {exc}", file=sys.stderr)
return None
def validate(self) -> bool:
return True
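
A minimal sketch of driving the new provider end to end, assuming the module lands at Provider/openlibrary.py as the file header indicates; the query and output directory are illustrative, and the borrow path only runs when Archive.org credentials are present in the config.

from pathlib import Path

from Provider.openlibrary import OpenLibrary

provider = OpenLibrary(config={})  # config may carry Archive.org credentials for borrowing
hits = provider.search('title:"Morals and Dogma" author:"Albert Pike"', limit=10)
for hit in hits:
    print(hit.title, hit.full_metadata.get("isbn_13"), hit.annotations)
if hits:
    saved = provider.download(hits[0], Path("downloads"))
    print("saved to:", saved)  # None when neither direct download nor borrow succeeds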

View File

@@ -1,159 +0,0 @@
"""Dynamic query parser for filtering and field extraction.
Supports query syntax like:
- isbn:0557677203
- author:"Albert Pike"
- title:"Morals and Dogma"
- year:2010
- isbn:0557677203 author:"Albert Pike"
- Mixed with free text: "Morals" isbn:0557677203
This allows flexible query strings that can be parsed by any search provider
to extract specific fields for filtering and searching.
"""
from typing import Dict, List, Tuple, Optional, Any
import re
def parse_query(query: str) -> Dict[str, Any]:
"""Parse a query string into field:value pairs and free text.
Args:
query: Query string like 'isbn:0557677203 author:"Albert Pike" Morals'
Returns:
Dictionary with:
- 'fields': Dict[field_name, field_value] for structured fields
- 'text': str with remaining free text
- 'raw': str original query
"""
result = {
'fields': {},
'text': '',
'raw': query,
}
if not query or not query.strip():
return result
query = query.strip()
remaining_parts = []
# Pattern to match: field:value or field:"quoted value"
# Matches: word: followed by either quoted string or unquoted word
pattern = r'(\w+):(?:"([^"]*)"|(\S+))'
pos = 0
for match in re.finditer(pattern, query):
# Add any text before this match
if match.start() > pos:
before_text = query[pos:match.start()].strip()
if before_text:
remaining_parts.append(before_text)
field_name = match.group(1).lower()
field_value = match.group(2) if match.group(2) is not None else match.group(3)
result['fields'][field_name] = field_value
pos = match.end()
# Add any remaining text after last match
if pos < len(query):
remaining_text = query[pos:].strip()
if remaining_text:
remaining_parts.append(remaining_text)
result['text'] = ' '.join(remaining_parts)
return result
def get_field(parsed_query: Dict[str, Any], field_name: str, default: Optional[str] = None) -> Optional[str]:
"""Get a field value from parsed query, with optional default.
Args:
parsed_query: Result from parse_query()
field_name: Field name to look up (case-insensitive)
default: Default value if field not found
Returns:
Field value or default
"""
return parsed_query.get('fields', {}).get(field_name.lower(), default)
def has_field(parsed_query: Dict[str, Any], field_name: str) -> bool:
"""Check if a field exists in parsed query.
Args:
parsed_query: Result from parse_query()
field_name: Field name to check (case-insensitive)
Returns:
True if field exists
"""
return field_name.lower() in parsed_query.get('fields', {})
def get_free_text(parsed_query: Dict[str, Any]) -> str:
"""Get the free text portion of a parsed query.
Args:
parsed_query: Result from parse_query()
Returns:
Free text or empty string
"""
return parsed_query.get('text', '')
def build_query_for_provider(
parsed_query: Dict[str, Any],
provider: str,
extraction_map: Optional[Dict[str, str]] = None
) -> Tuple[str, Dict[str, str]]:
"""Build a search query and filters dict for a specific provider.
Different providers have different search syntax. This function
extracts the appropriate fields for each provider.
Args:
parsed_query: Result from parse_query()
provider: Provider name ('libgen', 'openlibrary', 'soulseek')
extraction_map: Optional mapping of field names to provider-specific names
e.g. {'isbn': 'isbn', 'author': 'author', 'title': 'title'}
Returns:
Tuple of (search_query: str, extracted_fields: Dict[field, value])
"""
extraction_map = extraction_map or {}
extracted = {}
free_text = get_free_text(parsed_query)
# Extract fields based on map
for field_name, provider_key in extraction_map.items():
if has_field(parsed_query, field_name):
extracted[provider_key] = get_field(parsed_query, field_name)
# If provider-specific extraction is needed, providers can implement it
# For now, return the free text as query
return free_text, extracted
if __name__ == '__main__':
# Test cases
test_queries = [
'isbn:0557677203',
'isbn:0557677203 author:"Albert Pike"',
'Morals and Dogma isbn:0557677203',
'title:"Morals and Dogma" author:"Albert Pike" year:2010',
'search term without fields',
'author:"John Smith" title:"A Book"',
]
for query in test_queries:
print(f"\nQuery: {query}")
parsed = parse_query(query)
print(f" Fields: {parsed['fields']}")
print(f" Text: {parsed['text']}")

View File

@@ -1,110 +0,0 @@
"""Provider registry.
Concrete provider implementations live in the `Provider/` package.
This module is the single source of truth for provider discovery.
"""
from __future__ import annotations
from typing import Any, Dict, Optional, Type
import sys
from SYS.logger import log
from Provider._base import FileProvider, SearchProvider, SearchResult
from Provider.bandcamp import Bandcamp
from Provider.libgen import Libgen
from Provider.matrix import Matrix
from Provider.soulseek import Soulseek, download_soulseek_file
from Provider.youtube import YouTube
from Provider.zeroxzero import ZeroXZero
_SEARCH_PROVIDERS: Dict[str, Type[SearchProvider]] = {
"libgen": Libgen,
"soulseek": Soulseek,
"bandcamp": Bandcamp,
"youtube": YouTube,
}
def get_search_provider(name: str, config: Optional[Dict[str, Any]] = None) -> Optional[SearchProvider]:
"""Get a search provider by name."""
provider_class = _SEARCH_PROVIDERS.get((name or "").lower())
if provider_class is None:
log(f"[provider] Unknown search provider: {name}", file=sys.stderr)
return None
try:
provider = provider_class(config)
if not provider.validate():
log(f"[provider] Provider '{name}' is not available", file=sys.stderr)
return None
return provider
except Exception as exc:
log(f"[provider] Error initializing '{name}': {exc}", file=sys.stderr)
return None
def list_search_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str, bool]:
"""List all search providers and their availability."""
availability: Dict[str, bool] = {}
for name, provider_class in _SEARCH_PROVIDERS.items():
try:
provider = provider_class(config)
availability[name] = provider.validate()
except Exception:
availability[name] = False
return availability
_FILE_PROVIDERS: Dict[str, Type[FileProvider]] = {
"0x0": ZeroXZero,
"matrix": Matrix,
}
def get_file_provider(name: str, config: Optional[Dict[str, Any]] = None) -> Optional[FileProvider]:
"""Get a file provider by name."""
provider_class = _FILE_PROVIDERS.get((name or "").lower())
if provider_class is None:
log(f"[provider] Unknown file provider: {name}", file=sys.stderr)
return None
try:
provider = provider_class(config)
if not provider.validate():
log(f"[provider] File provider '{name}' is not available", file=sys.stderr)
return None
return provider
except Exception as exc:
log(f"[provider] Error initializing file provider '{name}': {exc}", file=sys.stderr)
return None
def list_file_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str, bool]:
"""List all file providers and their availability."""
availability: Dict[str, bool] = {}
for name, provider_class in _FILE_PROVIDERS.items():
try:
provider = provider_class(config)
availability[name] = provider.validate()
except Exception:
availability[name] = False
return availability
__all__ = [
"SearchResult",
"SearchProvider",
"FileProvider",
"get_search_provider",
"list_search_providers",
"get_file_provider",
"list_file_providers",
"download_soulseek_file",
]
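
Per the package docstring change at the top of this commit, the registry removed here is expected to live on as ProviderCore.registry. A hedged sketch of the lookup pattern, with the new import path assumed rather than shown in this diff:

from ProviderCore.registry import get_search_provider, list_search_providers  # assumed new location

print(list_search_providers())  # e.g. {"libgen": True, "soulseek": False, "bandcamp": True, "youtube": True}
provider = get_search_provider("libgen", config={})
if provider is not None:
    for res in provider.search("python", limit=5):
        print(res.table, res.title)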

View File

@@ -11,7 +11,7 @@ import time
from pathlib import Path
from typing import Any, Dict, List, Optional
from Provider._base import SearchProvider, SearchResult
from ProviderCore.base import SearchProvider, SearchResult
from SYS.logger import log, debug

View File

@@ -1,707 +0,0 @@
"""Unified book downloader - handles Archive.org borrowing and Libgen fallback.
This module provides a single interface for downloading books from multiple sources:
1. Try Archive.org direct download (if available)
2. Try Archive.org borrowing (if user has credentials)
3. Fallback to Libgen search by ISBN
4. Attempt Libgen download
All sources are integrated with proper metadata scraping and error handling.
"""
import logging
import asyncio
import requests
from typing import Optional, Dict, Any, Tuple, List, Callable, cast
from pathlib import Path
from SYS.logger import debug
logger = logging.getLogger(__name__)
class UnifiedBookDownloader:
"""Unified interface for downloading books from multiple sources."""
def __init__(self, config: Optional[Dict[str, Any]] = None, output_dir: Optional[str] = None):
"""Initialize the unified book downloader.
Args:
config: Configuration dict with credentials
output_dir: Default output directory
"""
self.config = config or {}
self.output_dir = output_dir
self.session = requests.Session()
# Import download functions from their modules
self._init_downloaders()
def _init_downloaders(self) -> None:
"""Initialize downloader functions from their modules."""
try:
from API.archive_client import (
check_direct_download,
get_openlibrary_by_isbn,
loan
)
self.check_direct_download = check_direct_download
self.get_openlibrary_by_isbn = get_openlibrary_by_isbn
self.loan_func = loan
logger.debug("[UnifiedBookDownloader] Loaded archive.org downloaders from archive_client")
except Exception as e:
logger.warning(f"[UnifiedBookDownloader] Failed to load archive.org functions: {e}")
self.check_direct_download = None
self.get_openlibrary_by_isbn = None
self.loan_func = None
try:
from Provider.libgen_service import (
DEFAULT_LIMIT as _LIBGEN_DEFAULT_LIMIT,
download_from_mirror as _libgen_download,
search_libgen as _libgen_search,
)
def _log_info(message: str) -> None:
debug(f"[UnifiedBookDownloader] {message}")
def _log_error(message: str) -> None:
logger.error(f"[UnifiedBookDownloader] {message}")
self.search_libgen = lambda query, limit=_LIBGEN_DEFAULT_LIMIT: _libgen_search(
query,
limit=limit,
log_info=_log_info,
log_error=_log_error,
)
self.download_from_mirror = lambda mirror_url, output_path: _libgen_download(
mirror_url,
output_path,
log_info=_log_info,
log_error=_log_error,
)
logger.debug("[UnifiedBookDownloader] Loaded Libgen helpers")
except Exception as e:
logger.warning(f"[UnifiedBookDownloader] Failed to load Libgen helpers: {e}")
self.search_libgen = None
self.download_from_mirror = None
def get_download_options(self, book_data: Dict[str, Any]) -> Dict[str, Any]:
"""Get all available download options for a book.
Checks in priority order:
1. Archive.org direct download (public domain)
2. Archive.org borrowing (if credentials available and book is borrowable)
3. Libgen fallback (by ISBN)
Args:
book_data: Book metadata dict with at least 'openlibrary_id' or 'isbn'
Returns:
Dict with available download methods and metadata
"""
options = {
'book_title': book_data.get('title', 'Unknown'),
'book_author': book_data.get('author', 'Unknown'),
'isbn': book_data.get('isbn', ''),
'openlibrary_id': book_data.get('openlibrary_id', ''),
'methods': [], # Will be sorted by priority
'metadata': {}
}
# Extract book ID from openlibrary_id (e.g., OL8513721M -> 8513721, OL8513721W -> 8513721)
ol_id = book_data.get('openlibrary_id', '')
book_id = None
if ol_id.startswith('OL') and len(ol_id) > 2:
# Remove 'OL' prefix (keep everything after it including the suffix letter)
# The book_id is all digits after 'OL'
book_id = ''.join(c for c in ol_id[2:] if c.isdigit())
# PRIORITY 1: Check direct download (fastest, no auth needed)
if self.check_direct_download:
try:
can_download, pdf_url = self.check_direct_download(book_id)
if can_download:
options['methods'].append({
'type': 'archive.org_direct',
'label': 'Archive.org Direct Download',
'requires_auth': False,
'pdf_url': pdf_url,
'book_id': book_id,
'priority': 1 # Highest priority
})
logger.info(f"[UnifiedBookDownloader] Direct download available for {book_id}")
except Exception as e:
logger.debug(f"[UnifiedBookDownloader] Direct download check failed: {e}")
# PRIORITY 2: Check borrowing option (requires auth, 14-day loan)
# First verify the book is actually lendable via OpenLibrary API
if self._has_archive_credentials():
is_lendable, status = self._check_book_lendable_status(ol_id)
if is_lendable:
options['methods'].append({
'type': 'archive.org_borrow',
'label': 'Archive.org Borrow',
'requires_auth': True,
'book_id': book_id,
'priority': 2 # Second priority
})
logger.info(f"[UnifiedBookDownloader] Borrow option available for {book_id} (status: {status})")
else:
logger.debug(f"[UnifiedBookDownloader] Borrow not available for {book_id} (status: {status})")
# PRIORITY 3: Check Libgen fallback (by ISBN, no auth needed, most reliable)
isbn = book_data.get('isbn', '')
title = book_data.get('title', '')
author = book_data.get('author', '')
if self.search_libgen:
# Can use Libgen if we have ISBN OR title (or both)
if isbn or title:
options['methods'].append({
'type': 'libgen',
'label': 'Libgen Search & Download',
'requires_auth': False,
'isbn': isbn,
'title': title,
'author': author,
'priority': 3 # Third priority (fallback)
})
logger.info(f"[UnifiedBookDownloader] Libgen fallback available (ISBN: {isbn if isbn else 'N/A'}, Title: {title})")
# Sort by priority (higher priority first)
options['methods'].sort(key=lambda x: x.get('priority', 999))
return options
def _has_archive_credentials(self) -> bool:
"""Check if Archive.org credentials are available."""
try:
from API.archive_client import credential_openlibrary
email, password = credential_openlibrary(self.config)
return bool(email and password)
except Exception:
return False
def _check_book_lendable_status(self, ol_id: str) -> Tuple[bool, Optional[str]]:
"""Check if a book is lendable via OpenLibrary API.
Queries: https://openlibrary.org/api/volumes/brief/json/OLID:{ol_id}
Note: Only works with Edition IDs (OL...M), not Work IDs (OL...W)
Args:
ol_id: OpenLibrary ID (e.g., OL8513721M for Edition or OL4801915W for Work)
Returns:
Tuple of (is_lendable: bool, status_reason: Optional[str])
"""
try:
if not ol_id.startswith('OL'):
return False, "Invalid OpenLibrary ID format"
# If this is a Work ID (ends with W), we can't query Volumes API
# Work IDs are abstract umbrella records, not specific editions
if ol_id.endswith('W'):
logger.debug(f"[UnifiedBookDownloader] Work ID {ol_id} - skipping Volumes API (not lendable)")
return False, "Work ID not supported by Volumes API (not a specific edition)"
# If it ends with M, it's an Edition ID - proceed with query
if not ol_id.endswith('M'):
logger.debug(f"[UnifiedBookDownloader] Unknown ID type {ol_id} (not M or W)")
return False, "Invalid OpenLibrary ID type"
url = f"https://openlibrary.org/api/volumes/brief/json/OLID:{ol_id}"
response = self.session.get(url, timeout=10)
response.raise_for_status()
data = response.json()
# Empty response means no records found
if not data:
logger.debug(f"[UnifiedBookDownloader] Empty response for {ol_id}")
return False, "No availability data found"
# The response is wrapped in OLID key
olid_key = f"OLID:{ol_id}"
if olid_key not in data:
logger.debug(f"[UnifiedBookDownloader] OLID key not found in response")
return False, "No availability data found"
olid_data = data[olid_key]
# Check items array for lendable status
if 'items' in olid_data and olid_data['items'] and len(olid_data['items']) > 0:
items = olid_data['items']
# Check the first item for lending status
first_item = items[0]
# Handle both dict and string representations (PowerShell converts to string)
if isinstance(first_item, dict):
status = first_item.get('status', '')
else:
# String representation - check if 'lendable' is in it
status = str(first_item).lower()
is_lendable = 'lendable' in str(status).lower()
if is_lendable:
logger.info(f"[UnifiedBookDownloader] Book {ol_id} is lendable")
return True, "LENDABLE"
else:
status_str = status.get('status', 'NOT_LENDABLE') if isinstance(status, dict) else 'NOT_LENDABLE'
logger.debug(f"[UnifiedBookDownloader] Book {ol_id} is not lendable (status: {status_str})")
return False, status_str
else:
# No items array or empty
logger.debug(f"[UnifiedBookDownloader] No items found for {ol_id}")
return False, "Not available for lending"
except requests.exceptions.Timeout:
logger.warning(f"[UnifiedBookDownloader] OpenLibrary API timeout for {ol_id}")
return False, "API timeout"
except Exception as e:
logger.debug(f"[UnifiedBookDownloader] Failed to check lendable status for {ol_id}: {e}")
return False, f"API error"
async def download_book(self, method: Dict[str, Any], output_dir: Optional[str] = None) -> Tuple[bool, str]:
"""Download a book using the specified method.
Args:
method: Download method dict from get_download_options()
output_dir: Directory to save the book
Returns:
Tuple of (success: bool, message: str)
"""
output_dir = output_dir or self.output_dir or str(Path.home() / "Downloads")
method_type = method.get('type', '')
logger.info(f"[UnifiedBookDownloader] Starting download with method: {method_type}")
try:
if method_type == 'archive.org_direct':
return await self._download_archive_direct(method, output_dir)
elif method_type == 'archive.org_borrow':
return await self._download_archive_borrow(method, output_dir)
elif method_type == 'libgen':
return await self._download_libgen(method, output_dir)
else:
return False, f"Unknown download method: {method_type}"
except Exception as e:
logger.error(f"[UnifiedBookDownloader] Download error: {e}", exc_info=True)
return False, f"Download failed: {str(e)}"
async def _download_archive_direct(self, method: Dict[str, Any], output_dir: str) -> Tuple[bool, str]:
"""Download directly from Archive.org."""
try:
pdf_url = method.get('pdf_url', '')
book_id = method.get('book_id', '')
if not pdf_url:
return False, "No PDF URL available"
# Determine output filename
filename = f"{book_id}.pdf"
output_path = Path(output_dir) / filename
logger.info(f"[UnifiedBookDownloader] Downloading PDF from: {pdf_url}")
# Download in a thread to avoid blocking
loop = asyncio.get_event_loop()
success = await loop.run_in_executor(
None,
self._download_file,
pdf_url,
str(output_path)
)
if success:
logger.info(f"[UnifiedBookDownloader] Successfully downloaded to: {output_path}")
return True, f"Downloaded to: {output_path}"
else:
return False, "Failed to download PDF"
except Exception as e:
logger.error(f"[UnifiedBookDownloader] Archive direct download error: {e}")
return False, f"Archive download failed: {str(e)}"
async def _download_archive_borrow(self, method: Dict[str, Any], output_dir: str) -> Tuple[bool, str]:
"""Download via Archive.org borrowing (requires credentials).
Process (follows archive_client.py pattern):
1. Login to Archive.org with credentials
2. Call loan endpoint to borrow the book (14-day loan)
3. Get book info (page links, metadata)
4. Download all pages as images
5. Merge images into PDF
The loan function from archive_client.py handles:
- Checking if book needs borrowing (status 400 = "doesn't need to be borrowed")
- Creating borrow token for access
- Handling borrow failures
get_book_infos() extracts page links from the borrowed book viewer
download() downloads all pages using thread pool
img2pdf merges pages into searchable PDF
"""
try:
from API.archive_client import credential_openlibrary
book_id = method.get('book_id', '')
# Get credentials
email, password = credential_openlibrary(self.config)
if not email or not password:
return False, "Archive.org credentials not configured"
logger.info(f"[UnifiedBookDownloader] Logging into Archive.org...")
# Login and borrow (in thread, following download_book.py pattern)
loop = asyncio.get_event_loop()
borrow_result = await loop.run_in_executor(
None,
self._archive_borrow_and_download,
email,
password,
book_id,
output_dir
)
if borrow_result and isinstance(borrow_result, tuple):
success, filepath = borrow_result
if success:
logger.info(f"[UnifiedBookDownloader] Borrow succeeded: {filepath}")
return True, filepath
else:
logger.warning(f"[UnifiedBookDownloader] Borrow failed: {filepath}")
return False, filepath
else:
return False, "Failed to borrow book from Archive.org"
except Exception as e:
logger.error(f"[UnifiedBookDownloader] Archive borrow error: {e}")
return False, f"Archive borrow failed: {str(e)}"
async def _download_libgen(self, method: Dict[str, Any], output_dir: str) -> Tuple[bool, str]:
"""Download via Libgen search and download with mirror fallback."""
try:
isbn = method.get('isbn', '')
title = method.get('title', '')
if not isbn and not title:
return False, "Need ISBN or title for Libgen search"
if not self.search_libgen:
return False, "Libgen searcher not available"
# Bind the callables locally so the lambdas below capture non-None references
search_func = self.search_libgen
if search_func is None:
return False, "Search function not available"
preloaded_results = method.get('results')
loop = asyncio.get_event_loop()
if preloaded_results:
results = list(preloaded_results)
if not results:
results = await loop.run_in_executor(None, lambda: search_func(isbn or title, 10))
else:
results = await loop.run_in_executor(None, lambda: search_func(isbn or title, 10))
if not results:
logger.warning(f"[UnifiedBookDownloader] No Libgen results for: {isbn or title}")
return False, f"No Libgen results found for: {isbn or title}"
logger.info(f"[UnifiedBookDownloader] Found {len(results)} Libgen results")
# Determine output filename (use first result for naming)
first_result = results[0]
filename = f"{first_result.get('title', 'book')}"
filename = "".join(c for c in filename if c.isalnum() or c in (' ', '.', '-'))[:100]
# Try each result's mirror until one succeeds
for idx, result in enumerate(results, 1):
mirror_url = result.get('mirror_url', '')
if not mirror_url:
logger.debug(f"[UnifiedBookDownloader] Result {idx}: No mirror URL")
continue
# Use extension from this result if available
extension = result.get('extension', 'pdf')
if extension and not extension.startswith('.'):
extension = f".{extension}"
elif not extension:
extension = '.pdf'
output_path = Path(output_dir) / (filename + extension)
logger.info(f"[UnifiedBookDownloader] Trying mirror {idx}/{len(results)}: {mirror_url}")
download_func = self.download_from_mirror
if download_func is None:
return False, "Download function not available"
download_callable = cast(Callable[[str, str], Tuple[bool, Optional[Path]]], download_func)
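# run_in_executor() only forwards positional arguments to the callable, so the
# mirror call is wrapped in a zero-argument closure below; cast() narrows the
# Optional type for static checkers and has no effect at runtime.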
def download_wrapper():
return download_callable(mirror_url, str(output_path))
# Download (in thread)
try:
success, downloaded_path = await loop.run_in_executor(None, download_wrapper)
if success:
dest_path = Path(downloaded_path) if downloaded_path else output_path
# Validate downloaded file is not HTML (common Libgen issue)
if dest_path.exists():
try:
with open(dest_path, 'rb') as f:
file_start = f.read(1024).decode('utf-8', errors='ignore').lower()
if '<!doctype' in file_start or '<html' in file_start:
logger.warning(f"[UnifiedBookDownloader] Mirror {idx} returned HTML instead of file, trying next mirror...")
dest_path.unlink() # Delete the HTML file
continue
except Exception as e:
logger.debug(f"[UnifiedBookDownloader] Could not validate file content: {e}")
logger.info(f"[UnifiedBookDownloader] Successfully downloaded from mirror {idx} to: {dest_path}")
return True, str(dest_path)
else:
logger.warning(f"[UnifiedBookDownloader] Mirror {idx} download failed, trying next...")
except Exception as e:
logger.warning(f"[UnifiedBookDownloader] Mirror {idx} error: {e}, trying next...")
continue
return False, f"All {len(results)} mirrors failed"
except Exception as e:
logger.error(f"[UnifiedBookDownloader] Libgen download error: {e}")
return False, f"Libgen download failed: {str(e)}"
async def download_libgen_selection(
self,
selected: Dict[str, Any],
remaining: Optional[List[Dict[str, Any]]] = None,
output_dir: Optional[str] = None,
) -> Tuple[bool, str]:
"""Download a specific Libgen result with optional fallbacks."""
if not isinstance(selected, dict):
return False, "Selected result must be a dictionary"
ordered_results: List[Dict[str, Any]] = [selected]
if remaining:
for item in remaining:
if isinstance(item, dict) and item is not selected:
ordered_results.append(item)
method: Dict[str, Any] = {
'type': 'libgen',
'isbn': selected.get('isbn', '') or '',
'title': selected.get('title', '') or '',
'author': selected.get('author', '') or '',
'results': ordered_results,
}
return await self.download_book(method, output_dir)
def download_libgen_selection_sync(
self,
selected: Dict[str, Any],
remaining: Optional[List[Dict[str, Any]]] = None,
output_dir: Optional[str] = None,
) -> Tuple[bool, str]:
"""Synchronous helper for downloading a Libgen selection."""
async def _run() -> Tuple[bool, str]:
return await self.download_libgen_selection(selected, remaining, output_dir)
loop = asyncio.new_event_loop()
try:
asyncio.set_event_loop(loop)
return loop.run_until_complete(_run())
finally:
loop.close()
asyncio.set_event_loop(None)
def _download_file(self, url: str, output_path: str) -> bool:
"""Download a file from URL."""
try:
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
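# Stream the body in 8 KiB chunks so large files are never held fully in memory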
with open(output_path, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
return True
except Exception as e:
logger.error(f"[UnifiedBookDownloader] File download error: {e}")
return False
def _archive_borrow_and_download(self, email: str, password: str, book_id: str, output_dir: str) -> Tuple[bool, str]:
"""Borrow a book from Archive.org and download pages as PDF.
This follows the exact process from archive_client.py:
1. Login with credentials
2. Call loan() to create 14-day borrow
3. Get book info (extract page URLs)
4. Download all pages as images
5. Merge images into searchable PDF
Returns tuple of (success: bool, filepath/message: str)
"""
try:
from API.archive_client import login, loan, get_book_infos, download
import tempfile
import shutil
logger.info(f"[UnifiedBookDownloader] Logging into Archive.org as {email}")
session = login(email, password)
logger.info(f"[UnifiedBookDownloader] Attempting to borrow book: {book_id}")
# Call loan to create the 14-day borrow
session = loan(session, book_id, verbose=True)
# If we get here, borrowing succeeded
logger.info(f"[UnifiedBookDownloader] Successfully borrowed book: {book_id}")
# Now get the book info (page URLs and metadata)
logger.info(f"[UnifiedBookDownloader] Extracting book page information...")
# Try both URL formats: with /borrow and without
book_urls = [
f"https://archive.org/borrow/{book_id}", # Try borrow page first (for borrowed books)
f"https://archive.org/details/{book_id}" # Fallback to details page
]
title = None
links = None
metadata = None
last_error = None
for book_url in book_urls:
try:
logger.debug(f"[UnifiedBookDownloader] Trying to get book info from: {book_url}")
response = session.get(book_url, timeout=10)
# Log response status
if response.status_code != 200:
logger.debug(f"[UnifiedBookDownloader] URL returned {response.status_code}: {book_url}")
# Continue to try next URL
continue
# Try to parse the response
title, links, metadata = get_book_infos(session, book_url)
logger.info(f"[UnifiedBookDownloader] Successfully got info from: {book_url}")
logger.info(f"[UnifiedBookDownloader] Found {len(links)} pages to download")
break
except Exception as e:
logger.debug(f"[UnifiedBookDownloader] Failed with {book_url}: {e}")
last_error = e
continue
if links is None:
logger.error(f"[UnifiedBookDownloader] Failed to get book info from all url: {last_error}")
# Borrow extraction failed - return False
return False, "Could not extract borrowed book pages"
# Create temporary directory for images
temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=output_dir)
logger.info(f"[UnifiedBookDownloader] Downloading {len(links)} pages to temporary directory...")
try:
# Download all pages (uses thread pool)
images = download(
session=session,
n_threads=10,
directory=temp_dir,
links=links,
scale=3, # Default resolution
book_id=book_id
)
logger.info(f"[UnifiedBookDownloader] Downloaded {len(images)} pages")
# Try to merge pages into PDF
try:
import img2pdf
logger.info(f"[UnifiedBookDownloader] Merging pages into PDF...")
# Prepare PDF metadata
pdfmeta = {}
if metadata:
if "title" in metadata:
pdfmeta["title"] = metadata["title"]
if "creator" in metadata:
pdfmeta["author"] = metadata["creator"]
pdfmeta["keywords"] = [f"https://archive.org/details/{book_id}"]
pdfmeta["creationdate"] = None # Avoid timezone issues
# Convert images to PDF
pdf_content = img2pdf.convert(images, **pdfmeta) if images else None
if not pdf_content:
logger.error(f"[UnifiedBookDownloader] PDF conversion failed")
return False, "Failed to convert pages to PDF"
# Save the PDF
pdf_filename = f"{title}.pdf" if title else "book.pdf"
pdf_path = Path(output_dir) / pdf_filename
# Handle duplicate filenames
i = 1
while pdf_path.exists():
pdf_path = Path(output_dir) / f"{title or 'book'}({i}).pdf"
i += 1
with open(pdf_path, 'wb') as f:
f.write(pdf_content)
logger.info(f"[UnifiedBookDownloader] Successfully created PDF: {pdf_path}")
return True, str(pdf_path)
except ImportError:
logger.warning(f"[UnifiedBookDownloader] img2pdf not available, saving as JPG collection instead")
# Create JPG collection directory
if not title:
title = f"book_{book_id}"
jpg_dir = Path(output_dir) / title
i = 1
while jpg_dir.exists():
jpg_dir = Path(output_dir) / f"{title}({i})"
i += 1
# Move temporary directory to final location
shutil.move(temp_dir, str(jpg_dir))
temp_dir = None # Mark as already moved
logger.info(f"[UnifiedBookDownloader] Saved as JPG collection: {jpg_dir}")
return True, str(jpg_dir)
finally:
# Clean up temporary directory if it still exists
if temp_dir and Path(temp_dir).exists():
shutil.rmtree(temp_dir)
except SystemExit:
# loan() function calls sys.exit on failure - catch it
logger.error(f"[UnifiedBookDownloader] Borrow process exited (book may not be borrowable)")
return False, "Book could not be borrowed (may not be available for borrowing)"
except Exception as e:
logger.error(f"[UnifiedBookDownloader] Archive borrow error: {e}")
return False, f"Borrow failed: {str(e)}"
def close(self) -> None:
"""Close the session."""
self.session.close()
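# Sketch of a typical call sequence (illustrative only; the constructor and
# get_download_options() are defined earlier in this module and their exact
# signatures may differ):
#   downloader = UnifiedBookDownloader(config)
#   methods = downloader.get_download_options(book)   # hypothetical call shape
#   ok, message = asyncio.run(downloader.download_book(methods[0], output_dir))
#   downloader.close()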

View File

@@ -6,7 +6,7 @@ import subprocess
import sys
from typing import Any, Dict, List, Optional
from Provider._base import SearchProvider, SearchResult
from ProviderCore.base import SearchProvider, SearchResult
from SYS.logger import log

View File

@@ -4,7 +4,7 @@ import os
import sys
from typing import Any
from Provider._base import FileProvider
from ProviderCore.base import FileProvider
from SYS.logger import log