5
Provider/__init__.py
Normal file
@@ -0,0 +1,5 @@
"""Provider plugin modules.

Concrete provider implementations live in this package.
The public entrypoint/registry is Provider.registry.
"""
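
For orientation, a minimal usage sketch of the registry entrypoint mentioned above, using names defined in Provider/registry.py later in this commit:

    from Provider.registry import get_search_provider

    provider = get_search_provider("libgen")
    if provider is not None:
        for result in provider.search("Morals and Dogma", limit=5):
            print(result.title, result.path)
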
84
Provider/_base.py
Normal file
@@ -0,0 +1,84 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple


@dataclass
class SearchResult:
    """Unified search result format across all search providers."""

    table: str  # Provider name: "libgen", "soulseek", "bandcamp", "youtube", etc.
    title: str  # Display title/filename
    path: str  # Download target (URL, path, magnet, identifier)

    detail: str = ""  # Additional description
    annotations: List[str] = field(default_factory=list)  # Tags: ["120MB", "flac", "ready"]
    media_kind: str = "other"  # Type: "book", "audio", "video", "game", "magnet"
    size_bytes: Optional[int] = None
    tags: set[str] = field(default_factory=set)  # Searchable tags
    columns: List[Tuple[str, str]] = field(default_factory=list)  # Display columns
    full_metadata: Dict[str, Any] = field(default_factory=dict)  # Extra metadata

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for pipeline processing."""
        return {
            "table": self.table,
            "title": self.title,
            "path": self.path,
            "detail": self.detail,
            "annotations": self.annotations,
            "media_kind": self.media_kind,
            "size_bytes": self.size_bytes,
            "tags": list(self.tags),
            "columns": list(self.columns),
            "full_metadata": self.full_metadata,
        }


class SearchProvider(ABC):
    """Base class for search providers."""

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self.config = config or {}
        self.name = self.__class__.__name__.lower()

    @abstractmethod
    def search(
        self,
        query: str,
        limit: int = 50,
        filters: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[SearchResult]:
        """Search for items matching the query."""

    def download(self, result: SearchResult, output_dir: Path) -> Optional[Path]:
        """Download an item from a search result."""
        return None

    def validate(self) -> bool:
        """Check if provider is available and properly configured."""
        return True


class FileProvider(ABC):
    """Base class for file upload providers."""

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self.config = config or {}
        self.name = self.__class__.__name__.lower()

    @abstractmethod
    def upload(self, file_path: str, **kwargs: Any) -> str:
        """Upload a file and return the URL."""

    def validate(self) -> bool:
        """Check if provider is available/configured."""
        return True
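
A short illustrative sketch of the SearchResult contract (all values below are made up):

    from Provider._base import SearchResult

    result = SearchResult(
        table="libgen",
        title="Morals and Dogma",
        path="http://library.lol/main/<md5>",  # hypothetical download target
        media_kind="book",
        annotations=["4 MB", "pdf"],
    )
    payload = result.to_dict()  # plain dict handed to the pipeline
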
109
Provider/bandcamp.py
Normal file
@@ -0,0 +1,109 @@
from __future__ import annotations

import sys
from typing import Any, Dict, List, Optional
from urllib.parse import quote_plus

from Provider._base import SearchProvider, SearchResult
from SYS.logger import log, debug

try:
    from playwright.sync_api import sync_playwright
except ImportError:  # pragma: no cover
    sync_playwright = None


class Bandcamp(SearchProvider):
    """Search provider for Bandcamp."""

    def search(
        self,
        query: str,
        limit: int = 50,
        filters: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[SearchResult]:
        if sync_playwright is None:
            log(
                "[bandcamp] Playwright not available. Install with: pip install playwright",
                file=sys.stderr,
            )
            return []

        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True)
                page = browser.new_page()

                # URL-encode the query so spaces and characters like '&' survive in the URL
                if query.strip().lower().startswith("artist:"):
                    artist_name = query[7:].strip().strip('"')
                    search_url = f"https://bandcamp.com/search?q={quote_plus(artist_name)}&item_type=b"
                else:
                    search_url = f"https://bandcamp.com/search?q={quote_plus(query)}&item_type=a"

                results = self._scrape_url(page, search_url, limit)

                browser.close()
                return results

        except Exception as exc:
            log(f"[bandcamp] Search error: {exc}", file=sys.stderr)
            return []

    def _scrape_url(self, page: Any, url: str, limit: int) -> List[SearchResult]:
        debug(f"[bandcamp] Scraping: {url}")

        page.goto(url)
        page.wait_for_load_state("domcontentloaded")

        results: List[SearchResult] = []

        search_results = page.query_selector_all(".searchresult")
        if not search_results:
            return results

        for item in search_results[:limit]:
            try:
                heading = item.query_selector(".heading")
                if not heading:
                    continue

                link = heading.query_selector("a")
                if not link:
                    continue

                title = link.inner_text().strip()
                target_url = link.get_attribute("href")

                subhead = item.query_selector(".subhead")
                artist = subhead.inner_text().strip() if subhead else "Unknown"

                itemtype = item.query_selector(".itemtype")
                media_type = itemtype.inner_text().strip() if itemtype else "album"

                results.append(
                    SearchResult(
                        table="bandcamp",
                        title=title,
                        path=target_url,
                        detail=f"By: {artist}",
                        annotations=[media_type],
                        media_kind="audio",
                        columns=[
                            ("Name", title),
                            ("Artist", artist),
                            ("Type", media_type),
                        ],
                        full_metadata={
                            "artist": artist,
                            "type": media_type,
                        },
                    )
                )

            except Exception as exc:
                debug(f"[bandcamp] Error parsing result: {exc}")

        return results

    def validate(self) -> bool:
        return sync_playwright is not None
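
Note: in addition to pip install playwright, the Chromium runtime must be fetched once with "playwright install chromium" before the headless launch above can succeed.
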
98
Provider/libgen.py
Normal file
@@ -0,0 +1,98 @@
from __future__ import annotations

import sys
from typing import Any, Dict, List, Optional

from Provider._base import SearchProvider, SearchResult
from SYS.logger import log


class Libgen(SearchProvider):
    """Search provider for Library Genesis books."""

    def search(
        self,
        query: str,
        limit: int = 50,
        filters: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[SearchResult]:
        filters = filters or {}

        try:
            from Provider.unified_book_downloader import UnifiedBookDownloader
            from Provider.query_parser import parse_query, get_field, get_free_text

            parsed = parse_query(query)
            isbn = get_field(parsed, "isbn")
            author = get_field(parsed, "author")
            title = get_field(parsed, "title")
            free_text = get_free_text(parsed)

            search_query = isbn or title or author or free_text or query

            downloader = UnifiedBookDownloader(config=self.config)
            books = downloader.search_libgen(search_query, limit=limit)

            results: List[SearchResult] = []
            for idx, book in enumerate(books, 1):
                title = book.get("title", "Unknown")
                author = book.get("author", "Unknown")
                year = book.get("year", "Unknown")
                pages = book.get("pages") or book.get("pages_str") or ""
                extension = book.get("extension", "") or book.get("ext", "")
                filesize = book.get("filesize_str", "Unknown")
                isbn = book.get("isbn", "")
                mirror_url = book.get("mirror_url", "")

                columns = [
                    ("Title", title),
                    ("Author", author),
                    ("Pages", str(pages)),
                    ("Ext", str(extension)),
                ]

                detail = f"By: {author}"
                if year and year != "Unknown":
                    detail += f" ({year})"

                annotations = [f"{filesize}"]
                if isbn:
                    annotations.append(f"ISBN: {isbn}")

                results.append(
                    SearchResult(
                        table="libgen",
                        title=title,
                        path=mirror_url or f"libgen:{book.get('id', '')}",
                        detail=detail,
                        annotations=annotations,
                        media_kind="book",
                        columns=columns,
                        full_metadata={
                            "number": idx,
                            "author": author,
                            "year": year,
                            "isbn": isbn,
                            "filesize": filesize,
                            "pages": pages,
                            "extension": extension,
                            "book_id": book.get("book_id", ""),
                            "md5": book.get("md5", ""),
                        },
                    )
                )

            return results

        except Exception as exc:
            log(f"[libgen] Search error: {exc}", file=sys.stderr)
            return []

    def validate(self) -> bool:
        try:
            from Provider.unified_book_downloader import UnifiedBookDownloader  # noqa: F401

            return True
        except Exception:
            return False
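
A minimal sketch of how the field syntax flows into this provider (field names per Provider/query_parser.py below):

    from Provider.libgen import Libgen

    provider = Libgen(config={})
    results = provider.search('author:"Albert Pike" title:"Morals and Dogma"', limit=10)
    for r in results:
        print(r.title, r.annotations)
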
523
Provider/libgen_service.py
Normal file
@@ -0,0 +1,523 @@
"""Shared Library Genesis search and download helpers.

Replaces the old libgen backend with a robust scraper based on libgen-api-enhanced logic.
Targets libgen.is/rs/st mirrors and parses the results table directly.
"""
from __future__ import annotations

import logging
import re
import requests
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple
from urllib.parse import quote, urljoin, urlparse, unquote

# Optional dependencies
try:
    from bs4 import BeautifulSoup
except ImportError:
    BeautifulSoup = None

LogFn = Optional[Callable[[str], None]]
ErrorFn = Optional[Callable[[str], None]]

DEFAULT_TIMEOUT = 20.0
DEFAULT_LIMIT = 50

# Mirrors to try in order
MIRRORS = [
    "https://libgen.is",
    "https://libgen.rs",
    "https://libgen.st",
    "http://libgen.is",
    "http://libgen.rs",
    "http://libgen.st",
    "https://libgen.li",  # Different structure, fallback
    "http://libgen.li",
    "https://libgen.gl",  # Different structure, fallback
    "http://libgen.gl",
]

logging.getLogger(__name__).setLevel(logging.INFO)


def _call(logger: LogFn, message: str) -> None:
    if logger:
        logger(message)


class LibgenSearch:
    """Robust LibGen searcher."""

    def __init__(self, session: Optional[requests.Session] = None):
        self.session = session or requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        })

    def search(self, query: str, limit: int = DEFAULT_LIMIT) -> List[Dict[str, Any]]:
        """Search LibGen mirrors."""
        if not BeautifulSoup:
            logging.error("BeautifulSoup not installed. Cannot search LibGen.")
            return []

        for mirror in MIRRORS:
            try:
                if "libgen.li" in mirror or "libgen.gl" in mirror:
                    results = self._search_libgen_li(mirror, query, limit)
                else:
                    results = self._search_libgen_rs(mirror, query, limit)

                if results:
                    return results
            except Exception as e:
                logging.debug(f"Mirror {mirror} failed: {e}")
                continue

        return []

    def _search_libgen_rs(self, mirror: str, query: str, limit: int) -> List[Dict[str, Any]]:
        """Search libgen.rs/is/st style mirrors."""
        # Search URL: /search.php?req=QUERY&res=100&column=def
        url = f"{mirror}/search.php"
        params = {
            "req": query,
            "res": 100,  # Request more to filter later
            "column": "def",
            "open": 0,
            "view": "simple",
            "phrase": 1,
        }

        resp = self.session.get(url, params=params, timeout=DEFAULT_TIMEOUT)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")

        # Find the results table, usually class 'c'
        table = soup.find("table", {"class": "c"})
        if not table:
            # Try finding by structure (a table with many rows)
            tables = soup.find_all("table")
            for t in tables:
                if len(t.find_all("tr")) > 5:
                    table = t
                    break

        if not table:
            return []

        results = []
        # Skip the header row
        rows = table.find_all("tr")[1:]

        for row in rows:
            cols = row.find_all("td")
            if len(cols) < 9:
                continue

            # Columns:
            # 0: ID
            # 1: Author(s)
            # 2: Title
            # 3: Publisher
            # 4: Year
            # 5: Pages
            # 6: Language
            # 7: Size
            # 8: Extension
            # 9+: Mirrors

            try:
                libgen_id = cols[0].get_text(strip=True)
                authors = [a.get_text(strip=True) for a in cols[1].find_all("a")]
                if not authors:
                    authors = [cols[1].get_text(strip=True)]

                title_tag = cols[2].find("a")
                title = title_tag.get_text(strip=True) if title_tag else cols[2].get_text(strip=True)

                # Extract the MD5 from the title link if possible (often in href)
                # href='book/index.php?md5=...'
                md5 = ""
                if title_tag and title_tag.has_attr("href"):
                    href = title_tag["href"]
                    match = re.search(r"md5=([a-fA-F0-9]{32})", href)
                    if match:
                        md5 = match.group(1)

                publisher = cols[3].get_text(strip=True)
                year = cols[4].get_text(strip=True)
                pages = cols[5].get_text(strip=True)
                language = cols[6].get_text(strip=True)
                size = cols[7].get_text(strip=True)
                extension = cols[8].get_text(strip=True)

                # Mirrors
                # Usually col 9 is http://library.lol/main/MD5
                mirror_links = []
                for i in range(9, len(cols)):
                    a = cols[i].find("a")
                    if a and a.has_attr("href"):
                        mirror_links.append(a["href"])

                # Construct the direct download page link (library.lol).
                # If we have the MD5, we can guess it: http://library.lol/main/{md5}
                if md5:
                    download_link = f"http://library.lol/main/{md5}"
                elif mirror_links:
                    download_link = mirror_links[0]
                else:
                    download_link = ""

                results.append({
                    "id": libgen_id,
                    "title": title,
                    "author": ", ".join(authors),
                    "publisher": publisher,
                    "year": year,
                    "pages": pages,
                    "language": language,
                    "filesize_str": size,
                    "extension": extension,
                    "md5": md5,
                    "mirror_url": download_link,
                    "cover": "",  # Could extract from hover if needed
                })

                if len(results) >= limit:
                    break

            except Exception as e:
                logging.debug(f"Error parsing row: {e}")
                continue

        return results

    def _search_libgen_li(self, mirror: str, query: str, limit: int) -> List[Dict[str, Any]]:
        """Search libgen.li/gl style mirrors."""
        # Search URL: /index.php?req=QUERY&columns[]=t&columns[]=a...
        url = f"{mirror}/index.php"
        params = {
            "req": query,
            "res": 100,
            "covers": "on",
            "filesuns": "all",
        }

        resp = self.session.get(url, params=params, timeout=DEFAULT_TIMEOUT)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")
        table = soup.find("table", {"id": "tablelibgen"})
        if not table:
            table = soup.find("table", {"class": "table table-striped"})

        if not table:
            return []

        results = []
        rows = table.find_all("tr")[1:]

        for row in rows:
            cols = row.find_all("td")
            if len(cols) < 9:
                continue

            try:
                # The structure is different:
                # 0: Cover
                # 1: Title (with link to file.php?id=...)
                # 2: Author
                # 3: Publisher
                # 4: Year
                # 5: Language
                # 6: Pages
                # 7: Size
                # 8: Extension
                # 9: Mirrors

                title_col = cols[1]
                title_link = title_col.find("a")
                title = title_link.get_text(strip=True) if title_link else title_col.get_text(strip=True)

                # Extract the ID from the link
                libgen_id = ""
                if title_link and title_link.has_attr("href"):
                    href = title_link["href"]
                    # href is usually "file.php?id=..." or "edition.php?id=..."
                    match = re.search(r"id=(\d+)", href)
                    if match:
                        libgen_id = match.group(1)

                authors = cols[2].get_text(strip=True)
                publisher = cols[3].get_text(strip=True)
                year = cols[4].get_text(strip=True)
                language = cols[5].get_text(strip=True)
                pages = cols[6].get_text(strip=True)
                size = cols[7].get_text(strip=True)
                extension = cols[8].get_text(strip=True)

                # Mirror link
                # Usually in col 9 or the title link
                mirror_url = ""
                if title_link:
                    href = title_link["href"]
                    if href.startswith("/"):
                        mirror_url = mirror + href
                    else:
                        mirror_url = urljoin(mirror, href)

                results.append({
                    "id": libgen_id,
                    "title": title,
                    "author": authors,
                    "publisher": publisher,
                    "year": year,
                    "pages": pages,
                    "language": language,
                    "filesize_str": size,
                    "extension": extension,
                    "md5": "",  # .li doesn't show the MD5 easily in the table
                    "mirror_url": mirror_url,
                })

                if len(results) >= limit:
                    break
            except Exception:
                continue

        return results


def search_libgen(
    query: str,
    limit: int = DEFAULT_LIMIT,
    *,
    log_info: LogFn = None,
    log_error: ErrorFn = None,
    session: Optional[requests.Session] = None,
) -> List[Dict[str, Any]]:
    """Search LibGen using the robust scraper."""
    searcher = LibgenSearch(session=session)
    try:
        results = searcher.search(query, limit=limit)
        _call(log_info, f"[libgen] Found {len(results)} results")
        return results
    except Exception as e:
        _call(log_error, f"[libgen] Search failed: {e}")
        return []


def _resolve_download_url(
    session: requests.Session,
    url: str,
    log_info: LogFn = None,
) -> Optional[str]:
    """Resolve the final download URL by following the LibGen chain."""
    current_url = url
    visited = set()

    # Max hops to prevent infinite loops
    for _ in range(6):
        if current_url in visited:
            break
        visited.add(current_url)

        _call(log_info, f"[resolve] Checking: {current_url}")

        # Simple heuristic: if it looks like a file, return it
        if current_url.lower().endswith(('.pdf', '.epub', '.mobi', '.djvu', '.azw3', '.cbz', '.cbr')):
            return current_url

        try:
            # HEAD would be the cheap way to check the content type, but some mirrors
            # block HEAD or return 405, so GET with stream=True to peek at the headers
            # without downloading everything.
            with session.get(current_url, stream=True, timeout=30) as resp:
                resp.raise_for_status()
                ct = resp.headers.get("Content-Type", "").lower()

                if "text/html" not in ct:
                    # It's a binary file
                    return current_url

                # It's HTML; read the content
                content = resp.text
        except Exception as e:
            _call(log_info, f"[resolve] Failed to fetch {current_url}: {e}")
            return None

        soup = BeautifulSoup(content, "html.parser")

        # 1. Check for a "GET" link (library.lol / ads.php style).
        # Usually <h2>GET</h2> inside <a>, or just the text "GET".
        get_link = soup.find("a", string=re.compile(r"^GET$", re.IGNORECASE))
        if not get_link:
            # Try finding an <a> containing <h2>GET</h2>
            h2_get = soup.find("h2", string=re.compile(r"^GET$", re.IGNORECASE))
            if h2_get and h2_get.parent.name == "a":
                get_link = h2_get.parent

        if get_link and get_link.has_attr("href"):
            return urljoin(current_url, get_link["href"])

        # 2. Check for "series.php" -> "edition.php"
        if "series.php" in current_url:
            # Find the first edition link
            edition_link = soup.find("a", href=re.compile(r"edition\.php"))
            if edition_link:
                current_url = urljoin(current_url, edition_link["href"])
                continue

        # 3. Check for "edition.php" -> "file.php"
        if "edition.php" in current_url:
            file_link = soup.find("a", href=re.compile(r"file\.php"))
            if file_link:
                current_url = urljoin(current_url, file_link["href"])
                continue

        # 4. Check for "file.php" -> "ads.php" (Libgen badge)
        if "file.php" in current_url:
            # Look for a link with title="libgen" or the text "Libgen"
            libgen_link = soup.find("a", title="libgen")
            if not libgen_link:
                libgen_link = soup.find("a", string=re.compile(r"Libgen", re.IGNORECASE))

            if libgen_link and libgen_link.has_attr("href"):
                current_url = urljoin(current_url, libgen_link["href"])
                continue

        # 5. Check for "ads.php" -> "get.php" (fallback if the GET link logic above failed)
        if "ads.php" in current_url:
            get_php_link = soup.find("a", href=re.compile(r"get\.php"))
            if get_php_link:
                return urljoin(current_url, get_php_link["href"])

        # 6. library.lol / generic fallback
        for text in ["Cloudflare", "IPFS.io", "Infura"]:
            link = soup.find("a", string=re.compile(text, re.IGNORECASE))
            if link and link.has_attr("href"):
                return urljoin(current_url, link["href"])

        # If we found nothing new, stop
        break

    return None


def _guess_filename_extension(download_url: str, headers: Dict[str, str]) -> Optional[str]:
    """Guess the file extension from headers or the download URL."""
    content_disposition = headers.get("content-disposition", "")
    if content_disposition:
        match = re.search(r'filename\*?=(?:UTF-8\'\'|"?)([^";]+)', content_disposition, flags=re.IGNORECASE)
        if match:
            filename = unquote(match.group(1).strip('"'))
            suffix = Path(filename).suffix
            if suffix:
                return suffix.lstrip('.')

    parsed = urlparse(download_url)
    suffix = Path(parsed.path).suffix
    if suffix:
        return suffix.lstrip('.')

    content_type = headers.get('content-type', '').lower()
    mime_map = {
        'application/pdf': 'pdf',
        'application/epub+zip': 'epub',
        'application/x-mobipocket-ebook': 'mobi',
        'application/x-cbr': 'cbr',
        'application/x-cbz': 'cbz',
        'application/zip': 'zip',
    }

    for mime, ext in mime_map.items():
        if mime in content_type:
            return ext

    return None


def _apply_extension(path: Path, extension: Optional[str]) -> Path:
    """Rename the path to match the detected extension, if needed."""
    if not extension:
        return path

    suffix = extension if extension.startswith('.') else f'.{extension}'
    if path.suffix.lower() == suffix.lower():
        return path

    candidate = path.with_suffix(suffix)
    base_stem = path.stem
    counter = 1
    while candidate.exists() and counter < 100:
        candidate = path.with_name(f"{base_stem}({counter}){suffix}")
        counter += 1

    try:
        path.replace(candidate)
        return candidate
    except Exception:
        return path


def download_from_mirror(
    mirror_url: str,
    output_path: Path,
    *,
    log_info: LogFn = None,
    log_error: ErrorFn = None,
    session: Optional[requests.Session] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> Tuple[bool, Optional[Path]]:
    """Download a file from a LibGen mirror URL with optional progress tracking."""
    session = session or requests.Session()
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    try:
        _call(log_info, f"[download] Resolving download link from: {mirror_url}")

        download_url = _resolve_download_url(session, mirror_url, log_info)

        if not download_url:
            _call(log_error, "[download] Could not find direct download link")
            return False, None

        _call(log_info, f"[download] Downloading from: {download_url}")

        downloaded = 0
        total_size = 0
        headers: Dict[str, str] = {}

        with session.get(download_url, stream=True, timeout=60) as r:
            r.raise_for_status()
            # Normalize header names to lowercase so the lookups below (and in
            # _guess_filename_extension) match regardless of the server's casing.
            headers = {k.lower(): v for k, v in r.headers.items()}

            # Verify it's not HTML (an error page)
            ct = headers.get("content-type", "").lower()
            if "text/html" in ct:
                _call(log_error, "[download] Final URL returned HTML, not a file.")
                return False, None

            total_size = int(headers.get("content-length", 0) or 0)

            with open(output_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        downloaded += len(chunk)
                        if progress_callback:
                            progress_callback(downloaded, total_size)

        final_extension = _guess_filename_extension(download_url, headers)
        final_path = _apply_extension(output_path, final_extension)

        if progress_callback and total_size > 0:
            progress_callback(downloaded, total_size)

        _call(log_info, f"[download] Saved to {final_path}")
        return True, final_path

    except Exception as e:
        _call(log_error, f"[download] Download failed: {e}")
        return False, None
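
An illustrative end-to-end sketch of the helpers above (the output path is arbitrary; the extension is corrected after the download):

    from pathlib import Path
    from Provider.libgen_service import search_libgen, download_from_mirror

    hits = search_libgen("Morals and Dogma", limit=5, log_info=print)
    if hits:
        ok, saved = download_from_mirror(
            hits[0]["mirror_url"],
            Path("downloads/morals_and_dogma.bin"),
            log_info=print,
            log_error=print,
        )
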
94
Provider/matrix.py
Normal file
@@ -0,0 +1,94 @@
from __future__ import annotations

import mimetypes
from pathlib import Path
from typing import Any

import requests

from Provider._base import FileProvider


class Matrix(FileProvider):
    """File provider for Matrix (Element) chat rooms."""

    def validate(self) -> bool:
        if not self.config:
            return False
        matrix_conf = self.config.get("storage", {}).get("matrix", {})
        return bool(
            matrix_conf.get("homeserver")
            and matrix_conf.get("room_id")
            and (matrix_conf.get("access_token") or matrix_conf.get("password"))
        )

    def upload(self, file_path: str, **kwargs: Any) -> str:
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        matrix_conf = self.config.get("storage", {}).get("matrix", {})
        homeserver = matrix_conf.get("homeserver")
        access_token = matrix_conf.get("access_token")
        room_id = matrix_conf.get("room_id")

        if not homeserver:
            raise Exception("Matrix homeserver missing")
        if not access_token:
            raise Exception("Matrix access_token missing")
        if not room_id:
            raise Exception("Matrix room_id missing")

        if not homeserver.startswith("http"):
            homeserver = f"https://{homeserver}"

        # Upload media
        upload_url = f"{homeserver}/_matrix/media/v3/upload"
        headers = {
            "Authorization": f"Bearer {access_token}",
            "Content-Type": "application/octet-stream",
        }

        mime_type, _ = mimetypes.guess_type(path)
        if mime_type:
            headers["Content-Type"] = mime_type

        filename = path.name

        with open(path, "rb") as handle:
            resp = requests.post(upload_url, headers=headers, data=handle, params={"filename": filename})

        if resp.status_code != 200:
            raise Exception(f"Matrix upload failed: {resp.text}")

        content_uri = resp.json().get("content_uri")
        if not content_uri:
            raise Exception("No content_uri returned")

        # Send message
        send_url = f"{homeserver}/_matrix/client/v3/rooms/{room_id}/send/m.room.message"

        # Determine the message type
        msgtype = "m.file"
        ext = path.suffix.lower()

        audio_exts = {".mp3", ".flac", ".wav", ".m4a", ".aac", ".ogg", ".opus", ".wma", ".mka", ".alac"}
        video_exts = {".mp4", ".mkv", ".webm", ".mov", ".avi", ".flv", ".mpg", ".mpeg", ".ts", ".m4v", ".wmv"}
        image_exts = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff"}

        if ext in audio_exts:
            msgtype = "m.audio"
        elif ext in video_exts:
            msgtype = "m.video"
        elif ext in image_exts:
            msgtype = "m.image"

        info = {"mimetype": mime_type, "size": path.stat().st_size}
        payload = {"msgtype": msgtype, "body": filename, "url": content_uri, "info": info}

        # Fresh headers for the JSON event send; reusing the upload headers would
        # carry the file's Content-Type instead of application/json.
        json_headers = {"Authorization": f"Bearer {access_token}"}
        resp = requests.post(send_url, headers=json_headers, json=payload)
        if resp.status_code != 200:
            raise Exception(f"Matrix send message failed: {resp.text}")

        event_id = resp.json().get("event_id")
        return f"https://matrix.to/#/{room_id}/{event_id}"
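
A usage sketch with placeholder credentials (the homeserver, token, and room below are hypothetical):

    from Provider.matrix import Matrix

    config = {
        "storage": {
            "matrix": {
                "homeserver": "matrix.example.org",  # hypothetical
                "access_token": "<access token>",    # hypothetical
                "room_id": "!room:example.org",      # hypothetical
            }
        }
    }
    uploader = Matrix(config)
    if uploader.validate():
        permalink = uploader.upload("song.flac")  # matrix.to link to the sent event
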
389
Provider/metadata_provider.py
Normal file
@@ -0,0 +1,389 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Type
import requests
import sys

from SYS.logger import log, debug

try:  # Optional dependency
    import musicbrainzngs  # type: ignore
except ImportError:  # pragma: no cover - optional
    musicbrainzngs = None


class MetadataProvider(ABC):
    """Base class for metadata providers (music, movies, books, etc.)."""

    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
        self.config = config or {}

    @property
    def name(self) -> str:
        return self.__class__.__name__.replace("Provider", "").lower()

    @abstractmethod
    def search(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
        """Return a list of candidate metadata records."""

    def to_tags(self, item: Dict[str, Any]) -> List[str]:
        """Convert a result item into a list of tags."""
        tags: List[str] = []
        title = item.get("title")
        artist = item.get("artist")
        album = item.get("album")
        year = item.get("year")

        if title:
            tags.append(f"title:{title}")
        if artist:
            tags.append(f"artist:{artist}")
        if album:
            tags.append(f"album:{album}")
        if year:
            tags.append(f"year:{year}")

        tags.append(f"source:{self.name}")
        return tags


class ITunesProvider(MetadataProvider):
    """Metadata provider using the iTunes Search API."""

    def search(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
        params = {"term": query, "media": "music", "entity": "song", "limit": limit}
        try:
            resp = requests.get("https://itunes.apple.com/search", params=params, timeout=10)
            resp.raise_for_status()
            results = resp.json().get("results", [])
        except Exception as exc:
            log(f"iTunes search failed: {exc}", file=sys.stderr)
            return []

        items: List[Dict[str, Any]] = []
        for r in results:
            item = {
                "title": r.get("trackName"),
                "artist": r.get("artistName"),
                "album": r.get("collectionName"),
                "year": str(r.get("releaseDate", ""))[:4],
                "provider": self.name,
                "raw": r,
            }
            items.append(item)
        debug(f"iTunes returned {len(items)} items for '{query}'")
        return items


class OpenLibraryMetadataProvider(MetadataProvider):
    """Metadata provider for OpenLibrary book metadata."""

    @property
    def name(self) -> str:  # type: ignore[override]
        return "openlibrary"

    def search(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
        query_clean = (query or "").strip()
        if not query_clean:
            return []

        try:
            # Prefer ISBN-specific search when the query looks like one
            if query_clean.replace("-", "").isdigit() and len(query_clean.replace("-", "")) in (10, 13):
                q = f"isbn:{query_clean.replace('-', '')}"
            else:
                q = query_clean

            resp = requests.get(
                "https://openlibrary.org/search.json",
                params={"q": q, "limit": limit},
                timeout=10,
            )
            resp.raise_for_status()
            data = resp.json()
        except Exception as exc:
            log(f"OpenLibrary search failed: {exc}", file=sys.stderr)
            return []

        items: List[Dict[str, Any]] = []
        for doc in data.get("docs", [])[:limit]:
            authors = doc.get("author_name") or []
            publisher = ""
            publishers = doc.get("publisher") or []
            if isinstance(publishers, list) and publishers:
                publisher = publishers[0]

            # Prefer a 13-digit ISBN when available, otherwise 10-digit
            isbn_list = doc.get("isbn") or []
            isbn_13 = next((i for i in isbn_list if len(str(i)) == 13), None)
            isbn_10 = next((i for i in isbn_list if len(str(i)) == 10), None)

            # Derive the OLID from the key
            olid = ""
            key = doc.get("key", "")
            if isinstance(key, str) and key:
                olid = key.split("/")[-1]

            items.append({
                "title": doc.get("title") or "",
                "artist": ", ".join(authors) if authors else "",
                "album": publisher,
                "year": str(doc.get("first_publish_year") or ""),
                "provider": self.name,
                "authors": authors,
                "publisher": publisher,
                "identifiers": {
                    "isbn_13": isbn_13,
                    "isbn_10": isbn_10,
                    "openlibrary": olid,
                    "oclc": (doc.get("oclc_numbers") or [None])[0],
                    "lccn": (doc.get("lccn") or [None])[0],
                },
                "description": None,
            })

        return items

    def to_tags(self, item: Dict[str, Any]) -> List[str]:
        tags: List[str] = []
        title = item.get("title")
        authors = item.get("authors") or []
        publisher = item.get("publisher")
        year = item.get("year")
        description = item.get("description") or ""

        if title:
            tags.append(f"title:{title}")
        for author in authors:
            if author:
                tags.append(f"author:{author}")
        if publisher:
            tags.append(f"publisher:{publisher}")
        if year:
            tags.append(f"year:{year}")
        if description:
            tags.append(f"description:{description[:200]}")

        identifiers = item.get("identifiers") or {}
        for key, value in identifiers.items():
            if value:
                tags.append(f"{key}:{value}")

        tags.append(f"source:{self.name}")
        return tags


class GoogleBooksMetadataProvider(MetadataProvider):
    """Metadata provider for the Google Books volumes API."""

    @property
    def name(self) -> str:  # type: ignore[override]
        return "googlebooks"

    def search(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
        query_clean = (query or "").strip()
        if not query_clean:
            return []

        # Prefer ISBN queries when possible
        if query_clean.replace("-", "").isdigit() and len(query_clean.replace("-", "")) in (10, 13):
            q = f"isbn:{query_clean.replace('-', '')}"
        else:
            q = query_clean

        try:
            resp = requests.get(
                "https://www.googleapis.com/books/v1/volumes",
                params={"q": q, "maxResults": limit},
                timeout=10,
            )
            resp.raise_for_status()
            payload = resp.json()
        except Exception as exc:
            log(f"Google Books search failed: {exc}", file=sys.stderr)
            return []

        items: List[Dict[str, Any]] = []
        for volume in payload.get("items", [])[:limit]:
            info = volume.get("volumeInfo") or {}
            authors = info.get("authors") or []
            publisher = info.get("publisher", "")
            published_date = info.get("publishedDate", "")
            year = str(published_date)[:4] if published_date else ""

            identifiers_raw = info.get("industryIdentifiers") or []
            identifiers: Dict[str, Optional[str]] = {"googlebooks": volume.get("id")}
            for ident in identifiers_raw:
                if not isinstance(ident, dict):
                    continue
                ident_type = ident.get("type", "").lower()
                ident_value = ident.get("identifier")
                if not ident_value:
                    continue
                if ident_type == "isbn_13":
                    identifiers.setdefault("isbn_13", ident_value)
                elif ident_type == "isbn_10":
                    identifiers.setdefault("isbn_10", ident_value)
                else:
                    identifiers.setdefault(ident_type, ident_value)

            items.append({
                "title": info.get("title") or "",
                "artist": ", ".join(authors) if authors else "",
                "album": publisher,
                "year": year,
                "provider": self.name,
                "authors": authors,
                "publisher": publisher,
                "identifiers": identifiers,
                "description": info.get("description", ""),
            })

        return items

    def to_tags(self, item: Dict[str, Any]) -> List[str]:
        tags: List[str] = []
        title = item.get("title")
        authors = item.get("authors") or []
        publisher = item.get("publisher")
        year = item.get("year")
        description = item.get("description") or ""

        if title:
            tags.append(f"title:{title}")
        for author in authors:
            if author:
                tags.append(f"author:{author}")
        if publisher:
            tags.append(f"publisher:{publisher}")
        if year:
            tags.append(f"year:{year}")
        if description:
            tags.append(f"description:{description[:200]}")

        identifiers = item.get("identifiers") or {}
        for key, value in identifiers.items():
            if value:
                tags.append(f"{key}:{value}")

        tags.append(f"source:{self.name}")
        return tags


class MusicBrainzMetadataProvider(MetadataProvider):
    """Metadata provider for MusicBrainz recordings."""

    @property
    def name(self) -> str:  # type: ignore[override]
        return "musicbrainz"

    def search(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
        if not musicbrainzngs:
            log("musicbrainzngs is not installed; skipping MusicBrainz search", file=sys.stderr)
            return []

        q = (query or "").strip()
        if not q:
            return []

        try:
            # Ensure a user agent is set (required by MusicBrainz)
            musicbrainzngs.set_useragent("Medeia-Macina", "0.1")
        except Exception:
            pass

        try:
            resp = musicbrainzngs.search_recordings(query=q, limit=limit)
            recordings = resp.get("recording-list") or resp.get("recordings") or []
        except Exception as exc:
            log(f"MusicBrainz search failed: {exc}", file=sys.stderr)
            return []

        items: List[Dict[str, Any]] = []
        for rec in recordings[:limit]:
            if not isinstance(rec, dict):
                continue
            title = rec.get("title") or ""

            artist = ""
            artist_credit = rec.get("artist-credit") or rec.get("artist_credit")
            if isinstance(artist_credit, list) and artist_credit:
                first = artist_credit[0]
                if isinstance(first, dict):
                    artist = first.get("name") or first.get("artist", {}).get("name", "")
                elif isinstance(first, str):
                    artist = first

            album = ""
            release_list = rec.get("release-list") or rec.get("releases") or rec.get("release")
            if isinstance(release_list, list) and release_list:
                first_rel = release_list[0]
                if isinstance(first_rel, dict):
                    album = first_rel.get("title", "") or ""
                    release_date = first_rel.get("date") or ""
                else:
                    album = str(first_rel)
                    release_date = ""
            else:
                release_date = rec.get("first-release-date") or ""

            year = str(release_date)[:4] if release_date else ""
            mbid = rec.get("id") or ""

            items.append({
                "title": title,
                "artist": artist,
                "album": album,
                "year": year,
                "provider": self.name,
                "mbid": mbid,
                "raw": rec,
            })

        return items

    def to_tags(self, item: Dict[str, Any]) -> List[str]:
        tags = super().to_tags(item)
        mbid = item.get("mbid")
        if mbid:
            tags.append(f"musicbrainz:{mbid}")
        return tags


# Registry ---------------------------------------------------------------

_METADATA_PROVIDERS: Dict[str, Type[MetadataProvider]] = {
    "itunes": ITunesProvider,
    "openlibrary": OpenLibraryMetadataProvider,
    "googlebooks": GoogleBooksMetadataProvider,
    "google": GoogleBooksMetadataProvider,
    "musicbrainz": MusicBrainzMetadataProvider,
}


def register_provider(name: str, provider_cls: Type[MetadataProvider]) -> None:
    _METADATA_PROVIDERS[name.lower()] = provider_cls


def list_metadata_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str, bool]:
    availability: Dict[str, bool] = {}
    for name, cls in _METADATA_PROVIDERS.items():
        try:
            cls(config)  # Basic availability check: the constructor must succeed
            availability[name] = True
        except Exception:
            availability[name] = False
    return availability


def get_metadata_provider(name: str, config: Optional[Dict[str, Any]] = None) -> Optional[MetadataProvider]:
    cls = _METADATA_PROVIDERS.get(name.lower())
    if not cls:
        return None
    try:
        return cls(config)
    except Exception as exc:
        log(f"Provider init failed for '{name}': {exc}", file=sys.stderr)
        return None
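
A short sketch of the registry functions above (the ISBN is the same example value used elsewhere in this commit):

    from Provider.metadata_provider import get_metadata_provider

    provider = get_metadata_provider("openlibrary")
    if provider:
        for item in provider.search("0557677203", limit=3):
            print(provider.to_tags(item))
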
159
Provider/query_parser.py
Normal file
@@ -0,0 +1,159 @@
"""Dynamic query parser for filtering and field extraction.

Supports query syntax like:
- isbn:0557677203
- author:"Albert Pike"
- title:"Morals and Dogma"
- year:2010
- isbn:0557677203 author:"Albert Pike"
- Mixed with free text: "Morals" isbn:0557677203

This allows flexible query strings that can be parsed by any search provider
to extract specific fields for filtering and searching.
"""

from typing import Dict, List, Tuple, Optional, Any
import re


def parse_query(query: str) -> Dict[str, Any]:
    """Parse a query string into field:value pairs and free text.

    Args:
        query: Query string like 'isbn:0557677203 author:"Albert Pike" Morals'

    Returns:
        Dictionary with:
        - 'fields': Dict[field_name, field_value] for structured fields
        - 'text': str with remaining free text
        - 'raw': str original query
    """
    result = {
        'fields': {},
        'text': '',
        'raw': query,
    }

    if not query or not query.strip():
        return result

    query = query.strip()
    remaining_parts = []

    # Pattern to match: field:value or field:"quoted value"
    # Matches: word: followed by either a quoted string or an unquoted word
    pattern = r'(\w+):(?:"([^"]*)"|(\S+))'

    pos = 0
    for match in re.finditer(pattern, query):
        # Add any text before this match
        if match.start() > pos:
            before_text = query[pos:match.start()].strip()
            if before_text:
                remaining_parts.append(before_text)

        field_name = match.group(1).lower()
        field_value = match.group(2) if match.group(2) is not None else match.group(3)

        result['fields'][field_name] = field_value
        pos = match.end()

    # Add any remaining text after the last match
    if pos < len(query):
        remaining_text = query[pos:].strip()
        if remaining_text:
            remaining_parts.append(remaining_text)

    result['text'] = ' '.join(remaining_parts)

    return result


def get_field(parsed_query: Dict[str, Any], field_name: str, default: Optional[str] = None) -> Optional[str]:
    """Get a field value from a parsed query, with an optional default.

    Args:
        parsed_query: Result from parse_query()
        field_name: Field name to look up (case-insensitive)
        default: Default value if the field is not found

    Returns:
        Field value or default
    """
    return parsed_query.get('fields', {}).get(field_name.lower(), default)


def has_field(parsed_query: Dict[str, Any], field_name: str) -> bool:
    """Check if a field exists in a parsed query.

    Args:
        parsed_query: Result from parse_query()
        field_name: Field name to check (case-insensitive)

    Returns:
        True if the field exists
    """
    return field_name.lower() in parsed_query.get('fields', {})


def get_free_text(parsed_query: Dict[str, Any]) -> str:
    """Get the free-text portion of a parsed query.

    Args:
        parsed_query: Result from parse_query()

    Returns:
        Free text or empty string
    """
    return parsed_query.get('text', '')


def build_query_for_provider(
    parsed_query: Dict[str, Any],
    provider: str,
    extraction_map: Optional[Dict[str, str]] = None
) -> Tuple[str, Dict[str, str]]:
    """Build a search query and filters dict for a specific provider.

    Different providers have different search syntax. This function
    extracts the appropriate fields for each provider.

    Args:
        parsed_query: Result from parse_query()
        provider: Provider name ('libgen', 'openlibrary', 'soulseek')
        extraction_map: Optional mapping of field names to provider-specific names,
            e.g. {'isbn': 'isbn', 'author': 'author', 'title': 'title'}

    Returns:
        Tuple of (search_query: str, extracted_fields: Dict[field, value])
    """
    extraction_map = extraction_map or {}
    extracted = {}
    free_text = get_free_text(parsed_query)

    # Extract fields based on the map
    for field_name, provider_key in extraction_map.items():
        if has_field(parsed_query, field_name):
            extracted[provider_key] = get_field(parsed_query, field_name)

    # If provider-specific extraction is needed, providers can implement it.
    # For now, return the free text as the query.
    return free_text, extracted


if __name__ == '__main__':
    # Test cases
    test_queries = [
        'isbn:0557677203',
        'isbn:0557677203 author:"Albert Pike"',
        'Morals and Dogma isbn:0557677203',
        'title:"Morals and Dogma" author:"Albert Pike" year:2010',
        'search term without fields',
        'author:"John Smith" title:"A Book"',
    ]

    for query in test_queries:
        print(f"\nQuery: {query}")
        parsed = parse_query(query)
        print(f"  Fields: {parsed['fields']}")
        print(f"  Text: {parsed['text']}")
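
To complement the __main__ demo, a sketch of build_query_for_provider with an extraction map:

    parsed = parse_query('isbn:0557677203 author:"Albert Pike" Morals')
    query, fields = build_query_for_provider(
        parsed,
        "libgen",
        extraction_map={"isbn": "isbn", "author": "author"},
    )
    # query == "Morals"
    # fields == {"isbn": "0557677203", "author": "Albert Pike"}
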
110
Provider/registry.py
Normal file
@@ -0,0 +1,110 @@
"""Provider registry.

Concrete provider implementations live in the `Provider/` package.
This module is the single source of truth for provider discovery.
"""

from __future__ import annotations

from typing import Any, Dict, Optional, Type
import sys

from SYS.logger import log

from Provider._base import FileProvider, SearchProvider, SearchResult
from Provider.bandcamp import Bandcamp
from Provider.libgen import Libgen
from Provider.matrix import Matrix
from Provider.soulseek import Soulseek, download_soulseek_file
from Provider.youtube import YouTube
from Provider.zeroxzero import ZeroXZero


_SEARCH_PROVIDERS: Dict[str, Type[SearchProvider]] = {
    "libgen": Libgen,
    "soulseek": Soulseek,
    "bandcamp": Bandcamp,
    "youtube": YouTube,
}


def get_search_provider(name: str, config: Optional[Dict[str, Any]] = None) -> Optional[SearchProvider]:
    """Get a search provider by name."""
    provider_class = _SEARCH_PROVIDERS.get((name or "").lower())
    if provider_class is None:
        log(f"[provider] Unknown search provider: {name}", file=sys.stderr)
        return None

    try:
        provider = provider_class(config)
        if not provider.validate():
            log(f"[provider] Provider '{name}' is not available", file=sys.stderr)
            return None
        return provider
    except Exception as exc:
        log(f"[provider] Error initializing '{name}': {exc}", file=sys.stderr)
        return None


def list_search_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str, bool]:
    """List all search providers and their availability."""
    availability: Dict[str, bool] = {}
    for name, provider_class in _SEARCH_PROVIDERS.items():
        try:
            provider = provider_class(config)
            availability[name] = provider.validate()
        except Exception:
            availability[name] = False
    return availability


_FILE_PROVIDERS: Dict[str, Type[FileProvider]] = {
    "0x0": ZeroXZero,
    "matrix": Matrix,
}


def get_file_provider(name: str, config: Optional[Dict[str, Any]] = None) -> Optional[FileProvider]:
    """Get a file provider by name."""
    provider_class = _FILE_PROVIDERS.get((name or "").lower())
    if provider_class is None:
        log(f"[provider] Unknown file provider: {name}", file=sys.stderr)
        return None

    try:
        provider = provider_class(config)
        if not provider.validate():
            log(f"[provider] File provider '{name}' is not available", file=sys.stderr)
            return None
        return provider
    except Exception as exc:
        log(f"[provider] Error initializing file provider '{name}': {exc}", file=sys.stderr)
        return None


def list_file_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str, bool]:
    """List all file providers and their availability."""
    availability: Dict[str, bool] = {}
    for name, provider_class in _FILE_PROVIDERS.items():
        try:
            provider = provider_class(config)
            availability[name] = provider.validate()
        except Exception:
            availability[name] = False
    return availability


__all__ = [
    "SearchResult",
    "SearchProvider",
    "FileProvider",
    "get_search_provider",
    "list_search_providers",
    "get_file_provider",
    "list_file_providers",
    "download_soulseek_file",
]
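
And the file-provider side, mirroring the search-provider helpers (the local file name is hypothetical):

    from Provider.registry import get_file_provider, list_file_providers

    print(list_file_providers())  # e.g. {"0x0": True, "matrix": False}
    uploader = get_file_provider("0x0")
    if uploader is not None:
        url = uploader.upload("notes.pdf")
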
380
Provider/soulseek.py
Normal file
380
Provider/soulseek.py
Normal file
@@ -0,0 +1,380 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from Provider._base import SearchProvider, SearchResult
|
||||
from SYS.logger import log, debug
|
||||
|
||||
|
||||
class Soulseek(SearchProvider):
|
||||
"""Search provider for Soulseek P2P network."""
|
||||
|
||||
MUSIC_EXTENSIONS = {
|
||||
".flac",
|
||||
".mp3",
|
||||
".m4a",
|
||||
".aac",
|
||||
".ogg",
|
||||
".opus",
|
||||
".wav",
|
||||
".alac",
|
||||
".wma",
|
||||
".ape",
|
||||
".aiff",
|
||||
".dsf",
|
||||
".dff",
|
||||
".wv",
|
||||
".tta",
|
||||
".tak",
|
||||
".ac3",
|
||||
".dts",
|
||||
}
|
||||
|
||||
# NOTE: These defaults preserve existing behavior.
|
||||
USERNAME = "asjhkjljhkjfdsd334"
|
||||
PASSWORD = "khhhg"
|
||||
DOWNLOAD_DIR = "./downloads"
|
||||
MAX_WAIT_TRANSFER = 1200
|
||||
|
||||
def download(self, result: SearchResult, output_dir: Path) -> Optional[Path]:
|
||||
"""Download file from Soulseek."""
|
||||
|
||||
try:
|
||||
full_metadata = result.full_metadata or {}
|
||||
username = full_metadata.get("username")
|
||||
filename = full_metadata.get("filename") or result.path
|
||||
|
||||
if not username or not filename:
|
||||
log(f"[soulseek] Missing metadata for download: {result.title}", file=sys.stderr)
|
||||
return None
|
||||
|
||||
# This cmdlet stack is synchronous; use asyncio.run for clarity.
|
||||
return asyncio.run(
|
||||
download_soulseek_file(
|
||||
username=username,
|
||||
filename=filename,
|
||||
output_dir=output_dir,
|
||||
timeout=self.MAX_WAIT_TRANSFER,
|
||||
)
|
||||
)
|
||||
|
||||
except RuntimeError:
|
||||
# If we're already inside an event loop (e.g., TUI), fall back to a
|
||||
# dedicated loop in this thread.
|
||||
loop = asyncio.new_event_loop()
|
||||
try:
|
||||
asyncio.set_event_loop(loop)
|
||||
return loop.run_until_complete(
|
||||
download_soulseek_file(
|
||||
username=username,
|
||||
filename=filename,
|
||||
output_dir=output_dir,
|
||||
timeout=self.MAX_WAIT_TRANSFER,
|
||||
)
|
||||
)
|
||||
finally:
|
||||
try:
|
||||
loop.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
except Exception as exc:
|
||||
log(f"[soulseek] Download error: {exc}", file=sys.stderr)
|
||||
return None
|
||||
|
||||
async def perform_search(self, query: str, timeout: float = 9.0, limit: int = 50) -> List[Dict[str, Any]]:
|
||||
"""Perform async Soulseek search."""
|
||||
|
||||
import os
|
||||
from aioslsk.client import SoulSeekClient
|
||||
from aioslsk.settings import CredentialsSettings, Settings
|
||||
|
||||
os.makedirs(self.DOWNLOAD_DIR, exist_ok=True)
|
||||
|
||||
settings = Settings(credentials=CredentialsSettings(username=self.USERNAME, password=self.PASSWORD))
|
||||
client = SoulSeekClient(settings)
|
||||
|
||||
try:
|
||||
await client.start()
|
||||
await client.login()
|
||||
except Exception as exc:
|
||||
log(f"[soulseek] Login failed: {type(exc).__name__}: {exc}", file=sys.stderr)
|
||||
return []
|
||||
|
||||
try:
|
||||
search_request = await client.searches.search(query)
|
||||
await self._collect_results(search_request, timeout=timeout)
|
||||
return self._flatten_results(search_request)[:limit]
|
||||
except Exception as exc:
|
||||
log(f"[soulseek] Search error: {type(exc).__name__}: {exc}", file=sys.stderr)
|
||||
return []
|
||||
finally:
|
||||
try:
|
||||
await client.stop()
|
||||
except Exception:
|
||||
pass

    def _flatten_results(self, search_request: Any) -> List[dict]:
        flat: List[dict] = []
        for result in getattr(search_request, "results", []):
            username = getattr(result, "username", "?")

            # Open and locked shares both flatten to the same record shape.
            for file_data in getattr(result, "shared_items", []):
                flat.append(
                    {
                        "file": file_data,
                        "username": username,
                        "filename": getattr(file_data, "filename", "?"),
                        "size": getattr(file_data, "filesize", 0),
                    }
                )

            for file_data in getattr(result, "locked_results", []):
                flat.append(
                    {
                        "file": file_data,
                        "username": username,
                        "filename": getattr(file_data, "filename", "?"),
                        "size": getattr(file_data, "filesize", 0),
                    }
                )

        return flat

    async def _collect_results(self, search_request: Any, timeout: float = 75.0) -> None:
        end = time.time() + timeout
        last_count = 0
        while time.time() < end:
            current_count = len(getattr(search_request, "results", []))
            if current_count > last_count:
                debug(f"[soulseek] Got {current_count} result(s)...")
                last_count = current_count
            await asyncio.sleep(0.5)

    def search(
        self,
        query: str,
        limit: int = 50,
        filters: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[SearchResult]:
        filters = filters or {}

        try:
            flat_results = asyncio.run(self.perform_search(query, timeout=9.0, limit=limit))
            if not flat_results:
                return []

            music_results: List[dict] = []
            for item in flat_results:
                filename = item["filename"]
                ext = ("." + filename.rsplit(".", 1)[-1].lower()) if "." in filename else ""
                if ext in self.MUSIC_EXTENSIONS:
                    music_results.append(item)

            if not music_results:
                return []

            enriched_results: List[dict] = []
            for item in music_results:
                filename = item["filename"]
                ext = ("." + filename.rsplit(".", 1)[-1].lower()) if "." in filename else ""

                # Soulseek paths usually look like ".../Artist/Album/01 - Title.flac".
                display_name = filename.replace("\\", "/").split("/")[-1]
                path_parts = filename.replace("\\", "/").split("/")
                artist = path_parts[-3] if len(path_parts) >= 3 else ""
                album = path_parts[-2] if len(path_parts) >= 2 else ""

                base_name = display_name.rsplit(".", 1)[0] if "." in display_name else display_name
                track_num = ""
                title = base_name
                filename_artist = ""

                # Pull a leading track number, e.g. "01 - Artist - Title" or "01. Title".
                match = re.match(r"^(\d{1,3})\s*[\.\-]?\s+(.+)$", base_name)
                if match:
                    track_num = match.group(1)
                    rest = match.group(2)
                    if " - " in rest:
                        filename_artist, title = rest.split(" - ", 1)
                    else:
                        title = rest

                if filename_artist:
                    artist = filename_artist

                enriched_results.append(
                    {
                        **item,
                        "artist": artist,
                        "album": album,
                        "title": title,
                        "track_num": track_num,
                        "ext": ext,
                    }
                )

            if filters:
                artist_filter = (filters.get("artist", "") or "").lower()
                album_filter = (filters.get("album", "") or "").lower()
                track_filter = (filters.get("track", "") or "").lower()

                if artist_filter or album_filter or track_filter:
                    filtered: List[dict] = []
                    for item in enriched_results:
                        if artist_filter and artist_filter not in item["artist"].lower():
                            continue
                        if album_filter and album_filter not in item["album"].lower():
                            continue
                        if track_filter and track_filter not in item["title"].lower():
                            continue
                        filtered.append(item)
                    enriched_results = filtered

            # Prefer lossless first, then larger files.
            enriched_results.sort(key=lambda item: (item["ext"].lower() != ".flac", -item["size"]))

            results: List[SearchResult] = []
            for item in enriched_results:
                artist_display = item["artist"] if item["artist"] else "(no artist)"
                album_display = item["album"] if item["album"] else "(no album)"
                size_mb = int(item["size"] / 1024 / 1024)

                columns = [
                    ("Track", item["track_num"] or "?"),
                    ("Title", item["title"][:40]),
                    ("Artist", artist_display[:32]),
                    ("Album", album_display[:32]),
                    ("Size", f"{size_mb} MB"),
                ]

                results.append(
                    SearchResult(
                        table="soulseek",
                        title=item["title"],
                        path=item["filename"],
                        detail=f"{artist_display} - {album_display}",
                        annotations=[f"{size_mb} MB", item["ext"].lstrip(".").upper()],
                        media_kind="audio",
                        size_bytes=item["size"],
                        columns=columns,
                        full_metadata={
                            "username": item["username"],
                            "filename": item["filename"],
                            "artist": item["artist"],
                            "album": item["album"],
                            "track_num": item["track_num"],
                            "ext": item["ext"],
                        },
                    )
                )

            return results

        except Exception as exc:
            log(f"[soulseek] Search error: {exc}", file=sys.stderr)
            return []

    def validate(self) -> bool:
        try:
            from aioslsk.client import SoulSeekClient  # noqa: F401

            return True
        except ImportError:
            return False


async def download_soulseek_file(
    username: str,
    filename: str,
    output_dir: Path = Path("./downloads"),
    timeout: int = 1200,
) -> Optional[Path]:
    """Download a file from a Soulseek peer."""

    try:
        from aioslsk.client import SoulSeekClient
        from aioslsk.settings import CredentialsSettings, Settings
        from aioslsk.transfer.model import Transfer, TransferDirection
        from aioslsk.transfer.state import TransferState

        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        local_filename = filename.replace("\\", "/").split("/")[-1]
        output_user_dir = output_dir / username
        output_user_dir.mkdir(parents=True, exist_ok=True)
        output_path = output_user_dir / local_filename

        # Avoid clobbering an existing file: append _1, _2, ... to the stem.
        if output_path.exists():
            base = output_path.stem
            ext = output_path.suffix
            counter = 1
            while output_path.exists():
                output_path = output_user_dir / f"{base}_{counter}{ext}"
                counter += 1

        output_path = output_path.resolve()

        settings = Settings(credentials=CredentialsSettings(username=Soulseek.USERNAME, password=Soulseek.PASSWORD))
        client = SoulSeekClient(settings)

        try:
            await client.start()
            await client.login()
            debug(f"[soulseek] Logged in as {Soulseek.USERNAME}")

            debug(f"[soulseek] Requesting download from {username}: {filename}")

            transfer = await client.transfers.add(Transfer(username, filename, TransferDirection.DOWNLOAD))
            transfer.local_path = str(output_path)
            await client.transfers.queue(transfer)

            start_time = time.time()
            last_log_time = 0.0
            while not transfer.is_finalized():
                if time.time() - start_time > timeout:
                    log(f"[soulseek] Download timeout after {timeout}s", file=sys.stderr)
                    return None

                if time.time() - last_log_time >= 5.0 and transfer.bytes_transfered > 0:
                    progress = (transfer.bytes_transfered / transfer.filesize * 100) if transfer.filesize else 0
                    debug(
                        f"[soulseek] Progress: {progress:.1f}% "
                        f"({transfer.bytes_transfered}/{transfer.filesize})"
                    )
                    last_log_time = time.time()

                await asyncio.sleep(1)

            if transfer.state.VALUE == TransferState.COMPLETE and transfer.local_path:
                downloaded_path = Path(transfer.local_path)
                if downloaded_path.exists():
                    debug(f"[soulseek] Download complete: {downloaded_path}")
                    return downloaded_path

                log(f"[soulseek] Transfer completed but file missing: {downloaded_path}", file=sys.stderr)
                return None

            log(
                f"[soulseek] Download failed: state={transfer.state.VALUE} "
                f"bytes={transfer.bytes_transfered}/{transfer.filesize}",
                file=sys.stderr,
            )
            return None

        finally:
            try:
                await client.stop()
            except Exception:
                pass

    except ImportError:
        log("[soulseek] aioslsk not installed. Install with: pip install aioslsk", file=sys.stderr)
        return None
    except Exception as exc:
        log(f"[soulseek] Download failed: {type(exc).__name__}: {exc}", file=sys.stderr)
        return None
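
# Example (illustrative): fetching the first search hit end to end. The call
# signatures are the ones defined above; the query and paths are made up.
#
#     provider = Soulseek()
#     if provider.validate():
#         hits = provider.search("boards of canada roygbiv", limit=10)
#         if hits:
#             saved = provider.download(hits[0], Path("./downloads"))
#             print(saved)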
707
Provider/unified_book_downloader.py
Normal file
@@ -0,0 +1,707 @@
"""Unified book downloader - handles Archive.org borrowing and Libgen fallback.

This module provides a single interface for downloading books from multiple sources:
1. Try an Archive.org direct download (if available)
2. Try Archive.org borrowing (if the user has credentials)
3. Fall back to a Libgen search by ISBN
4. Attempt the Libgen download

All sources are integrated with metadata scraping and error handling.
"""

import asyncio
import logging
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple, cast

import requests

from SYS.logger import debug

logger = logging.getLogger(__name__)


class UnifiedBookDownloader:
    """Unified interface for downloading books from multiple sources."""

    def __init__(self, config: Optional[Dict[str, Any]] = None, output_dir: Optional[str] = None):
        """Initialize the unified book downloader.

        Args:
            config: Configuration dict with credentials
            output_dir: Default output directory
        """
        self.config = config or {}
        self.output_dir = output_dir
        self.session = requests.Session()

        # Import download functions from their modules
        self._init_downloaders()

    def _init_downloaders(self) -> None:
        """Initialize downloader functions from their modules."""
        try:
            from API.archive_client import (
                check_direct_download,
                get_openlibrary_by_isbn,
                loan,
            )

            self.check_direct_download = check_direct_download
            self.get_openlibrary_by_isbn = get_openlibrary_by_isbn
            self.loan_func = loan
            logger.debug("[UnifiedBookDownloader] Loaded archive.org downloaders from archive_client")
        except Exception as e:
            logger.warning(f"[UnifiedBookDownloader] Failed to load archive.org functions: {e}")
            self.check_direct_download = None
            self.get_openlibrary_by_isbn = None
            self.loan_func = None

        try:
            from Provider.libgen_service import (
                DEFAULT_LIMIT as _LIBGEN_DEFAULT_LIMIT,
                download_from_mirror as _libgen_download,
                search_libgen as _libgen_search,
            )

            def _log_info(message: str) -> None:
                debug(f"[UnifiedBookDownloader] {message}")

            def _log_error(message: str) -> None:
                logger.error(f"[UnifiedBookDownloader] {message}")

            self.search_libgen = lambda query, limit=_LIBGEN_DEFAULT_LIMIT: _libgen_search(
                query,
                limit=limit,
                log_info=_log_info,
                log_error=_log_error,
            )
            self.download_from_mirror = lambda mirror_url, output_path: _libgen_download(
                mirror_url,
                output_path,
                log_info=_log_info,
                log_error=_log_error,
            )
            logger.debug("[UnifiedBookDownloader] Loaded Libgen helpers")
        except Exception as e:
            logger.warning(f"[UnifiedBookDownloader] Failed to load Libgen helpers: {e}")
            self.search_libgen = None
            self.download_from_mirror = None

    def get_download_options(self, book_data: Dict[str, Any]) -> Dict[str, Any]:
        """Get all available download options for a book.

        Checks in priority order:
        1. Archive.org direct download (public domain)
        2. Archive.org borrowing (if credentials are available and the book is borrowable)
        3. Libgen fallback (by ISBN)

        Args:
            book_data: Book metadata dict with at least 'openlibrary_id' or 'isbn'

        Returns:
            Dict with available download methods and metadata
        """
        options = {
            'book_title': book_data.get('title', 'Unknown'),
            'book_author': book_data.get('author', 'Unknown'),
            'isbn': book_data.get('isbn', ''),
            'openlibrary_id': book_data.get('openlibrary_id', ''),
            'methods': [],  # Will be sorted by priority
            'metadata': {}
        }

        # Extract the numeric book ID from openlibrary_id
        # (e.g., OL8513721M -> 8513721, OL8513721W -> 8513721)
        ol_id = book_data.get('openlibrary_id', '')
        book_id = None

        if ol_id.startswith('OL') and len(ol_id) > 2:
            # Keep only the digits after the 'OL' prefix (drops the M/W suffix letter)
            book_id = ''.join(c for c in ol_id[2:] if c.isdigit())

        # PRIORITY 1: Check direct download (fastest, no auth needed)
        if self.check_direct_download:
            try:
                can_download, pdf_url = self.check_direct_download(book_id)
                if can_download:
                    options['methods'].append({
                        'type': 'archive.org_direct',
                        'label': 'Archive.org Direct Download',
                        'requires_auth': False,
                        'pdf_url': pdf_url,
                        'book_id': book_id,
                        'priority': 1  # Highest priority
                    })
                    logger.info(f"[UnifiedBookDownloader] Direct download available for {book_id}")
            except Exception as e:
                logger.debug(f"[UnifiedBookDownloader] Direct download check failed: {e}")

        # PRIORITY 2: Check the borrowing option (requires auth, 14-day loan).
        # First verify the book is actually lendable via the OpenLibrary API.
        if self._has_archive_credentials():
            is_lendable, status = self._check_book_lendable_status(ol_id)

            if is_lendable:
                options['methods'].append({
                    'type': 'archive.org_borrow',
                    'label': 'Archive.org Borrow',
                    'requires_auth': True,
                    'book_id': book_id,
                    'priority': 2  # Second priority
                })
                logger.info(f"[UnifiedBookDownloader] Borrow option available for {book_id} (status: {status})")
            else:
                logger.debug(f"[UnifiedBookDownloader] Borrow not available for {book_id} (status: {status})")

        # PRIORITY 3: Check the Libgen fallback (by ISBN or title, no auth needed)
        isbn = book_data.get('isbn', '')
        title = book_data.get('title', '')
        author = book_data.get('author', '')

        if self.search_libgen:
            # Libgen can be used if we have an ISBN OR a title (or both)
            if isbn or title:
                options['methods'].append({
                    'type': 'libgen',
                    'label': 'Libgen Search & Download',
                    'requires_auth': False,
                    'isbn': isbn,
                    'title': title,
                    'author': author,
                    'priority': 3  # Third priority (fallback)
                })
                logger.info(f"[UnifiedBookDownloader] Libgen fallback available (ISBN: {isbn if isbn else 'N/A'}, Title: {title})")

        # Sort by priority (lower number = higher priority, tried first)
        options['methods'].sort(key=lambda x: x.get('priority', 999))

        return options
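
    # Example (illustrative): the dict shape returned above, for a book that is
    # directly downloadable and also on Libgen. Values are made up.
    #
    #     {
    #         'book_title': 'Example Title',
    #         'book_author': 'Example Author',
    #         'isbn': '9780000000000',
    #         'openlibrary_id': 'OL8513721M',
    #         'methods': [
    #             {'type': 'archive.org_direct', 'priority': 1, ...},
    #             {'type': 'libgen', 'priority': 3, ...},
    #         ],
    #         'metadata': {},
    #     }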

    def _has_archive_credentials(self) -> bool:
        """Check if Archive.org credentials are available."""
        try:
            from API.archive_client import credential_openlibrary

            email, password = credential_openlibrary(self.config)
            return bool(email and password)
        except Exception:
            return False

    def _check_book_lendable_status(self, ol_id: str) -> Tuple[bool, Optional[str]]:
        """Check if a book is lendable via the OpenLibrary API.

        Queries: https://openlibrary.org/api/volumes/brief/json/OLID:{ol_id}
        Note: Only works with Edition IDs (OL...M), not Work IDs (OL...W)

        Args:
            ol_id: OpenLibrary ID (e.g., OL8513721M for an Edition or OL4801915W for a Work)

        Returns:
            Tuple of (is_lendable: bool, status_reason: Optional[str])
        """
        try:
            if not ol_id.startswith('OL'):
                return False, "Invalid OpenLibrary ID format"

            # Work IDs (ending in W) are abstract umbrella records, not specific
            # editions, so the Volumes API cannot answer for them.
            if ol_id.endswith('W'):
                logger.debug(f"[UnifiedBookDownloader] Work ID {ol_id} - skipping Volumes API (not lendable)")
                return False, "Work ID not supported by Volumes API (not a specific edition)"

            # Only Edition IDs (ending in M) can be queried
            if not ol_id.endswith('M'):
                logger.debug(f"[UnifiedBookDownloader] Unknown ID type {ol_id} (not M or W)")
                return False, "Invalid OpenLibrary ID type"

            url = f"https://openlibrary.org/api/volumes/brief/json/OLID:{ol_id}"
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            data = response.json()

            # An empty response means no records were found
            if not data:
                logger.debug(f"[UnifiedBookDownloader] Empty response for {ol_id}")
                return False, "No availability data found"

            # The response is keyed by OLID
            olid_key = f"OLID:{ol_id}"
            if olid_key not in data:
                logger.debug(f"[UnifiedBookDownloader] OLID key not found in response")
                return False, "No availability data found"

            olid_data = data[olid_key]

            # Check the items array for lending status
            if olid_data.get('items'):
                items = olid_data['items']

                # Check the first item for lending status
                first_item = items[0]

                # Handle both dict and string representations
                if isinstance(first_item, dict):
                    status = first_item.get('status', '')
                else:
                    status = str(first_item).lower()

                is_lendable = 'lendable' in str(status).lower()

                if is_lendable:
                    logger.info(f"[UnifiedBookDownloader] Book {ol_id} is lendable")
                    return True, "LENDABLE"

                status_str = str(status) if status else 'NOT_LENDABLE'
                logger.debug(f"[UnifiedBookDownloader] Book {ol_id} is not lendable (status: {status_str})")
                return False, status_str

            # No items array, or it is empty
            logger.debug(f"[UnifiedBookDownloader] No items found for {ol_id}")
            return False, "Not available for lending"

        except requests.exceptions.Timeout:
            logger.warning(f"[UnifiedBookDownloader] OpenLibrary API timeout for {ol_id}")
            return False, "API timeout"
        except Exception as e:
            logger.debug(f"[UnifiedBookDownloader] Failed to check lendable status for {ol_id}: {e}")
            return False, f"API error: {e}"

    async def download_book(self, method: Dict[str, Any], output_dir: Optional[str] = None) -> Tuple[bool, str]:
        """Download a book using the specified method.

        Args:
            method: Download method dict from get_download_options()
            output_dir: Directory to save the book

        Returns:
            Tuple of (success: bool, message: str)
        """
        output_dir = output_dir or self.output_dir or str(Path.home() / "Downloads")
        method_type = method.get('type', '')

        logger.info(f"[UnifiedBookDownloader] Starting download with method: {method_type}")

        try:
            if method_type == 'archive.org_direct':
                return await self._download_archive_direct(method, output_dir)

            elif method_type == 'archive.org_borrow':
                return await self._download_archive_borrow(method, output_dir)

            elif method_type == 'libgen':
                return await self._download_libgen(method, output_dir)

            else:
                return False, f"Unknown download method: {method_type}"

        except Exception as e:
            logger.error(f"[UnifiedBookDownloader] Download error: {e}", exc_info=True)
            return False, f"Download failed: {str(e)}"

    async def _download_archive_direct(self, method: Dict[str, Any], output_dir: str) -> Tuple[bool, str]:
        """Download directly from Archive.org."""
        try:
            pdf_url = method.get('pdf_url', '')
            book_id = method.get('book_id', '')

            if not pdf_url:
                return False, "No PDF URL available"

            # Determine the output filename
            filename = f"{book_id}.pdf"
            output_path = Path(output_dir) / filename

            logger.info(f"[UnifiedBookDownloader] Downloading PDF from: {pdf_url}")

            # Download in a worker thread to avoid blocking the event loop
            loop = asyncio.get_running_loop()
            success = await loop.run_in_executor(
                None,
                self._download_file,
                pdf_url,
                str(output_path)
            )

            if success:
                logger.info(f"[UnifiedBookDownloader] Successfully downloaded to: {output_path}")
                return True, f"Downloaded to: {output_path}"
            else:
                return False, "Failed to download PDF"

        except Exception as e:
            logger.error(f"[UnifiedBookDownloader] Archive direct download error: {e}")
            return False, f"Archive download failed: {str(e)}"

    async def _download_archive_borrow(self, method: Dict[str, Any], output_dir: str) -> Tuple[bool, str]:
        """Download via Archive.org borrowing (requires credentials).

        Process (follows the archive_client.py pattern):
        1. Log in to Archive.org with credentials
        2. Call the loan endpoint to borrow the book (14-day loan)
        3. Get book info (page links, metadata)
        4. Download all pages as images
        5. Merge the images into a PDF

        The loan function from archive_client.py handles:
        - Checking if the book needs borrowing (status 400 = "doesn't need to be borrowed")
        - Creating a borrow token for access
        - Handling borrow failures

        get_book_infos() extracts page links from the borrowed book viewer,
        download() fetches all pages using a thread pool, and img2pdf merges
        the pages into a searchable PDF.
        """
        try:
            from API.archive_client import credential_openlibrary

            book_id = method.get('book_id', '')

            # Get credentials
            email, password = credential_openlibrary(self.config)
            if not email or not password:
                return False, "Archive.org credentials not configured"

            logger.info(f"[UnifiedBookDownloader] Logging into Archive.org...")

            # Login and borrow in a worker thread (following the download_book.py pattern)
            loop = asyncio.get_running_loop()
            borrow_result = await loop.run_in_executor(
                None,
                self._archive_borrow_and_download,
                email,
                password,
                book_id,
                output_dir
            )

            if borrow_result and isinstance(borrow_result, tuple):
                success, filepath = borrow_result
                if success:
                    logger.info(f"[UnifiedBookDownloader] Borrow succeeded: {filepath}")
                    return True, filepath
                else:
                    logger.warning(f"[UnifiedBookDownloader] Borrow failed: {filepath}")
                    return False, filepath
            else:
                return False, "Failed to borrow book from Archive.org"

        except Exception as e:
            logger.error(f"[UnifiedBookDownloader] Archive borrow error: {e}")
            return False, f"Archive borrow failed: {str(e)}"

    async def _download_libgen(self, method: Dict[str, Any], output_dir: str) -> Tuple[bool, str]:
        """Download via Libgen search and download, with mirror fallback."""
        try:
            isbn = method.get('isbn', '')
            title = method.get('title', '')

            if not isbn and not title:
                return False, "Need ISBN or title for Libgen search"

            search_func = self.search_libgen
            if search_func is None:
                return False, "Libgen searcher not available"

            loop = asyncio.get_running_loop()

            # Use preloaded results when the caller supplies them; otherwise search.
            preloaded_results = method.get('results')
            results = list(preloaded_results) if preloaded_results else []
            if not results:
                results = await loop.run_in_executor(None, lambda: search_func(isbn or title, 10))

            if not results:
                logger.warning(f"[UnifiedBookDownloader] No Libgen results for: {isbn or title}")
                return False, f"No Libgen results found for: {isbn or title}"

            logger.info(f"[UnifiedBookDownloader] Found {len(results)} Libgen results")

            # Determine the output filename (use the first result for naming)
            first_result = results[0]
            filename = f"{first_result.get('title', 'book')}"
            filename = "".join(c for c in filename if c.isalnum() or c in (' ', '.', '-'))[:100]

            # Try each result's mirror until one succeeds
            for idx, result in enumerate(results, 1):
                mirror_url = result.get('mirror_url', '')

                if not mirror_url:
                    logger.debug(f"[UnifiedBookDownloader] Result {idx}: No mirror URL")
                    continue

                # Use the extension from this result if available
                extension = result.get('extension', 'pdf') or 'pdf'
                if not extension.startswith('.'):
                    extension = f".{extension}"

                output_path = Path(output_dir) / (filename + extension)

                logger.info(f"[UnifiedBookDownloader] Trying mirror {idx}/{len(results)}: {mirror_url}")

                download_func = self.download_from_mirror
                if download_func is None:
                    return False, "Download function not available"

                download_callable = cast(Callable[[str, str], Tuple[bool, Optional[Path]]], download_func)

                def download_wrapper():
                    return download_callable(mirror_url, str(output_path))

                # Download in a worker thread
                try:
                    success, downloaded_path = await loop.run_in_executor(None, download_wrapper)

                    if success:
                        dest_path = Path(downloaded_path) if downloaded_path else output_path
                        # Validate that the downloaded file is not HTML (a common Libgen issue)
                        if dest_path.exists():
                            try:
                                with open(dest_path, 'rb') as f:
                                    file_start = f.read(1024).decode('utf-8', errors='ignore').lower()
                                if '<!doctype' in file_start or '<html' in file_start:
                                    logger.warning(f"[UnifiedBookDownloader] Mirror {idx} returned HTML instead of file, trying next mirror...")
                                    dest_path.unlink()  # Delete the HTML file
                                    continue
                            except Exception as e:
                                logger.debug(f"[UnifiedBookDownloader] Could not validate file content: {e}")

                        logger.info(f"[UnifiedBookDownloader] Successfully downloaded from mirror {idx} to: {dest_path}")
                        return True, str(dest_path)
                    else:
                        logger.warning(f"[UnifiedBookDownloader] Mirror {idx} download failed, trying next...")
                except Exception as e:
                    logger.warning(f"[UnifiedBookDownloader] Mirror {idx} error: {e}, trying next...")
                    continue

            return False, f"All {len(results)} mirrors failed"

        except Exception as e:
            logger.error(f"[UnifiedBookDownloader] Libgen download error: {e}")
            return False, f"Libgen download failed: {str(e)}"

    async def download_libgen_selection(
        self,
        selected: Dict[str, Any],
        remaining: Optional[List[Dict[str, Any]]] = None,
        output_dir: Optional[str] = None,
    ) -> Tuple[bool, str]:
        """Download a specific Libgen result, with optional fallbacks."""

        if not isinstance(selected, dict):
            return False, "Selected result must be a dictionary"

        ordered_results: List[Dict[str, Any]] = [selected]
        if remaining:
            for item in remaining:
                if isinstance(item, dict) and item is not selected:
                    ordered_results.append(item)

        method: Dict[str, Any] = {
            'type': 'libgen',
            'isbn': selected.get('isbn', '') or '',
            'title': selected.get('title', '') or '',
            'author': selected.get('author', '') or '',
            'results': ordered_results,
        }

        return await self.download_book(method, output_dir)

    def download_libgen_selection_sync(
        self,
        selected: Dict[str, Any],
        remaining: Optional[List[Dict[str, Any]]] = None,
        output_dir: Optional[str] = None,
    ) -> Tuple[bool, str]:
        """Synchronous helper for downloading a Libgen selection."""

        async def _run() -> Tuple[bool, str]:
            return await self.download_libgen_selection(selected, remaining, output_dir)

        loop = asyncio.new_event_loop()
        try:
            asyncio.set_event_loop(loop)
            return loop.run_until_complete(_run())
        finally:
            loop.close()
            asyncio.set_event_loop(None)
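
    # Example (illustrative): downloading a user-picked Libgen row from plain
    # synchronous code, with the rest of the result list as fallbacks:
    #
    #     ok, message = downloader.download_libgen_selection_sync(
    #         selected=results[2],
    #         remaining=results,
    #         output_dir="./books",
    #     )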

    def _download_file(self, url: str, output_path: str) -> bool:
        """Download a file from a URL, streaming it to disk in chunks."""
        try:
            response = requests.get(url, stream=True, timeout=30)
            response.raise_for_status()

            with open(output_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

            return True
        except Exception as e:
            logger.error(f"[UnifiedBookDownloader] File download error: {e}")
            return False

    def _archive_borrow_and_download(self, email: str, password: str, book_id: str, output_dir: str) -> Tuple[bool, str]:
        """Borrow a book from Archive.org and download its pages as a PDF.

        This follows the exact process from archive_client.py:
        1. Log in with credentials
        2. Call loan() to create a 14-day borrow
        3. Get book info (extract page URLs)
        4. Download all pages as images
        5. Merge the images into a searchable PDF

        Returns a tuple of (success: bool, filepath/message: str)
        """
        try:
            import shutil
            import tempfile

            from API.archive_client import login, loan, get_book_infos, download

            logger.info(f"[UnifiedBookDownloader] Logging into Archive.org as {email}")
            session = login(email, password)

            logger.info(f"[UnifiedBookDownloader] Attempting to borrow book: {book_id}")
            # Call loan to create the 14-day borrow
            session = loan(session, book_id, verbose=True)

            # If we get here, borrowing succeeded
            logger.info(f"[UnifiedBookDownloader] Successfully borrowed book: {book_id}")

            # Now get the book info (page URLs and metadata).
            # Try both URL formats: the borrow page first, then the details page.
            logger.info(f"[UnifiedBookDownloader] Extracting book page information...")
            book_urls = [
                f"https://archive.org/borrow/{book_id}",   # Borrow page (for borrowed books)
                f"https://archive.org/details/{book_id}",  # Fallback to the details page
            ]

            title = None
            links = None
            metadata = None
            last_error = None

            for book_url in book_urls:
                try:
                    logger.debug(f"[UnifiedBookDownloader] Trying to get book info from: {book_url}")
                    response = session.get(book_url, timeout=10)

                    # Skip URLs that do not resolve
                    if response.status_code != 200:
                        logger.debug(f"[UnifiedBookDownloader] URL returned {response.status_code}: {book_url}")
                        continue

                    # Try to parse the response
                    title, links, metadata = get_book_infos(session, book_url)
                    logger.info(f"[UnifiedBookDownloader] Successfully got info from: {book_url}")
                    logger.info(f"[UnifiedBookDownloader] Found {len(links)} pages to download")
                    break
                except Exception as e:
                    logger.debug(f"[UnifiedBookDownloader] Failed with {book_url}: {e}")
                    last_error = e
                    continue

            if links is None:
                logger.error(f"[UnifiedBookDownloader] Failed to get book info from all URLs: {last_error}")
                return False, "Could not extract borrowed book pages"

            # Create a temporary directory for the page images
            temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=output_dir)
            logger.info(f"[UnifiedBookDownloader] Downloading {len(links)} pages to temporary directory...")

            try:
                # Download all pages (uses a thread pool)
                images = download(
                    session=session,
                    n_threads=10,
                    directory=temp_dir,
                    links=links,
                    scale=3,  # Default resolution
                    book_id=book_id
                )

                logger.info(f"[UnifiedBookDownloader] Downloaded {len(images)} pages")

                # Try to merge the pages into a PDF
                try:
                    import img2pdf

                    logger.info(f"[UnifiedBookDownloader] Merging pages into PDF...")

                    # Prepare PDF metadata
                    pdfmeta = {}
                    if metadata:
                        if "title" in metadata:
                            pdfmeta["title"] = metadata["title"]
                        if "creator" in metadata:
                            pdfmeta["author"] = metadata["creator"]
                    pdfmeta["keywords"] = [f"https://archive.org/details/{book_id}"]
                    pdfmeta["creationdate"] = None  # Avoid timezone issues

                    # Convert the images to a PDF
                    pdf_content = img2pdf.convert(images, **pdfmeta) if images else None
                    if not pdf_content:
                        logger.error(f"[UnifiedBookDownloader] PDF conversion failed")
                        return False, "Failed to convert pages to PDF"

                    # Save the PDF
                    pdf_filename = f"{title}.pdf" if title else "book.pdf"
                    pdf_path = Path(output_dir) / pdf_filename

                    # Handle duplicate filenames
                    i = 1
                    while pdf_path.exists():
                        pdf_path = Path(output_dir) / f"{title or 'book'}({i}).pdf"
                        i += 1

                    with open(pdf_path, 'wb') as f:
                        f.write(pdf_content)

                    logger.info(f"[UnifiedBookDownloader] Successfully created PDF: {pdf_path}")

                    return True, str(pdf_path)

                except ImportError:
                    logger.warning(f"[UnifiedBookDownloader] img2pdf not available, saving as JPG collection instead")

                    # Create the JPG collection directory
                    if not title:
                        title = f"book_{book_id}"
                    jpg_dir = Path(output_dir) / title
                    i = 1
                    while jpg_dir.exists():
                        jpg_dir = Path(output_dir) / f"{title}({i})"
                        i += 1

                    # Move the temporary directory to its final location
                    shutil.move(temp_dir, str(jpg_dir))
                    temp_dir = None  # Mark as already moved

                    logger.info(f"[UnifiedBookDownloader] Saved as JPG collection: {jpg_dir}")
                    return True, str(jpg_dir)

            finally:
                # Clean up the temporary directory if it still exists
                if temp_dir and Path(temp_dir).exists():
                    shutil.rmtree(temp_dir)

        except SystemExit:
            # The loan() function calls sys.exit on failure - catch it here
            logger.error(f"[UnifiedBookDownloader] Borrow process exited (book may not be borrowable)")
            return False, "Book could not be borrowed (may not be available for borrowing)"
        except Exception as e:
            logger.error(f"[UnifiedBookDownloader] Archive borrow error: {e}")
            return False, f"Borrow failed: {str(e)}"

    def close(self) -> None:
        """Close the session."""
        self.session.close()
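
# Example (illustrative): the full fallback chain in one synchronous script.
# `book` is a hypothetical metadata dict; error handling is elided.
#
#     book = {
#         'title': 'Some Title',
#         'author': 'Some Author',
#         'isbn': '9780000000000',
#         'openlibrary_id': 'OL8513721M',
#     }
#     downloader = UnifiedBookDownloader(output_dir="./books")
#     options = downloader.get_download_options(book)
#     for method in options['methods']:  # already sorted by priority
#         ok, message = asyncio.run(downloader.download_book(method))
#         if ok:
#             print(f"Saved: {message}")
#             break
#     downloader.close()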
94
Provider/youtube.py
Normal file
@@ -0,0 +1,94 @@
from __future__ import annotations

import json
import shutil
import subprocess
import sys
from typing import Any, Dict, List, Optional

from Provider._base import SearchProvider, SearchResult
from SYS.logger import log


class YouTube(SearchProvider):
    """Search provider for YouTube using yt-dlp."""

    def search(
        self,
        query: str,
        limit: int = 10,
        filters: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[SearchResult]:
        ytdlp_path = shutil.which("yt-dlp")
        if not ytdlp_path:
            log("[youtube] yt-dlp not found in PATH", file=sys.stderr)
            return []

        # "ytsearchN:query" asks yt-dlp for the first N search hits.
        search_query = f"ytsearch{limit}:{query}"
        cmd = [ytdlp_path, "--dump-json", "--flat-playlist", "--no-warnings", search_query]

        try:
            process = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                encoding="utf-8",
                errors="replace",
            )

            if process.returncode != 0:
                log(f"[youtube] yt-dlp failed: {process.stderr}", file=sys.stderr)
                return []

            # --dump-json emits one JSON object per line, one per video.
            results: List[SearchResult] = []
            for line in process.stdout.splitlines():
                if not line.strip():
                    continue

                try:
                    video_data = json.loads(line)
                except json.JSONDecodeError:
                    continue

                title = video_data.get("title", "Unknown")
                video_id = video_data.get("id", "")
                url = video_data.get("url") or f"https://youtube.com/watch?v={video_id}"
                uploader = video_data.get("uploader", "Unknown")
                duration = video_data.get("duration", 0)
                view_count = video_data.get("view_count", 0)

                duration_str = f"{int(duration // 60)}:{int(duration % 60):02d}" if duration else ""
                views_str = f"{view_count:,}" if view_count else ""

                results.append(
                    SearchResult(
                        table="youtube",
                        title=title,
                        path=url,
                        detail=f"By: {uploader}",
                        annotations=[duration_str, f"{views_str} views"],
                        media_kind="video",
                        columns=[
                            ("Title", title),
                            ("Uploader", uploader),
                            ("Duration", duration_str),
                            ("Views", views_str),
                        ],
                        full_metadata={
                            "video_id": video_id,
                            "uploader": uploader,
                            "duration": duration,
                            "view_count": view_count,
                        },
                    )
                )

            return results

        except Exception as exc:
            log(f"[youtube] Error: {exc}", file=sys.stderr)
            return []

    def validate(self) -> bool:
        return shutil.which("yt-dlp") is not None
36
Provider/zeroxzero.py
Normal file
@@ -0,0 +1,36 @@
from __future__ import annotations

import os
import sys
from typing import Any

from Provider._base import FileProvider
from SYS.logger import log


class ZeroXZero(FileProvider):
    """File provider for 0x0.st."""

    def upload(self, file_path: str, **kwargs: Any) -> str:
        from API.HTTP import HTTPClient

        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        try:
            # 0x0.st expects a multipart upload under the "file" field and
            # answers with the hosted URL as plain text.
            headers = {"User-Agent": "Medeia-Macina/1.0"}
            with HTTPClient(headers=headers) as client:
                with open(file_path, "rb") as handle:
                    response = client.post("https://0x0.st", files={"file": handle})

            if response.status_code == 200:
                return response.text.strip()

            raise Exception(f"Upload failed: {response.status_code} - {response.text}")

        except Exception as exc:
            log(f"[0x0] Upload error: {exc}", file=sys.stderr)
            raise

    def validate(self) -> bool:
        return True
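
# Example (illustrative): uploading a file and printing the hosted URL.
#
#     url = ZeroXZero().upload("./downloads/song.flac")
#     print(url)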