commit 9873280f0e (parent a03eb0d1be)
Author: nose
Date: 2025-12-16 01:45:01 -08:00
36 changed files with 4911 additions and 1225 deletions

Provider/alldebrid.py (new file, 296 lines)

@@ -0,0 +1,296 @@
from __future__ import annotations
import sys
from typing import Any, Dict, Iterable, List, Optional
from ProviderCore.base import SearchProvider, SearchResult
from SYS.logger import log
def _get_debrid_api_key(config: Dict[str, Any]) -> Optional[str]:
"""Read AllDebrid API key from config.
Preferred formats:
- config.conf provider block:
[provider=alldebrid]
api_key=...
-> config["provider"]["alldebrid"]["api_key"]
- store-style debrid block:
config["store"]["debrid"]["all-debrid"]["api_key"]
Falls back to some legacy keys if present.
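    Example (illustrative shapes only; "KEY" is a placeholder):
        >>> _get_debrid_api_key({"provider": {"alldebrid": {"api_key": "KEY"}}})
        'KEY'
        >>> _get_debrid_api_key({"provider": {"alldebrid": "KEY"}})
        'KEY'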
"""
# 1) provider block: [provider=alldebrid]
provider = config.get("provider")
if isinstance(provider, dict):
entry = provider.get("alldebrid")
if isinstance(entry, dict):
for k in ("api_key", "apikey", "API_KEY", "APIKEY"):
val = entry.get(k)
if isinstance(val, str) and val.strip():
return val.strip()
if isinstance(entry, str) and entry.strip():
return entry.strip()
# 2) store.debrid block (canonical for debrid store configuration)
try:
from config import get_debrid_api_key
key = get_debrid_api_key(config, service="All-debrid")
return key.strip() if key else None
except Exception:
pass
# Legacy fallback (kept permissive so older configs still work)
for legacy_key in ("alldebrid_api_key", "AllDebrid", "all_debrid_api_key"):
val = config.get(legacy_key)
if isinstance(val, str) and val.strip():
return val.strip()
return None
class AllDebrid(SearchProvider):
"""Search provider for AllDebrid account content.
This provider lists and searches the files/magnets already present in the
user's AllDebrid account.
Query behavior:
- "*" / "all" / "list": list recent files from ready magnets
- otherwise: substring match on file name OR magnet name, or exact magnet id
"""
def validate(self) -> bool:
# Consider "available" when configured; actual API connectivity can vary.
return bool(_get_debrid_api_key(self.config or {}))
@staticmethod
def _flatten_files(items: Any) -> Iterable[Dict[str, Any]]:
"""Flatten AllDebrid magnet file tree into file dicts.
API commonly returns:
- file: {n: name, s: size, l: link}
- folder: {n: name, e: [sub_items]}
Some call sites in this repo also expect {name, size, link}, so we accept both.
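        Example (illustrative tree):
            >>> tree = [{"n": "Show", "e": [{"n": "ep1.mkv", "s": 10, "l": "http://x"}]}]
            >>> [f["n"] for f in AllDebrid._flatten_files(tree)]
            ['ep1.mkv']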
"""
if not items:
return
if isinstance(items, dict):
items = [items]
if not isinstance(items, list):
return
for node in items:
if not isinstance(node, dict):
continue
children = node.get('e') or node.get('children')
if isinstance(children, list):
yield from AllDebrid._flatten_files(children)
continue
name = node.get('n') or node.get('name')
link = node.get('l') or node.get('link')
if isinstance(name, str) and name.strip() and isinstance(link, str) and link.strip():
yield node
def search(
self,
query: str,
limit: int = 50,
filters: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> List[SearchResult]:
q = (query or "").strip()
if not q:
return []
api_key = _get_debrid_api_key(self.config or {})
if not api_key:
return []
view = None
if isinstance(filters, dict):
view = str(filters.get("view") or "").strip().lower() or None
view = view or "folders"
try:
from API.alldebrid import AllDebridClient
client = AllDebridClient(api_key)
except Exception as exc:
log(f"[alldebrid] Failed to init client: {exc}", file=sys.stderr)
return []
q_lower = q.lower()
needle = "" if q_lower in {"*", "all", "list"} else q_lower
# Second-stage: list files for a specific magnet id.
if view == "files":
magnet_id_val = None
if isinstance(filters, dict):
magnet_id_val = filters.get("magnet_id")
if magnet_id_val is None:
magnet_id_val = kwargs.get("magnet_id")
try:
magnet_id = int(magnet_id_val)
except Exception:
return []
magnet_status: Dict[str, Any] = {}
try:
magnet_status = client.magnet_status(magnet_id)
except Exception:
magnet_status = {}
magnet_name = str(magnet_status.get('filename') or magnet_status.get('name') or magnet_status.get('hash') or f"magnet-{magnet_id}")
status_code = magnet_status.get('statusCode')
status_text = str(magnet_status.get('status') or "").strip() or "unknown"
ready = status_code == 4 or bool(magnet_status.get('ready'))
if not ready:
return [
SearchResult(
table="alldebrid",
title=magnet_name,
path=f"alldebrid:magnet:{magnet_id}",
detail=status_text,
annotations=["folder", "not-ready"],
media_kind="folder",
tag={"alldebrid", "folder", str(magnet_id), "not-ready"},
columns=[
("Folder", magnet_name),
("ID", str(magnet_id)),
("Status", status_text),
("Ready", "no"),
],
full_metadata={"magnet": magnet_status, "magnet_id": magnet_id},
)
]
try:
files_result = client.magnet_links([magnet_id])
magnet_files = files_result.get(str(magnet_id), {}) if isinstance(files_result, dict) else {}
file_tree = magnet_files.get('files', []) if isinstance(magnet_files, dict) else []
except Exception as exc:
log(f"[alldebrid] Failed to list files for magnet {magnet_id}: {exc}", file=sys.stderr)
file_tree = []
results: List[SearchResult] = []
for file_node in self._flatten_files(file_tree):
file_name = str(file_node.get('n') or file_node.get('name') or '').strip()
file_url = str(file_node.get('l') or file_node.get('link') or '').strip()
file_size = file_node.get('s') or file_node.get('size')
if not file_name or not file_url:
continue
if needle and needle not in file_name.lower():
continue
size_bytes: Optional[int] = None
try:
if isinstance(file_size, (int, float)):
size_bytes = int(file_size)
elif isinstance(file_size, str) and file_size.isdigit():
size_bytes = int(file_size)
except Exception:
size_bytes = None
results.append(
SearchResult(
table="alldebrid",
title=file_name,
path=file_url,
detail=magnet_name,
annotations=["file"],
media_kind="file",
size_bytes=size_bytes,
tag={"alldebrid", "file", str(magnet_id)},
columns=[
("File", file_name),
("Folder", magnet_name),
("ID", str(magnet_id)),
],
full_metadata={"magnet": magnet_status, "magnet_id": magnet_id, "file": file_node},
)
)
if len(results) >= max(1, limit):
break
return results
# Default: folders view (magnets)
try:
magnets = client.magnet_list() or []
except Exception as exc:
log(f"[alldebrid] Failed to list account magnets: {exc}", file=sys.stderr)
return []
wanted_id: Optional[int] = None
if needle.isdigit():
try:
wanted_id = int(needle)
except Exception:
wanted_id = None
results: List[SearchResult] = []
for magnet in magnets:
if not isinstance(magnet, dict):
continue
try:
magnet_id = int(magnet.get('id'))
except Exception:
continue
magnet_name = str(magnet.get('filename') or magnet.get('name') or magnet.get('hash') or f"magnet-{magnet_id}")
magnet_name_lower = magnet_name.lower()
status_text = str(magnet.get('status') or "").strip() or "unknown"
status_code = magnet.get('statusCode')
ready = status_code == 4 or bool(magnet.get('ready'))
if wanted_id is not None:
if magnet_id != wanted_id:
continue
elif needle and (needle not in magnet_name_lower):
continue
size_bytes: Optional[int] = None
try:
size_val = magnet.get('size')
if isinstance(size_val, (int, float)):
size_bytes = int(size_val)
elif isinstance(size_val, str) and size_val.isdigit():
size_bytes = int(size_val)
except Exception:
size_bytes = None
results.append(
SearchResult(
table="alldebrid",
title=magnet_name,
path=f"alldebrid:magnet:{magnet_id}",
detail=status_text,
annotations=["folder"],
media_kind="folder",
size_bytes=size_bytes,
tag={"alldebrid", "folder", str(magnet_id)} | ({"ready"} if ready else {"not-ready"}),
columns=[
("Folder", magnet_name),
("ID", str(magnet_id)),
("Status", status_text),
("Ready", "yes" if ready else "no"),
],
full_metadata={"magnet": magnet, "magnet_id": magnet_id},
)
)
if len(results) >= max(1, limit):
break
return results
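# Usage sketch (illustrative; assumes SearchProvider.__init__ accepts this config dict):
#   provider = AllDebrid({"provider": {"alldebrid": {"api_key": "..."}}})
#   folders = provider.search("*")  # one result per magnet in the account
#   files = provider.search("*", filters={"view": "files", "magnet_id": 42})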


@@ -4,12 +4,15 @@ import logging
import re
import requests
import sys
import time
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple
from urllib.parse import urljoin, urlparse, unquote
from ProviderCore.base import SearchProvider, SearchResult
from ProviderCore.download import sanitize_filename
from SYS.logger import log
from models import ProgressBar
# Optional dependencies
@@ -33,6 +36,7 @@ class Libgen(SearchProvider):
try:
from cli_syntax import get_field, get_free_text, parse_query
from SYS.logger import is_debug_enabled
parsed = parse_query(query)
isbn = get_field(parsed, "isbn")
@@ -42,16 +46,24 @@ class Libgen(SearchProvider):
search_query = isbn or title or author or free_text or query
debug_info = None
try:
if is_debug_enabled():
debug_info = lambda msg: log(msg, file=sys.stderr)
except Exception:
debug_info = None
books = search_libgen(
search_query,
limit=limit,
log_info=debug_info,
log_error=lambda msg: log(msg, file=sys.stderr),
)
results: List[SearchResult] = []
for idx, book in enumerate(books, 1):
title = book.get("title", "Unknown")
author = book.get("author", "Unknown")
title = str(book.get("title") or "").strip() or "Unknown"
author = str(book.get("author") or "").strip() or "Unknown"
year = book.get("year", "Unknown")
pages = book.get("pages") or book.get("pages_str") or ""
extension = book.get("extension", "") or book.get("ext", "")
@@ -104,10 +116,106 @@ class Libgen(SearchProvider):
return []
def validate(self) -> bool:
# JSON-based searching can work without BeautifulSoup; HTML parsing is a fallback.
return True
def download(self, result: SearchResult, output_dir: Path) -> Optional[Path]:
"""Download a LibGen SearchResult into output_dir.
This is used by the download-file cmdlet when a provider item is piped.
"""
try:
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
target = str(getattr(result, "path", "") or "")
md = getattr(result, "full_metadata", None)
if not isinstance(md, dict):
md = {}
title = str(getattr(result, "title", "") or "").strip()
md5 = str(md.get("md5") or "").strip()
extension = str(md.get("extension") or "").strip().lstrip(".")
if (not target) or target.startswith("libgen:"):
if md5 and re.fullmatch(r"[a-fA-F0-9]{32}", md5):
target = urljoin(MIRRORS[0], f"/ads.php?md5={md5}")
if not target:
return None
base_name = sanitize_filename(title or md5 or "libgen")
out_path = output_dir / base_name
if extension:
out_path = out_path.with_suffix(f".{extension}")
if out_path.exists():
stem = out_path.stem
suffix = out_path.suffix
counter = 1
while out_path.exists() and counter < 200:
out_path = out_path.with_name(f"{stem}({counter}){suffix}")
counter += 1
# Show a progress bar on stderr (safe for pipelines).
progress_bar = ProgressBar()
start_time = time.time()
# Allow the first callback to print immediately.
last_progress_time = [0.0]
label = out_path.name
def progress_callback(bytes_downloaded: int, content_length: int) -> None:
# Throttle updates to avoid flooding output.
now = time.time()
if now - last_progress_time[0] < 0.5:
return
total = int(content_length) if content_length and content_length > 0 else None
downloaded = int(bytes_downloaded) if bytes_downloaded and bytes_downloaded > 0 else 0
elapsed = max(0.001, now - start_time)
speed = downloaded / elapsed
eta_seconds = 0.0
if total and total > 0 and speed > 0:
eta_seconds = max(0.0, float(total - downloaded) / float(speed))
minutes, seconds = divmod(int(eta_seconds), 60)
hours, minutes = divmod(minutes, 60)
eta_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}" if total else "?:?:?"
speed_str = progress_bar.format_bytes(speed) + "/s"
percent_str = None
if total and total > 0:
percent = (downloaded / total) * 100.0
percent_str = f"{percent:.1f}%"
line = progress_bar.format_progress(
percent_str=percent_str,
downloaded=downloaded,
total=total,
speed_str=speed_str,
eta_str=eta_str,
)
# Prefix with filename for clarity when downloading multiple items.
if label:
line = f"{label} {line}"
if getattr(sys.stderr, "isatty", lambda: True)():
sys.stderr.write("\r" + line + " ")
sys.stderr.flush()
last_progress_time[0] = now
ok, final_path = download_from_mirror(target, out_path, progress_callback=progress_callback)
# Clear the in-place progress line.
if getattr(sys.stderr, "isatty", lambda: True)():
sys.stderr.write("\r" + (" " * 180) + "\r")
sys.stderr.write("\n")
sys.stderr.flush()
if ok and final_path:
return Path(final_path)
return None
except Exception:
return None
LogFn = Optional[Callable[[str], None]]
@@ -116,18 +224,26 @@ ErrorFn = Optional[Callable[[str], None]]
DEFAULT_TIMEOUT = 20.0
DEFAULT_LIMIT = 50
# Keep LibGen searches responsive even if mirrors are blocked or slow.
# Note: requests' timeout doesn't always cover DNS stalls, but this prevents
# multi-mirror attempts from taking minutes.
DEFAULT_SEARCH_TOTAL_TIMEOUT = 20.0
DEFAULT_CONNECT_TIMEOUT = 4.0
DEFAULT_READ_TIMEOUT = 10.0
# Mirrors to try in order
MIRRORS = [
# Prefer .gl first (often most reachable/stable)
"https://libgen.gl",
"http://libgen.gl",
"https://libgen.li",
"http://libgen.li",
"https://libgen.is",
"https://libgen.rs",
"https://libgen.st",
"http://libgen.is",
"http://libgen.rs",
"http://libgen.st",
"https://libgen.li", # Different structure, fallback
"http://libgen.li",
"https://libgen.gl", # Different structure, fallback
"http://libgen.gl",
]
logging.getLogger(__name__).setLevel(logging.INFO)
@@ -147,28 +263,146 @@ class LibgenSearch:
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
})
def _search_libgen_json(
self,
mirror: str,
query: str,
limit: int,
*,
timeout: Any = DEFAULT_TIMEOUT,
) -> List[Dict[str, Any]]:
"""Search libgen.rs/is/st JSON API when available.
Many LibGen mirrors expose /json.php which is less brittle than scraping.
"""
url = f"{mirror}/json.php"
params = {
"req": query,
"res": max(1, min(100, int(limit) if limit else 50)),
"column": "def",
"phrase": 1,
}
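        # Resulting request (illustrative): GET <mirror>/json.php?req=<query>&res=50&column=def&phrase=1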
resp = self.session.get(url, params=params, timeout=timeout)
resp.raise_for_status()
data = resp.json()
if not isinstance(data, list):
return []
results: List[Dict[str, Any]] = []
for item in data:
if not isinstance(item, dict):
continue
# LibGen JSON responses vary by mirror; accept several common keys.
raw_id = item.get("ID") or item.get("Id") or item.get("id") or ""
title = item.get("Title") or item.get("title") or ""
author = item.get("Author") or item.get("author") or ""
publisher = item.get("Publisher") or item.get("publisher") or ""
year = item.get("Year") or item.get("year") or ""
pages = item.get("Pages") or item.get("pages") or ""
language = item.get("Language") or item.get("language") or ""
size = item.get("Size") or item.get("size") or item.get("filesize") or ""
extension = item.get("Extension") or item.get("extension") or item.get("ext") or ""
md5 = item.get("MD5") or item.get("md5") or ""
download_link = f"http://library.lol/main/{md5}" if md5 else ""
results.append({
"id": str(raw_id),
"title": str(title),
"author": str(author),
"publisher": str(publisher),
"year": str(year),
"pages": str(pages),
"language": str(language),
"filesize_str": str(size),
"extension": str(extension),
"md5": str(md5),
"mirror_url": download_link,
"cover": "",
})
if len(results) >= limit:
break
return results
def search(
self,
query: str,
limit: int = DEFAULT_LIMIT,
*,
total_timeout: float = DEFAULT_SEARCH_TOTAL_TIMEOUT,
log_info: LogFn = None,
log_error: ErrorFn = None,
) -> List[Dict[str, Any]]:
"""Search LibGen mirrors.
Uses a total time budget across mirrors to avoid long hangs.
"""
# Prefer JSON API (no BeautifulSoup needed); HTML scraping is a fallback.
has_bs4 = BeautifulSoup is not None
started = time.monotonic()
for mirror in MIRRORS:
elapsed = time.monotonic() - started
remaining = total_timeout - elapsed
if remaining <= 0:
_call(log_error, f"[libgen] Search timed out after {total_timeout:.0f}s")
break
# Bound each request so we can try multiple mirrors within the budget.
# Keep connect+read within the remaining budget as a best-effort.
connect_timeout = min(DEFAULT_CONNECT_TIMEOUT, max(0.1, remaining))
read_budget = max(0.1, remaining - connect_timeout)
read_timeout = min(DEFAULT_READ_TIMEOUT, read_budget)
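            # Worked example: 20s budget, 12s elapsed -> 8s remaining,
            # connect = min(4.0, 8.0) = 4.0s, read = min(10.0, 8.0 - 4.0) = 4.0s.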
request_timeout: Any = (connect_timeout, read_timeout)
_call(log_info, f"[libgen] Trying mirror: {mirror}")
try:
if "libgen.li" in mirror or "libgen.gl" in mirror:
results = self._search_libgen_li(mirror, query, limit)
else:
results = self._search_libgen_rs(mirror, query, limit)
# Try JSON first on *all* mirrors (including .gl/.li), then fall back to HTML scraping.
results: List[Dict[str, Any]] = []
try:
results = self._search_libgen_json(mirror, query, limit, timeout=request_timeout)
except Exception:
results = []
if not results:
if not has_bs4:
continue
if "libgen.li" in mirror or "libgen.gl" in mirror:
results = self._search_libgen_li(mirror, query, limit, timeout=request_timeout)
else:
results = self._search_libgen_rs(mirror, query, limit, timeout=request_timeout)
if results:
_call(log_info, f"[libgen] Using mirror: {mirror}")
return results
except requests.exceptions.Timeout:
_call(log_info, f"[libgen] Mirror timed out: {mirror}")
continue
except requests.exceptions.RequestException:
_call(log_info, f"[libgen] Mirror request failed: {mirror}")
continue
except Exception as e:
logging.debug(f"Mirror {mirror} failed: {e}")
continue
return []
def _search_libgen_rs(
self,
mirror: str,
query: str,
limit: int,
*,
timeout: Any = DEFAULT_TIMEOUT,
) -> List[Dict[str, Any]]:
"""Search libgen.rs/is/st style mirrors."""
url = f"{mirror}/search.php"
params = {
@@ -180,9 +414,11 @@ class LibgenSearch:
"phrase": 1,
}
resp = self.session.get(url, params=params, timeout=timeout)
resp.raise_for_status()
if BeautifulSoup is None:
return []
soup = BeautifulSoup(resp.text, "html.parser")
table = soup.find("table", {"class": "c"})
@@ -215,7 +451,7 @@ class LibgenSearch:
md5 = ""
if title_tag and title_tag.has_attr("href"):
href = title_tag["href"]
href = str(title_tag.get("href") or "")
match = re.search(r"md5=([a-fA-F0-9]{32})", href)
if match:
md5 = match.group(1)
@@ -264,19 +500,29 @@ class LibgenSearch:
return results
def _search_libgen_li(
self,
mirror: str,
query: str,
limit: int,
*,
timeout: Any = DEFAULT_TIMEOUT,
) -> List[Dict[str, Any]]:
"""Search libgen.li/gl style mirrors."""
url = f"{mirror}/index.php"
params = {
"req": query,
"res": 100,
"covers": "on",
# Keep the request lightweight; covers slow the HTML response.
"res": max(1, min(100, int(limit) if limit else 50)),
"covers": "off",
"filesuns": "all",
}
resp = self.session.get(url, params=params, timeout=timeout)
resp.raise_for_status()
if BeautifulSoup is None:
return []
soup = BeautifulSoup(resp.text, "html.parser")
table = soup.find("table", {"id": "tablelibgen"})
if not table:
@@ -294,46 +540,152 @@ class LibgenSearch:
continue
try:
# Extract md5 (libgen.gl exposes /ads.php?md5=... in mirror column)
md5 = ""
mirror_url = ""
for a in row.find_all("a"):
href = a.get("href")
if not href:
continue
m = re.search(r"md5=([a-fA-F0-9]{32})", str(href))
if m:
md5 = m.group(1)
if "ads.php" in str(href):
mirror_url = urljoin(mirror, str(href))
break
if not mirror_url and md5:
mirror_url = urljoin(mirror, f"/ads.php?md5={md5}")
# Extract numeric file id from /file.php?id=...
libgen_id = ""
file_link = row.find("a", href=re.compile(r"/file\.php\?id=\d+"))
if file_link and file_link.get("href"):
m = re.search(r"id=(\d+)", str(file_link.get("href")))
if m:
libgen_id = m.group(1)
title = ""
authors = ""
publisher = ""
year = ""
language = ""
pages = ""
size = ""
extension = ""
isbn = ""
# libgen.gl columns shift depending on whether covers are enabled.
# With covers on: cover, meta, author, publisher, year, language, pages, size, ext, mirrors (10)
# With covers off: meta, author, publisher, year, language, pages, size, ext, mirrors (9)
offset: Optional[int] = None
if len(cols) >= 10:
offset = 1
elif len(cols) >= 9:
offset = 0
if offset is not None:
meta_cell = cols[offset]
meta_text = " ".join([str(s).strip() for s in meta_cell.stripped_strings if str(s).strip()])
# Extract ISBNs from meta cell (avoid using them as title)
# Matches 10 or 13-digit ISBN with optional leading 978/979.
isbn_candidates = re.findall(r"\b(?:97[89])?\d{9}[\dXx]\b", meta_text)
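                        # e.g. "9781402894626" (ISBN-13) and "140289462X" (ISBN-10) both match.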
if isbn_candidates:
seen: List[str] = []
for s in isbn_candidates:
s = s.upper()
if s not in seen:
seen.append(s)
isbn = "; ".join(seen)
# Choose a "real" title from meta cell.
# libgen.gl meta can include series/edition/isbn blobs; prefer text with letters.
raw_candidates: List[str] = []
for a in meta_cell.find_all("a"):
t = a.get_text(" ", strip=True)
if t:
raw_candidates.append(t)
for s in meta_cell.stripped_strings:
t = str(s).strip()
if t:
raw_candidates.append(t)
deduped: List[str] = []
for t in raw_candidates:
t = t.strip()
if t and t not in deduped:
deduped.append(t)
def _looks_like_isbn_blob(text: str) -> bool:
if re.fullmatch(r"[0-9Xx;\s\-]+", text):
# Numbers-only (common for ISBN lists)
return True
if ";" in text and len(re.findall(r"[A-Za-z]", text)) == 0:
return True
return False
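                    # e.g. "9781402894626; 140289462X" -> True, "Moby Dick" -> False.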
best_title = ""
best_score: Optional[tuple] = None
for cand in deduped:
low = cand.lower().strip()
if low in {"cover", "edition"}:
continue
if _looks_like_isbn_blob(cand):
continue
letters = len(re.findall(r"[A-Za-z]", cand))
if letters < 3:
continue
digits = len(re.findall(r"\d", cand))
digit_ratio = digits / max(1, len(cand))
# Prefer more letters, fewer digits, and longer strings.
score = (letters, -digit_ratio, len(cand))
if best_score is None or score > best_score:
best_score = score
best_title = cand
title = best_title or meta_cell.get_text(" ", strip=True)
authors = cols[offset + 1].get_text(" ", strip=True)
publisher = cols[offset + 2].get_text(" ", strip=True)
year = cols[offset + 3].get_text(" ", strip=True)
language = cols[offset + 4].get_text(" ", strip=True)
pages = cols[offset + 5].get_text(" ", strip=True)
size = cols[offset + 6].get_text(" ", strip=True)
extension = cols[offset + 7].get_text(" ", strip=True)
else:
# Older fallback structure
title_col = cols[1]
title_link = title_col.find("a")
title = title_link.get_text(" ", strip=True) if title_link else title_col.get_text(" ", strip=True)
authors = cols[2].get_text(" ", strip=True)
publisher = cols[3].get_text(" ", strip=True)
year = cols[4].get_text(" ", strip=True)
language = cols[5].get_text(" ", strip=True)
pages = cols[6].get_text(" ", strip=True)
size = cols[7].get_text(" ", strip=True)
extension = cols[8].get_text(" ", strip=True)
title = (title or "").strip() or "Unknown"
authors = (authors or "").strip() or "Unknown"
results.append(
{
"id": libgen_id,
"title": title,
"author": authors,
"isbn": (isbn or "").strip(),
"publisher": (publisher or "").strip(),
"year": (year or "").strip(),
"pages": (pages or "").strip(),
"language": (language or "").strip(),
"filesize_str": (size or "").strip(),
"extension": (extension or "").strip(),
"md5": md5,
"mirror_url": mirror_url,
}
)
if len(results) >= limit:
break
@@ -354,7 +706,13 @@ def search_libgen(
"""Search Libgen using the robust scraper."""
searcher = LibgenSearch(session=session)
try:
results = searcher.search(
query,
limit=limit,
total_timeout=DEFAULT_SEARCH_TOTAL_TIMEOUT,
log_info=log_info,
log_error=log_error,
)
_call(log_info, f"[libgen] Found {len(results)} results")
return results
except Exception as e:
@@ -371,6 +729,17 @@ def _resolve_download_url(
current_url = url
visited = set()
if BeautifulSoup is None:
_call(log_info, "[resolve] BeautifulSoup not available; cannot resolve HTML download chain")
return None
def _find_a_by_text(pattern: str) -> Optional[Any]:
for a in soup.find_all("a"):
t = a.get_text(" ", strip=True)
if t and re.search(pattern, t, re.IGNORECASE):
return a
return None
for _ in range(6):
if current_url in visited:
break
@@ -396,45 +765,40 @@ def _resolve_download_url(
soup = BeautifulSoup(content, "html.parser")
get_link = soup.find("a", string=re.compile(r"^GET$", re.IGNORECASE))
if not get_link:
h2_get = soup.find("h2", string=re.compile(r"^GET$", re.IGNORECASE))
if h2_get and h2_get.parent.name == "a":
get_link = h2_get.parent
get_link = _find_a_by_text(r"^GET$")
if get_link and get_link.has_attr("href"):
return urljoin(current_url, get_link["href"])
return urljoin(current_url, str(get_link.get("href") or ""))
if "series.php" in current_url:
edition_link = soup.find("a", href=re.compile(r"edition\.php"))
if edition_link:
current_url = urljoin(current_url, edition_link["href"])
current_url = urljoin(current_url, str(edition_link.get("href") or ""))
continue
if "edition.php" in current_url:
file_link = soup.find("a", href=re.compile(r"file\.php"))
if file_link:
current_url = urljoin(current_url, file_link["href"])
current_url = urljoin(current_url, str(file_link.get("href") or ""))
continue
if "file.php" in current_url:
libgen_link = soup.find("a", title="libgen")
if not libgen_link:
libgen_link = soup.find("a", string=re.compile(r"Libgen", re.IGNORECASE))
libgen_link = _find_a_by_text(r"Libgen")
if libgen_link and libgen_link.has_attr("href"):
current_url = urljoin(current_url, libgen_link["href"])
current_url = urljoin(current_url, str(libgen_link.get("href") or ""))
continue
if "ads.php" in current_url:
get_php_link = soup.find("a", href=re.compile(r"get\.php"))
if get_php_link:
return urljoin(current_url, get_php_link["href"])
return urljoin(current_url, str(get_php_link.get("href") or ""))
for text in ["Cloudflare", "IPFS.io", "Infura"]:
link = soup.find("a", string=re.compile(text, re.IGNORECASE))
link = _find_a_by_text(re.escape(text))
if link and link.has_attr("href"):
return urljoin(current_url, link["href"])
return urljoin(current_url, str(link.get("href") or ""))
break


@@ -1,8 +1,11 @@
from __future__ import annotations
import mimetypes
import time
import uuid
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import quote
import requests
@@ -57,17 +60,18 @@ class Matrix(FileProvider):
matrix_conf = self.config.get("provider", {}).get("matrix", {}) if isinstance(self.config, dict) else {}
homeserver = matrix_conf.get("homeserver")
room_id = matrix_conf.get("room_id")
access_token = matrix_conf.get("access_token")
password = matrix_conf.get("password")
# Not configured: keep instance but mark invalid via validate().
# Note: `room_id` is intentionally NOT required, since the CLI can prompt
# the user to select a room dynamically.
if not (homeserver and (access_token or password)):
self._init_ok = None
self._init_reason = None
return
cache_key = f"{_normalize_homeserver(str(homeserver))}|room:{room_id}|has_token:{bool(access_token)}"
cache_key = f"{_normalize_homeserver(str(homeserver))}|has_token:{bool(access_token)}"
cached = _MATRIX_INIT_CHECK_CACHE.get(cache_key)
if cached is None:
ok, reason = _matrix_health_check(homeserver=str(homeserver), access_token=str(access_token) if access_token else None)
@@ -88,34 +92,69 @@ class Matrix(FileProvider):
matrix_conf = self.config.get("provider", {}).get("matrix", {})
return bool(
matrix_conf.get("homeserver")
and matrix_conf.get("room_id")
and (matrix_conf.get("access_token") or matrix_conf.get("password"))
)
def _get_homeserver_and_token(self) -> Tuple[str, str]:
matrix_conf = self.config.get("provider", {}).get("matrix", {})
homeserver = matrix_conf.get("homeserver")
access_token = matrix_conf.get("access_token")
room_id = matrix_conf.get("room_id")
if not homeserver:
raise Exception("Matrix homeserver missing")
if not access_token:
raise Exception("Matrix access_token missing")
base = _normalize_homeserver(str(homeserver))
if not base:
raise Exception("Matrix homeserver missing")
return base, str(access_token)
def list_rooms(self) -> List[Dict[str, Any]]:
"""Return the rooms the current user has joined.
Uses `GET /_matrix/client/v3/joined_rooms`.
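        Response shape: {"joined_rooms": ["!abc:example.org", ...]}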
"""
base, token = self._get_homeserver_and_token()
headers = {"Authorization": f"Bearer {token}"}
resp = requests.get(f"{base}/_matrix/client/v3/joined_rooms", headers=headers, timeout=10)
if resp.status_code != 200:
raise Exception(f"Matrix joined_rooms failed: {resp.text}")
data = resp.json() or {}
rooms = data.get("joined_rooms") or []
out: List[Dict[str, Any]] = []
for rid in rooms:
if not isinstance(rid, str) or not rid.strip():
continue
room_id = rid.strip()
name = ""
# Best-effort room name lookup (safe to fail).
try:
encoded = quote(room_id, safe="")
name_resp = requests.get(
f"{base}/_matrix/client/v3/rooms/{encoded}/state/m.room.name",
headers=headers,
timeout=5,
)
if name_resp.status_code == 200:
payload = name_resp.json() or {}
maybe = payload.get("name")
if isinstance(maybe, str):
name = maybe
except Exception:
pass
out.append({"room_id": room_id, "name": name})
return out
def upload_to_room(self, file_path: str, room_id: str) -> str:
"""Upload a file and send it to a specific room."""
path = Path(file_path)
if not path.exists():
raise FileNotFoundError(f"File not found: {file_path}")
if not room_id:
raise Exception("Matrix room_id missing")
if not homeserver.startswith("http"):
homeserver = f"https://{homeserver}"
# Upload media
upload_url = f"{homeserver}/_matrix/media/v3/upload"
base, token = self._get_homeserver_and_token()
headers = {
"Authorization": f"Bearer {access_token}",
"Authorization": f"Bearer {token}",
"Content-Type": "application/octet-stream",
}
@@ -125,27 +164,22 @@ class Matrix(FileProvider):
filename = path.name
# Upload media
upload_url = f"{base}/_matrix/media/v3/upload"
with open(path, "rb") as handle:
resp = requests.post(upload_url, headers=headers, data=handle, params={"filename": filename})
if resp.status_code != 200:
raise Exception(f"Matrix upload failed: {resp.text}")
content_uri = resp.json().get("content_uri")
content_uri = (resp.json() or {}).get("content_uri")
if not content_uri:
raise Exception("No content_uri returned")
# Determine message type
msgtype = "m.file"
ext = path.suffix.lower()
audio_exts = {".mp3", ".flac", ".wav", ".m4a", ".aac", ".ogg", ".opus", ".wma", ".mka", ".alac"}
video_exts = {".mp4", ".mkv", ".webm", ".mov", ".avi", ".flv", ".mpg", ".mpeg", ".ts", ".m4v", ".wmv"}
image_exts = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff"}
if ext in audio_exts:
msgtype = "m.audio"
elif ext in video_exts:
@@ -156,9 +190,21 @@ class Matrix(FileProvider):
info = {"mimetype": mime_type, "size": path.stat().st_size}
payload = {"msgtype": msgtype, "body": filename, "url": content_uri, "info": info}
        # The Matrix client API send endpoint is a PUT and requires a transaction ID.
txn_id = f"mm_{int(time.time())}_{uuid.uuid4().hex[:8]}"
encoded_room = quote(str(room_id), safe="")
send_url = f"{base}/_matrix/client/v3/rooms/{encoded_room}/send/m.room.message/{txn_id}"
send_headers = {"Authorization": f"Bearer {token}"}
send_resp = requests.put(send_url, headers=send_headers, json=payload)
if send_resp.status_code != 200:
raise Exception(f"Matrix send message failed: {send_resp.text}")
event_id = resp.json().get("event_id")
return f"https://matrix.to/#/{room_id}/{event_id}"
event_id = (send_resp.json() or {}).get("event_id")
return f"https://matrix.to/#/{room_id}/{event_id}" if event_id else f"https://matrix.to/#/{room_id}"
def upload(self, file_path: str, **kwargs: Any) -> str:
matrix_conf = self.config.get("provider", {}).get("matrix", {})
room_id = matrix_conf.get("room_id")
if not room_id:
raise Exception("Matrix room_id missing")
return self.upload_to_room(file_path, str(room_id))


@@ -182,6 +182,20 @@ class Soulseek(SearchProvider):
DOWNLOAD_DIR = "./downloads"
MAX_WAIT_TRANSFER = 1200
def __init__(self, config: Optional[Dict[str, Any]] = None):
super().__init__(config)
try:
from config import get_soulseek_username, get_soulseek_password
user = get_soulseek_username(self.config)
pwd = get_soulseek_password(self.config)
if user:
Soulseek.USERNAME = user
if pwd:
Soulseek.PASSWORD = pwd
except Exception:
pass
def download(self, result: SearchResult, output_dir: Path) -> Optional[Path]:
"""Download file from Soulseek."""
@@ -433,8 +447,16 @@ class Soulseek(SearchProvider):
def validate(self) -> bool:
try:
from aioslsk.client import SoulSeekClient # noqa: F401
# Require configured credentials.
try:
from config import get_soulseek_username, get_soulseek_password
user = get_soulseek_username(self.config)
pwd = get_soulseek_password(self.config)
return bool(user and pwd)
except Exception:
# Fall back to legacy class defaults if config helpers aren't available.
return bool(Soulseek.USERNAME and Soulseek.PASSWORD)
except ImportError:
return False
@@ -444,6 +466,9 @@ async def download_soulseek_file(
filename: str,
output_dir: Path = Path("./downloads"),
timeout: int = 1200,
*,
client_username: Optional[str] = None,
client_password: Optional[str] = None,
) -> Optional[Path]:
"""Download a file from a Soulseek peer."""
@@ -471,14 +496,19 @@ async def download_soulseek_file(
output_path = output_path.resolve()
login_user = (client_username or Soulseek.USERNAME or "").strip()
login_pass = (client_password or Soulseek.PASSWORD or "").strip()
if not login_user or not login_pass:
raise RuntimeError("Soulseek credentials not configured (set provider=soulseek username/password)")
settings = Settings(credentials=CredentialsSettings(username=login_user, password=login_pass))
client = SoulSeekClient(settings)
with _suppress_aioslsk_noise():
try:
await client.start()
await client.login()
debug(f"[soulseek] Logged in as {Soulseek.USERNAME}")
debug(f"[soulseek] Logged in as {login_user}")
debug(f"[soulseek] Requesting download from {username}: {filename}")