"""Shared Library Genesis search and download helpers. Replaces the old libgen backend with a robust scraper based on libgen-api-enhanced logic. Targets libgen.is/rs/st mirrors and parses the results table directly. """ from __future__ import annotations import logging import re import requests from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple from urllib.parse import quote, urljoin, urlparse, unquote # Optional dependencies try: from bs4 import BeautifulSoup except ImportError: BeautifulSoup = None LogFn = Optional[Callable[[str], None]] ErrorFn = Optional[Callable[[str], None]] DEFAULT_TIMEOUT = 20.0 DEFAULT_LIMIT = 50 # Mirrors to try in order MIRRORS = [ "https://libgen.is", "https://libgen.rs", "https://libgen.st", "http://libgen.is", "http://libgen.rs", "http://libgen.st", "https://libgen.li", # Different structure, fallback "http://libgen.li", "https://libgen.gl", # Different structure, fallback "http://libgen.gl", ] logging.getLogger(__name__).setLevel(logging.INFO) def _call(logger: LogFn, message: str) -> None: if logger: logger(message) class LibgenSearch: """Robust LibGen searcher.""" def __init__(self, session: Optional[requests.Session] = None): self.session = session or requests.Session() self.session.headers.update({ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" }) def search(self, query: str, limit: int = DEFAULT_LIMIT) -> List[Dict[str, Any]]: """Search LibGen mirrors.""" if not BeautifulSoup: logging.error("BeautifulSoup not installed. Cannot search LibGen.") return [] for mirror in MIRRORS: try: if "libgen.li" in mirror or "libgen.gl" in mirror: results = self._search_libgen_li(mirror, query, limit) else: results = self._search_libgen_rs(mirror, query, limit) if results: return results except Exception as e: logging.debug(f"Mirror {mirror} failed: {e}") continue return [] def _search_libgen_rs(self, mirror: str, query: str, limit: int) -> List[Dict[str, Any]]: """Search libgen.rs/is/st style mirrors.""" # Search URL: /search.php?req=QUERY&res=100&column=def url = f"{mirror}/search.php" params = { "req": query, "res": 100, # Request more to filter later "column": "def", "open": 0, "view": "simple", "phrase": 1, } resp = self.session.get(url, params=params, timeout=DEFAULT_TIMEOUT) resp.raise_for_status() soup = BeautifulSoup(resp.text, "html.parser") # Find the table with results. usually class 'c' table = soup.find("table", {"class": "c"}) if not table: # Try finding by structure (table with many rows) tables = soup.find_all("table") for t in tables: if len(t.find_all("tr")) > 5: table = t break if not table: return [] results = [] # Skip header row rows = table.find_all("tr")[1:] for row in rows: cols = row.find_all("td") if len(cols) < 9: continue # Columns: # 0: ID # 1: Author(s) # 2: Title # 3: Publisher # 4: Year # 5: Pages # 6: Language # 7: Size # 8: Extension # 9+: Mirrors try: libgen_id = cols[0].get_text(strip=True) authors = [a.get_text(strip=True) for a in cols[1].find_all("a")] if not authors: authors = [cols[1].get_text(strip=True)] title_tag = cols[2].find("a") title = title_tag.get_text(strip=True) if title_tag else cols[2].get_text(strip=True) # Extract MD5 from title link if possible (often in href) # href='book/index.php?md5=...' md5 = "" if title_tag and title_tag.has_attr("href"): href = title_tag["href"] match = re.search(r"md5=([a-fA-F0-9]{32})", href) if match: md5 = match.group(1) publisher = cols[3].get_text(strip=True) year = cols[4].get_text(strip=True) pages = cols[5].get_text(strip=True) language = cols[6].get_text(strip=True) size = cols[7].get_text(strip=True) extension = cols[8].get_text(strip=True) # Mirrors # Usually col 9 is http://library.lol/main/MD5 mirror_links = [] for i in range(9, len(cols)): a = cols[i].find("a") if a and a.has_attr("href"): mirror_links.append(a["href"]) # Construct direct download page link (library.lol) # If we have MD5, we can guess it: http://library.lol/main/{md5} if md5: download_link = f"http://library.lol/main/{md5}" elif mirror_links: download_link = mirror_links[0] else: download_link = "" results.append({ "id": libgen_id, "title": title, "author": ", ".join(authors), "publisher": publisher, "year": year, "pages": pages, "language": language, "filesize_str": size, "extension": extension, "md5": md5, "mirror_url": download_link, "cover": "", # Could extract from hover if needed }) if len(results) >= limit: break except Exception as e: logging.debug(f"Error parsing row: {e}") continue return results def _search_libgen_li(self, mirror: str, query: str, limit: int) -> List[Dict[str, Any]]: """Search libgen.li/gl style mirrors.""" # Search URL: /index.php?req=QUERY&columns[]=t&columns[]=a... url = f"{mirror}/index.php" params = { "req": query, "res": 100, "covers": "on", "filesuns": "all", } resp = self.session.get(url, params=params, timeout=DEFAULT_TIMEOUT) resp.raise_for_status() soup = BeautifulSoup(resp.text, "html.parser") table = soup.find("table", {"id": "tablelibgen"}) if not table: table = soup.find("table", {"class": "table table-striped"}) if not table: return [] results = [] rows = table.find_all("tr")[1:] for row in rows: cols = row.find_all("td") if len(cols) < 9: continue try: # Structure is different # 0: Cover # 1: Title (with link to file.php?id=...) # 2: Author # 3: Publisher # 4: Year # 5: Language # 6: Pages # 7: Size # 8: Extension # 9: Mirrors title_col = cols[1] title_link = title_col.find("a") title = title_link.get_text(strip=True) if title_link else title_col.get_text(strip=True) # Extract ID from link libgen_id = "" if title_link and title_link.has_attr("href"): href = title_link["href"] # href is usually "file.php?id=..." or "edition.php?id=..." match = re.search(r"id=(\d+)", href) if match: libgen_id = match.group(1) authors = cols[2].get_text(strip=True) publisher = cols[3].get_text(strip=True) year = cols[4].get_text(strip=True) language = cols[5].get_text(strip=True) pages = cols[6].get_text(strip=True) size = cols[7].get_text(strip=True) extension = cols[8].get_text(strip=True) # Mirror link # Usually in col 9 or title link mirror_url = "" if title_link: href = title_link["href"] if href.startswith("/"): mirror_url = mirror + href else: mirror_url = urljoin(mirror, href) results.append({ "id": libgen_id, "title": title, "author": authors, "publisher": publisher, "year": year, "pages": pages, "language": language, "filesize_str": size, "extension": extension, "md5": "", # .li doesn't show MD5 easily in table "mirror_url": mirror_url, }) if len(results) >= limit: break except Exception: continue return results def search_libgen( query: str, limit: int = DEFAULT_LIMIT, *, log_info: LogFn = None, log_error: ErrorFn = None, session: Optional[requests.Session] = None, ) -> List[Dict[str, Any]]: """Search Libgen using the robust scraper.""" searcher = LibgenSearch(session=session) try: results = searcher.search(query, limit=limit) _call(log_info, f"[libgen] Found {len(results)} results") return results except Exception as e: _call(log_error, f"[libgen] Search failed: {e}") return [] def _resolve_download_url( session: requests.Session, url: str, log_info: LogFn = None ) -> Optional[str]: """Resolve the final download URL by following the LibGen chain.""" current_url = url visited = set() # Max hops to prevent infinite loops for _ in range(6): if current_url in visited: break visited.add(current_url) _call(log_info, f"[resolve] Checking: {current_url}") # Simple heuristic: if it looks like a file, return it if current_url.lower().endswith(('.pdf', '.epub', '.mobi', '.djvu', '.azw3', '.cbz', '.cbr')): return current_url try: # Use HEAD first to check content type if possible, but some mirrors block HEAD or return 405 # So we'll just GET with stream=True to peek headers/content without downloading everything with session.get(current_url, stream=True, timeout=30) as resp: resp.raise_for_status() ct = resp.headers.get("Content-Type", "").lower() if "text/html" not in ct: # It's a binary file return current_url # It's HTML, read content content = resp.text except Exception as e: _call(log_info, f"[resolve] Failed to fetch {current_url}: {e}") return None soup = BeautifulSoup(content, "html.parser") # 1. Check for "GET" link (library.lol / ads.php style) # Usually