jkjnkjkllkjjk

This commit is contained in:
nose
2025-11-30 11:39:04 -08:00
parent ed417c8200
commit 7a13af9a1f
15 changed files with 1150 additions and 363 deletions

View File

@@ -1,21 +1,44 @@
"""Shared Library Genesis search and download helpers."""
"""Shared Library Genesis search and download helpers.
Replaces the old libgen backend with a robust scraper based on libgen-api-enhanced logic.
Targets libgen.is/rs/st mirrors and parses the results table directly.
"""
from __future__ import annotations
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional
import logging
import re
import requests
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional
from urllib.parse import quote, urljoin
from libgen import search_sync, LibgenError
# Optional dependencies
try:
from bs4 import BeautifulSoup
except ImportError:
BeautifulSoup = None
LogFn = Optional[Callable[[str], None]]
ErrorFn = Optional[Callable[[str], None]]
DEFAULT_TIMEOUT = 10.0
DEFAULT_TIMEOUT = 20.0
DEFAULT_LIMIT = 50
logging.getLogger(__name__).setLevel(logging.WARNING)
# Mirrors to try in order
MIRRORS = [
"https://libgen.is",
"https://libgen.rs",
"https://libgen.st",
"http://libgen.is",
"http://libgen.rs",
"http://libgen.st",
"https://libgen.li", # Different structure, fallback
"http://libgen.li",
"https://libgen.gl", # Different structure, fallback
"http://libgen.gl",
]
logging.getLogger(__name__).setLevel(logging.INFO)
def _call(logger: LogFn, message: str) -> None:
@@ -23,168 +46,248 @@ def _call(logger: LogFn, message: str) -> None:
logger(message)
def search_libgen_no_ads(query: str, session: Optional[requests.Session] = None) -> List[Dict[str, Any]]:
"""Search Libgen without triggering ads.php requests."""
try:
from bs4 import BeautifulSoup
except ImportError: # pragma: no cover
logging.warning("BeautifulSoup not available; falling back to standard search")
class LibgenSearch:
"""Robust LibGen searcher."""
def __init__(self, session: Optional[requests.Session] = None):
self.session = session or requests.Session()
self.session.headers.update({
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
})
def search(self, query: str, limit: int = DEFAULT_LIMIT) -> List[Dict[str, Any]]:
"""Search LibGen mirrors."""
if not BeautifulSoup:
logging.error("BeautifulSoup not installed. Cannot search LibGen.")
return []
for mirror in MIRRORS:
try:
if "libgen.li" in mirror or "libgen.gl" in mirror:
results = self._search_libgen_li(mirror, query, limit)
else:
results = self._search_libgen_rs(mirror, query, limit)
if results:
return results
except Exception as e:
logging.debug(f"Mirror {mirror} failed: {e}")
continue
return []
mirrors = [
"https://libgen.gl",
"https://libgen.vg",
"https://libgen.la",
"https://libgen.bz",
"https://libgen.gs",
]
def _search_libgen_rs(self, mirror: str, query: str, limit: int) -> List[Dict[str, Any]]:
"""Search libgen.rs/is/st style mirrors."""
# Search URL: /search.php?req=QUERY&res=100&column=def
url = f"{mirror}/search.php"
params = {
"req": query,
"res": 100, # Request more to filter later
"column": "def",
"open": 0,
"view": "simple",
"phrase": 1,
}
resp = self.session.get(url, params=params, timeout=DEFAULT_TIMEOUT)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")
# Find the table with results. usually class 'c'
table = soup.find("table", {"class": "c"})
if not table:
# Try finding by structure (table with many rows)
tables = soup.find_all("table")
for t in tables:
if len(t.find_all("tr")) > 5:
table = t
break
if not table:
return []
session = session or requests.Session()
session.headers.setdefault(
"User-Agent",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
)
for mirror in mirrors:
try:
search_url = f"{mirror}/index.php?req={quote(query)}&res=100&covers=on&filesuns=all"
response = session.get(search_url, timeout=DEFAULT_TIMEOUT)
if response.status_code != 200:
results = []
# Skip header row
rows = table.find_all("tr")[1:]
for row in rows:
cols = row.find_all("td")
if len(cols) < 9:
continue
# Columns:
# 0: ID
# 1: Author(s)
# 2: Title
# 3: Publisher
# 4: Year
# 5: Pages
# 6: Language
# 7: Size
# 8: Extension
# 9+: Mirrors
try:
libgen_id = cols[0].get_text(strip=True)
authors = [a.get_text(strip=True) for a in cols[1].find_all("a")]
if not authors:
authors = [cols[1].get_text(strip=True)]
title_tag = cols[2].find("a")
title = title_tag.get_text(strip=True) if title_tag else cols[2].get_text(strip=True)
# Extract MD5 from title link if possible (often in href)
# href='book/index.php?md5=...'
md5 = ""
if title_tag and title_tag.has_attr("href"):
href = title_tag["href"]
match = re.search(r"md5=([a-fA-F0-9]{32})", href)
if match:
md5 = match.group(1)
publisher = cols[3].get_text(strip=True)
year = cols[4].get_text(strip=True)
pages = cols[5].get_text(strip=True)
language = cols[6].get_text(strip=True)
size = cols[7].get_text(strip=True)
extension = cols[8].get_text(strip=True)
# Mirrors
# Usually col 9 is http://library.lol/main/MD5
mirror_links = []
for i in range(9, len(cols)):
a = cols[i].find("a")
if a and a.has_attr("href"):
mirror_links.append(a["href"])
# Construct direct download page link (library.lol)
# If we have MD5, we can guess it: http://library.lol/main/{md5}
if md5:
download_link = f"http://library.lol/main/{md5}"
elif mirror_links:
download_link = mirror_links[0]
else:
download_link = ""
soup = BeautifulSoup(response.content, "html.parser")
table = soup.find("table", {"class": "catalog"})
if table is None:
for candidate in soup.find_all("table"):
rows = candidate.find_all("tr")
if len(rows) > 2:
table = candidate
break
if table is None:
logging.debug("[libgen_no_ads] No results table on %s", mirror)
results.append({
"id": libgen_id,
"title": title,
"author": ", ".join(authors),
"publisher": publisher,
"year": year,
"pages": pages,
"language": language,
"filesize_str": size,
"extension": extension,
"md5": md5,
"mirror_url": download_link,
"cover": "", # Could extract from hover if needed
})
if len(results) >= limit:
break
except Exception as e:
logging.debug(f"Error parsing row: {e}")
continue
return results
rows = table.find_all("tr")[1:]
results: List[Dict[str, Any]] = []
for row in rows:
try:
cells = row.find_all("td")
if len(cells) < 9:
continue
size_cell = cells[7]
file_link = size_cell.find("a")
mirror_link = ""
if file_link:
href = str(file_link.get("href", ""))
if href.startswith("/"):
mirror_link = mirror + href
elif href:
mirror_link = urljoin(mirror, href)
if not mirror_link:
title_link = cells[1].find("a") if len(cells) > 1 else None
if title_link:
href = str(title_link.get("href", ""))
if href.startswith("/"):
mirror_link = mirror + href
elif href:
mirror_link = urljoin(mirror, href)
if not mirror_link:
continue
results.append(
{
"id": "",
"mirror": mirror_link,
"cover": "",
"title": cells[1].get_text(strip=True) if len(cells) > 1 else "Unknown",
"authors": [cells[2].get_text(strip=True)]
if len(cells) > 2
else ["Unknown"],
"publisher": cells[3].get_text(strip=True) if len(cells) > 3 else "",
"year": cells[4].get_text(strip=True) if len(cells) > 4 else "",
"pages": cells[6].get_text(strip=True) if len(cells) > 6 else "",
"language": cells[5].get_text(strip=True) if len(cells) > 5 else "",
"size": cells[7].get_text(strip=True) if len(cells) > 7 else "",
"extension": cells[8].get_text(strip=True) if len(cells) > 8 else "",
"isbn": "",
}
)
except Exception as exc: # pragma: no cover - defensive
logging.debug("[libgen_no_ads] Error parsing row: %s", exc)
continue
if results:
logging.info("[libgen_no_ads] %d results from %s", len(results), mirror)
return results
except Exception as exc: # pragma: no cover - mirror issues
logging.debug("[libgen_no_ads] Mirror %s failed: %s", mirror, exc)
continue
return []
def format_book_info(book: Any) -> Dict[str, Any]:
    """Format a Libgen search result object into a consistent dictionary.

    Accepts any object exposing (optionally) ``title``, ``isbn``, ``size``,
    ``authors``, ``publisher``, ``year``, ``pages``, ``language``,
    ``extension`` and ``download_links`` attributes; missing attributes
    degrade to empty/"Unknown" values rather than raising.

    Returns:
        Dict with normalized keys, including ``filesize`` in bytes parsed
        from the human-readable ``size`` string (e.g. "12.5 MB").
    """
    # Parse the human-readable size string ("12.5 MB") into bytes.
    filesize_bytes = 0
    size_str = getattr(book, "size", "") or ""
    if size_str:
        parts = size_str.strip().split()
        try:
            value = float(parts[0])
            unit = parts[1].upper() if len(parts) > 1 else "B"
            if unit in {"TB", "T"}:
                filesize_bytes = int(value * 1024 ** 4)
            elif unit in {"GB", "G"}:
                filesize_bytes = int(value * 1024 ** 3)
            elif unit in {"MB", "M"}:
                filesize_bytes = int(value * 1024 ** 2)
            elif unit in {"KB", "K"}:
                filesize_bytes = int(value * 1024)
            else:
                # Unknown unit: treat the numeric value as raw bytes.
                filesize_bytes = int(value)
        except (ValueError, IndexError):  # pragma: no cover - defensive
            filesize_bytes = 0
    title = getattr(book, "title", "") or ""
    isbn = getattr(book, "isbn", "") or ""
    if not isbn and title:
        # Some mirrors append ISBN digits to the title; try to split them off.
        # (Uses the module-level `re` import; no local re-import needed.)
        match = re.search(
            r"((?:[\d]{10,13}(?:\s*[;,]\s*[\d]{10,13})+)|(?:[\d]{10,13})(?:\s*[;,]?\s*[\d\-]{0,50})?)\s*(?:\b|$)",
            title,
        )
        if match:
            potential_isbn = match.group(0).strip()
            if re.search(r"\d{10,13}", potential_isbn):
                isbn = potential_isbn
                # Drop the ISBN tail (and any stray trailing single letter).
                title = re.sub(r"\s+[a-z]\s*$", "", title[: match.start()].strip(), flags=re.IGNORECASE)
    # Authors may arrive as a list/iterable or as a plain string.
    authors_value = getattr(book, "authors", None)
    if isinstance(authors_value, Iterable) and not isinstance(authors_value, str):
        authors_str = ", ".join(str(author) for author in authors_value)
    else:
        authors_str = str(authors_value or "Unknown")
    # `download_links.get_link` is an attribute (not a call) on libgen result
    # objects — presumably a pre-resolved mirror URL; verify against caller.
    download_links = getattr(book, "download_links", None)
    mirror_url = None
    if download_links and getattr(download_links, "get_link", None):
        mirror_url = download_links.get_link
    return {
        "title": title or "Unknown",
        "author": authors_str,
        "publisher": getattr(book, "publisher", "") or "",
        "year": getattr(book, "year", "") or "",
        "pages": getattr(book, "pages", "") or "",
        "language": getattr(book, "language", "") or "",
        "filesize": filesize_bytes,
        "filesize_str": size_str or "Unknown",
        "extension": getattr(book, "extension", "") or "",
        "isbn": isbn,
        "mirror_url": mirror_url,
    }
def _search_libgen_li(self, mirror: str, query: str, limit: int) -> List[Dict[str, Any]]:
    """Query a libgen.li / libgen.gl style mirror and parse its results table.

    These mirrors serve results from ``/index.php`` with a column layout that
    differs from the classic libgen.rs mirrors. Rows that fail to parse are
    skipped silently; at most ``limit`` records are returned.
    """
    # Search URL: /index.php?req=QUERY&res=100&covers=on&filesuns=all
    endpoint = f"{mirror}/index.php"
    query_params = {
        "req": query,
        "res": 100,
        "covers": "on",
        "filesuns": "all",
    }
    response = self.session.get(endpoint, params=query_params, timeout=DEFAULT_TIMEOUT)
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")
    # The results table carries id="tablelibgen"; fall back to the
    # bootstrap-styled table if the id is absent.
    table = page.find("table", {"id": "tablelibgen"})
    if not table:
        table = page.find("table", {"class": "table table-striped"})
    if not table:
        return []
    hits = []
    # Column layout on these mirrors:
    #   0: cover  1: title (link to file.php?id=.. / edition.php?id=..)
    #   2: author 3: publisher 4: year 5: language 6: pages 7: size
    #   8: extension 9+: mirrors
    for tr in table.find_all("tr")[1:]:  # first row is the header
        cells = tr.find_all("td")
        if len(cells) < 9:
            continue
        try:
            anchor = cells[1].find("a")
            title_text = anchor.get_text(strip=True) if anchor else cells[1].get_text(strip=True)
            # The numeric libgen record id lives in the title link's href.
            record_id = ""
            if anchor and anchor.has_attr("href"):
                id_match = re.search(r"id=(\d+)", anchor["href"])
                if id_match:
                    record_id = id_match.group(1)
            # Build an absolute mirror URL from the title link.
            link = ""
            if anchor:
                href = anchor["href"]
                link = mirror + href if href.startswith("/") else urljoin(mirror, href)
            hits.append({
                "id": record_id,
                "title": title_text,
                "author": cells[2].get_text(strip=True),
                "publisher": cells[3].get_text(strip=True),
                "year": cells[4].get_text(strip=True),
                "pages": cells[6].get_text(strip=True),
                "language": cells[5].get_text(strip=True),
                "filesize_str": cells[7].get_text(strip=True),
                "extension": cells[8].get_text(strip=True),
                "md5": "",  # .li doesn't show MD5 easily in table
                "mirror_url": link,
            })
            if len(hits) >= limit:
                break
        except Exception:
            # Malformed row (e.g. link without href): skip it.
            continue
    return hits
def search_libgen(
@@ -195,183 +298,160 @@ def search_libgen(
log_error: ErrorFn = None,
session: Optional[requests.Session] = None,
) -> List[Dict[str, Any]]:
"""Search Libgen returning formatted dictionaries with multiple mirrors.
Uses HTML scraper (search_libgen_no_ads) to find books quickly.
Returns mirror URLs and book IDs that can be used to generate alternative mirrors.
"""
"""Search Libgen using the robust scraper."""
searcher = LibgenSearch(session=session)
try:
_call(log_info, f"[search] Searching Libgen for: {query}")
session = session or requests.Session()
# Use HTML scraper - more reliable and doesn't hang on mirror resolution
_call(log_info, "[search] Using HTML scraper (search_libgen_no_ads)...")
results: List[Any] = search_libgen_no_ads(query, session=session)
if not results:
_call(log_info, "[search] No results from HTML scraper")
return []
formatted: List[Dict[str, Any]] = []
mirrors_list = [
"https://libgen.gl",
"https://libgen.vg",
"https://libgen.la",
"https://libgen.bz",
"https://libgen.gs",
]
for book in results[:limit]:
if isinstance(book, dict):
# Result from search_libgen_no_ads (HTML scraper)
authors = book.get("authors", ["Unknown"])
if isinstance(authors, list):
author_value = ", ".join(str(a) for a in authors)
else:
author_value = str(authors)
# Extract book ID from mirror URL if available
mirror = book.get("mirror", "")
book_id = ""
if mirror and "/file.php?id=" in mirror:
try:
book_id = mirror.split("/file.php?id=")[1].split("&")[0]
except (IndexError, ValueError):
pass
# Build list of alternative mirrors based on book ID
mirrors_dict = {}
if book_id:
for mirror_base in mirrors_list:
mirrors_dict[mirror_base] = f"{mirror_base}/file.php?id={book_id}"
elif mirror:
# Fallback: use the mirror we found
mirrors_dict["primary"] = mirror
formatted.append(
{
"title": book.get("title", "Unknown"),
"author": author_value,
"publisher": book.get("publisher", ""),
"year": book.get("year", ""),
"pages": book.get("pages", ""),
"language": book.get("language", ""),
"filesize": 0,
"filesize_str": book.get("size", "Unknown"),
"extension": book.get("extension", ""),
"isbn": book.get("isbn", ""),
"mirror_url": mirror, # Primary mirror
"mirrors": mirrors_dict, # Alternative mirrors
"book_id": book_id,
}
)
else:
# Fallback: try to format as book object
try:
formatted.append(format_book_info(book))
except Exception:
pass
_call(log_info, f"[search] Found {len(formatted)} result(s)")
return formatted
except LibgenError as exc:
_call(log_error, f"[search] Libgen error: {exc}")
return []
except Exception as exc: # pragma: no cover - defensive
_call(log_error, f"[search] Error: {exc}")
results = searcher.search(query, limit=limit)
_call(log_info, f"[libgen] Found {len(results)} results")
return results
except Exception as e:
_call(log_error, f"[libgen] Search failed: {e}")
return []
def _resolve_download_url(
    session: requests.Session,
    url: str,
    log_info: LogFn = None
) -> Optional[str]:
    """Follow the LibGen page chain from *url* to a direct file URL.

    Walks the series.php -> edition.php -> file.php -> ads.php -> get.php
    style hops (at most 6), returning the first URL that either names a
    book file or serves non-HTML content. Returns None when the chain
    cannot be resolved.
    """
    seen = set()
    target = url
    for _hop in range(6):  # hop cap guards against redirect loops
        if target in seen:
            break
        seen.add(target)
        _call(log_info, f"[resolve] Checking: {target}")
        # Fast path: the URL itself already names a book file.
        if target.lower().endswith(('.pdf', '.epub', '.mobi', '.djvu', '.azw3', '.cbz', '.cbr')):
            return target
        try:
            # GET with stream=True lets us peek at the headers without
            # downloading the body; some mirrors reject HEAD with 405.
            with session.get(target, stream=True, timeout=30) as resp:
                resp.raise_for_status()
                if "text/html" not in resp.headers.get("Content-Type", "").lower():
                    # Non-HTML payload: this is the binary file itself.
                    return target
                html = resp.text
        except Exception as exc:
            _call(log_info, f"[resolve] Failed to fetch {target}: {exc}")
            return None
        page = BeautifulSoup(html, "html.parser")
        # 1. "GET" anchor (library.lol / ads.php style) — either the link
        #    text itself or an <h2>GET</h2> wrapped directly in an <a>.
        gate = page.find("a", string=re.compile(r"^GET$", re.IGNORECASE))
        if not gate:
            heading = page.find("h2", string=re.compile(r"^GET$", re.IGNORECASE))
            if heading and heading.parent.name == "a":
                gate = heading.parent
        if gate and gate.has_attr("href"):
            return urljoin(target, gate["href"])
        # 2. series.php -> first edition.php link
        if "series.php" in target:
            nxt = page.find("a", href=re.compile(r"edition\.php"))
            if nxt:
                target = urljoin(target, nxt["href"])
                continue
        # 3. edition.php -> file.php link
        if "edition.php" in target:
            nxt = page.find("a", href=re.compile(r"file\.php"))
            if nxt:
                target = urljoin(target, nxt["href"])
                continue
        # 4. file.php -> ads.php via the "libgen" badge link
        if "file.php" in target:
            badge = page.find("a", title="libgen")
            if not badge:
                badge = page.find("a", string=re.compile(r"Libgen", re.IGNORECASE))
            if badge and badge.has_attr("href"):
                target = urljoin(target, badge["href"])
                continue
        # 5. ads.php -> direct get.php link (fallback if step 1 missed it)
        if "ads.php" in target:
            direct = page.find("a", href=re.compile(r"get\.php"))
            if direct:
                return urljoin(target, direct["href"])
        # 6. library.lol / generic gateway fallbacks
        for label in ["Cloudflare", "IPFS.io", "Infura"]:
            alt = page.find("a", string=re.compile(label, re.IGNORECASE))
            if alt and alt.has_attr("href"):
                return urljoin(target, alt["href"])
        # Nothing recognizable on this page: stop following.
        break
    return None
def download_from_mirror(
mirror_url: str,
output_path: str | Path,
output_path: Path,
*,
log_info: LogFn = None,
log_error: ErrorFn = None,
session: Optional[requests.Session] = None,
) -> bool:
"""Download a Libgen file and write it to disk.
Handles Libgen redirects and ensures proper file download by:
- Following all redirects (default behavior)
- Setting User-Agent header (required by some mirrors)
- Validating that we're downloading binary content, not HTML
- Attempting alternative download method if HTML is returned
"""
"""Download file from a LibGen mirror URL."""
session = session or requests.Session()
output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
try:
output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
_call(log_info, f"[download] Downloading from mirror: {mirror_url}")
_call(log_info, f"[download] Resolving download link from: {mirror_url}")
# Ensure session has proper headers for Libgen
if 'User-Agent' not in session.headers:
session.headers['User-Agent'] = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
)
download_url = _resolve_download_url(session, mirror_url, log_info)
# Download with redirects enabled (default) and referer
session.headers['Referer'] = 'https://libgen.gs/'
response = session.get(mirror_url, stream=True, timeout=30, allow_redirects=True)
response.raise_for_status()
# Check if we got HTML instead of a file (common Libgen issue)
content_type = response.headers.get('content-type', '').lower()
if 'text/html' in content_type:
_call(log_error, f"[download] Server returned HTML. Trying alternative method...")
if not download_url:
_call(log_error, "[download] Could not find direct download link")
return False
# Try to extract file ID and use alternative CDN
try:
# Parse the HTML to extract MD5 or file ID
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')
# Look for download link in the HTML
# Common patterns: md5 hash in form, or direct link in anchor tags
download_link = None
# Try to find forms that might contain download functionality
forms = soup.find_all('form')
for form in forms:
action = form.get('action', '')
if 'download' in action.lower() or 'get' in action.lower():
download_link = action
break
if not download_link:
_call(log_error, f"[download] Could not extract alternative download link from HTML")
return False
_call(log_info, f"[download] Using alternative download method: {download_link[:100]}")
# Try downloading from alternative link
response2 = session.get(download_link, stream=True, timeout=30, allow_redirects=True)
response2.raise_for_status()
response = response2 # Use the new response
_call(log_info, f"[download] Downloading from: {download_url}")
# Download the actual file
with session.get(download_url, stream=True, timeout=60) as r:
r.raise_for_status()
except Exception as alt_error:
_call(log_error, f"[download] Alternative method failed: {alt_error}")
# Verify it's not HTML (error page)
ct = r.headers.get("content-type", "").lower()
if "text/html" in ct:
_call(log_error, "[download] Final URL returned HTML, not a file.")
return False
total_size = int(response.headers.get("content-length", 0))
downloaded = 0
with open(output_path, "wb") as handle:
for chunk in response.iter_content(chunk_size=8192):
if not chunk:
continue
handle.write(chunk)
downloaded += len(chunk)
if total_size > 0:
percent = downloaded / total_size * 100
_call(
log_info,
f"[download] {percent:.1f}% - {downloaded // (1024*1024)}MB / {total_size // (1024*1024)}MB",
)
_call(log_info, f"[download] Downloaded successfully to: {output_path}")
total_size = int(r.headers.get("content-length", 0))
downloaded = 0
with open(output_path, "wb") as f:
for chunk in r.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
downloaded += len(chunk)
# Optional: progress logging
_call(log_info, f"[download] Saved to {output_path}")
return True
except Exception as exc: # pragma: no cover - defensive
_call(log_error, f"[download] Error: {exc}")
except Exception as e:
_call(log_error, f"[download] Download failed: {e}")
return False