AST
This commit is contained in:
377
helper/libgen_service.py
Normal file
377
helper/libgen_service.py
Normal file
@@ -0,0 +1,377 @@
|
||||
"""Shared Library Genesis search and download helpers."""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, Iterable, List, Optional
|
||||
import logging
|
||||
import requests
|
||||
from urllib.parse import quote, urljoin
|
||||
|
||||
from libgen import search_sync, LibgenError
|
||||
|
||||
# A status callback receives one already-formatted message string; callers may
# pass ``None`` to disable reporting.
_MessageCallback = Callable[[str], None]
LogFn = Optional[_MessageCallback]
ErrorFn = Optional[_MessageCallback]

DEFAULT_TIMEOUT = 10.0  # handed to ``session.get(..., timeout=...)`` by the search scraper
DEFAULT_LIMIT = 50  # default value of ``limit`` in ``search_libgen``

# Raise the threshold of this module's own logger to WARNING at import time.
_module_logger = logging.getLogger(__name__)
_module_logger.setLevel(logging.WARNING)
|
||||
|
||||
|
||||
def _call(logger: LogFn, message: str) -> None:
|
||||
if logger:
|
||||
logger(message)
|
||||
|
||||
|
||||
def _libgen_row_link(cell: Any, base_url: str) -> str:
    """Return the first anchor inside ``cell`` as an absolute URL, or "" if none."""
    anchor = cell.find("a")
    if anchor is None:
        return ""
    href = str(anchor.get("href") or "").strip()
    if not href:
        return ""
    # urljoin copes with root-relative ("/x"), page-relative ("x"),
    # protocol-relative ("//host/x") and already-absolute hrefs alike.
    return urljoin(f"{base_url}/", href)


def _libgen_row_to_result(cells: List[Any], base_url: str) -> Optional[Dict[str, Any]]:
    """Turn the ``<td>`` cells of one result row into a result dict.

    The caller guarantees at least nine cells. Columns are read positionally:
    1 title, 2 authors, 3 publisher, 4 year, 5 language, 6 pages,
    7 size (also carries the file link), 8 extension.

    Returns ``None`` when neither the size cell nor the title cell has a link.
    """
    # Prefer the link in the size column; fall back to the title link.
    link = _libgen_row_link(cells[7], base_url) or _libgen_row_link(cells[1], base_url)
    if not link:
        return None

    text = [cell.get_text(strip=True) for cell in cells[:9]]
    return {
        "id": "",
        "mirror": link,
        "cover": "",
        "title": text[1],
        "authors": [text[2]],
        "publisher": text[3],
        "year": text[4],
        "pages": text[6],
        "language": text[5],
        "size": text[7],
        "extension": text[8],
        "isbn": "",
    }


def search_libgen_no_ads(query: str, session: Optional[requests.Session] = None) -> List[Dict[str, Any]]:
    """Search Libgen without triggering ads.php requests.

    Each mirror's ``index.php`` search page is fetched in turn and its result
    table is scraped; the rows of the first mirror that yields any usable row
    are returned.

    Args:
        query: Free-text search string (URL-quoted before being sent).
        session: Optional ``requests`` session to reuse. When omitted, a
            temporary session is created and closed before returning. A
            caller-supplied session is never closed here.

    Returns:
        A list of dicts with the keys ``id``, ``mirror``, ``cover``, ``title``,
        ``authors``, ``publisher``, ``year``, ``pages``, ``language``, ``size``,
        ``extension`` and ``isbn``. The list is empty when BeautifulSoup is not
        installed or no mirror produced results. Mirror and parsing failures
        are logged at debug level and never raised.
    """
    try:
        from bs4 import BeautifulSoup
    except ImportError:  # pragma: no cover
        logging.warning("BeautifulSoup not available; falling back to standard search")
        return []

    mirrors = [
        "https://libgen.gl",
        "https://libgen.vg",
        "https://libgen.la",
        "https://libgen.bz",
        "https://libgen.gs",
    ]

    owns_session = session is None
    if owns_session:
        session = requests.Session()

    try:
        # A requests.Session already carries a "python-requests/x" User-Agent,
        # so a plain setdefault() would never install the browser one.
        # Only replace a missing or default agent; keep a caller's custom value.
        current_agent = str(session.headers.get("User-Agent") or "")
        if not current_agent or current_agent.lower().startswith("python-requests"):
            session.headers["User-Agent"] = (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            )

        for mirror in mirrors:
            try:
                search_url = f"{mirror}/index.php?req={quote(query)}&res=100&covers=on&filesuns=all"
                response = session.get(search_url, timeout=DEFAULT_TIMEOUT)
                if response.status_code != 200:
                    continue

                soup = BeautifulSoup(response.content, "html.parser")
                table = soup.find("table", {"class": "catalog"})
                if table is None:
                    # No "catalog" table: take the first table with more than two rows.
                    table = next(
                        (
                            candidate
                            for candidate in soup.find_all("table")
                            if len(candidate.find_all("tr")) > 2
                        ),
                        None,
                    )
                if table is None:
                    logging.debug("[libgen_no_ads] No results table on %s", mirror)
                    continue

                results: List[Dict[str, Any]] = []
                # The first <tr> is treated as the header row and skipped.
                for row in table.find_all("tr")[1:]:
                    try:
                        cells = row.find_all("td")
                        if len(cells) < 9:
                            continue
                        entry = _libgen_row_to_result(cells, mirror)
                        if entry is not None:
                            results.append(entry)
                    except Exception as exc:  # pragma: no cover - defensive
                        logging.debug("[libgen_no_ads] Error parsing row: %s", exc)
                        continue

                if results:
                    logging.info("[libgen_no_ads] %d results from %s", len(results), mirror)
                    return results
            except Exception as exc:  # pragma: no cover - mirror issues
                # Best effort: any failure on one mirror just moves on to the next.
                logging.debug("[libgen_no_ads] Mirror %s failed: %s", mirror, exc)
                continue

        return []
    finally:
        if owns_session:
            session.close()
|
||||
|
||||
|
||||
def _libgen_size_to_bytes(size_text: str) -> int:
    """Convert a human-readable size (``"1.5 MB"``, ``"700KB"``, ``"123"``) to bytes.

    Units are case-insensitive and use 1024-based multipliers. An unknown or
    missing unit is treated as bytes. Unparseable input yields 0.
    """
    import re

    tokens = size_text.split()
    if not tokens:
        return 0

    number = tokens[0]
    unit = tokens[1] if len(tokens) > 1 else ""
    if not unit:
        # Handle the fused form without a space, e.g. "1.5MB".
        fused = re.fullmatch(r"([0-9]*\.?[0-9]+)([A-Za-z]+)", number)
        if fused:
            number, unit = fused.groups()

    multipliers = {
        "K": 1024,
        "KB": 1024,
        "M": 1024 ** 2,
        "MB": 1024 ** 2,
        "G": 1024 ** 3,
        "GB": 1024 ** 3,
        "T": 1024 ** 4,
        "TB": 1024 ** 4,
    }
    try:
        return int(float(number) * multipliers.get(unit.upper(), 1))
    except (ValueError, OverflowError):
        # ValueError: not a number (or NaN); OverflowError: "inf".
        return 0


def format_book_info(book: Any) -> Dict[str, Any]:
    """Format Libgen search result into a consistent dictionary.

    Attributes are read from ``book`` with ``getattr``, so any object works;
    missing or empty attributes fall back to "" / "Unknown".

    Args:
        book: Result object exposing (optionally) ``title``, ``authors``,
            ``publisher``, ``year``, ``pages``, ``language``, ``size``,
            ``extension``, ``isbn`` and ``download_links.get_link``.

    Returns:
        Dict with the keys ``title``, ``author``, ``publisher``, ``year``,
        ``pages``, ``language``, ``filesize`` (bytes, 0 when unknown),
        ``filesize_str``, ``extension``, ``isbn`` and ``mirror_url``.
    """
    import re

    raw_size = getattr(book, "size", "") or ""
    # Sizes are normally strings; stringify anything else instead of crashing.
    size_str = raw_size if isinstance(raw_size, str) else str(raw_size)
    filesize_bytes = _libgen_size_to_bytes(size_str)

    title = getattr(book, "title", "") or ""
    isbn = getattr(book, "isbn", "") or ""
    if not isbn and title:
        # Some titles carry trailing ISBN digits; move them into ``isbn``.
        match = re.search(
            r"((?:[\d]{10,13}(?:\s*[;,]\s*[\d]{10,13})+)|(?:[\d]{10,13})(?:\s*[;,]?\s*[\d\-]{0,50})?)\s*(?:\b|$)",
            title,
        )
        if match:
            # Every match contains a 10-13 digit run, so no second check is needed.
            isbn = match.group(0).strip()
            # Keep the text before the ISBN and drop a dangling single letter.
            title = re.sub(r"\s+[a-z]\s*$", "", title[: match.start()].strip(), flags=re.IGNORECASE)

    authors_value = getattr(book, "authors", None)
    if isinstance(authors_value, Iterable) and not isinstance(authors_value, str):
        authors_str = ", ".join(str(author) for author in authors_value if author)
    else:
        authors_str = str(authors_value or "")
    # An empty author collection is reported the same way as a missing one.
    authors_str = authors_str or "Unknown"

    download_links = getattr(book, "download_links", None)
    mirror_url = None
    if download_links and getattr(download_links, "get_link", None):
        mirror_url = download_links.get_link

    return {
        "title": title or "Unknown",
        "author": authors_str,
        "publisher": getattr(book, "publisher", "") or "",
        "year": getattr(book, "year", "") or "",
        "pages": getattr(book, "pages", "") or "",
        "language": getattr(book, "language", "") or "",
        "filesize": filesize_bytes,
        "filesize_str": size_str or "Unknown",
        "extension": getattr(book, "extension", "") or "",
        "isbn": isbn,
        "mirror_url": mirror_url,
    }
|
||||
|
||||
|
||||
def _libgen_format_scraped(book: Dict[str, Any], mirror_bases: List[str]) -> Dict[str, Any]:
    """Convert one dict produced by the HTML scraper into the public result shape."""
    authors = book.get("authors", ["Unknown"])
    if isinstance(authors, list):
        author_value = ", ".join(str(a) for a in authors)
    else:
        author_value = str(authors)

    # Pull the numeric/file id out of ".../file.php?id=<id>&..." when present.
    mirror = book.get("mirror") or ""
    book_id = ""
    marker = "/file.php?id="
    if marker in mirror:
        book_id = mirror.split(marker, 1)[1].split("&", 1)[0].split("#", 1)[0]

    if book_id:
        # The same id is assumed to be valid on every known mirror host.
        mirrors_dict = {base: f"{base}/file.php?id={book_id}" for base in mirror_bases}
    elif mirror:
        # No id available: expose only the link that was actually scraped.
        mirrors_dict = {"primary": mirror}
    else:
        mirrors_dict = {}

    return {
        "title": book.get("title") or "Unknown",
        "author": author_value or "Unknown",
        "publisher": book.get("publisher", ""),
        "year": book.get("year", ""),
        "pages": book.get("pages", ""),
        "language": book.get("language", ""),
        "filesize": 0,
        "filesize_str": book.get("size") or "Unknown",
        "extension": book.get("extension", ""),
        "isbn": book.get("isbn", ""),
        "mirror_url": mirror,  # Primary mirror
        "mirrors": mirrors_dict,  # Alternative mirrors
        "book_id": book_id,
    }


def search_libgen(
    query: str,
    limit: int = DEFAULT_LIMIT,
    *,
    log_info: LogFn = None,
    log_error: ErrorFn = None,
    session: Optional[requests.Session] = None,
) -> List[Dict[str, Any]]:
    """Search Libgen returning formatted dictionaries with multiple mirrors.

    Uses HTML scraper (search_libgen_no_ads) to find books quickly.
    Returns mirror URLs and book IDs that can be used to generate alternative mirrors.

    Args:
        query: Free-text search string.
        limit: Maximum number of scraper results to format.
        log_info: Optional callback receiving progress messages.
        log_error: Optional callback receiving error messages.
        session: Optional ``requests`` session to reuse. A session created here
            is closed before returning; a caller-supplied one is left open.

    Returns:
        A list of result dicts (empty on any failure). Errors are reported
        through ``log_error`` and are not raised.
    """
    owns_session = session is None
    try:
        _call(log_info, f"[search] Searching Libgen for: {query}")
        if owns_session:
            session = requests.Session()

        # Use HTML scraper - more reliable and doesn't hang on mirror resolution
        _call(log_info, "[search] Using HTML scraper (search_libgen_no_ads)...")
        results: List[Any] = search_libgen_no_ads(query, session=session)

        if not results:
            _call(log_info, "[search] No results from HTML scraper")
            return []

        mirrors_list = [
            "https://libgen.gl",
            "https://libgen.vg",
            "https://libgen.la",
            "https://libgen.bz",
            "https://libgen.gs",
        ]

        formatted: List[Dict[str, Any]] = []
        for book in results[:limit]:
            if isinstance(book, dict):
                # Result from search_libgen_no_ads (HTML scraper)
                formatted.append(_libgen_format_scraped(book, mirrors_list))
                continue
            # Fallback: try to format as book object
            try:
                formatted.append(format_book_info(book))
            except Exception as exc:
                # Skip the bad entry but report it instead of hiding the failure.
                _call(log_error, f"[search] Skipping unformattable result: {exc}")

        _call(log_info, f"[search] Found {len(formatted)} result(s)")
        return formatted
    except LibgenError as exc:
        _call(log_error, f"[search] Libgen error: {exc}")
        return []
    except Exception as exc:  # pragma: no cover - defensive
        _call(log_error, f"[search] Error: {exc}")
        return []
    finally:
        # Only release a session this call created.
        if owns_session and session is not None:
            session.close()
|
||||
|
||||
|
||||
def _libgen_response_is_html(response: Any) -> bool:
    """Return True when the response's Content-Type header announces an HTML page."""
    return "text/html" in str(response.headers.get("content-type", "")).lower()


def _libgen_find_download_link(soup: Any, page_url: str) -> Optional[str]:
    """Locate a download URL inside a parsed HTML landing page.

    Forms whose ``action`` mentions "download" or "get" are tried first. The
    fallback is the first anchor whose ``href`` contains ``get.php``. The hit
    is resolved against ``page_url`` so relative targets become fetchable.
    """
    for form in soup.find_all("form"):
        action = str(form.get("action") or "")
        lowered = action.lower()
        if "download" in lowered or "get" in lowered:
            return urljoin(page_url, action)
    for anchor in soup.find_all("a", href=True):
        href = str(anchor["href"])
        if "get.php" in href.lower():
            return urljoin(page_url, href)
    return None


def download_from_mirror(
    mirror_url: str,
    output_path: str | Path,
    *,
    log_info: LogFn = None,
    log_error: ErrorFn = None,
    session: Optional[requests.Session] = None,
) -> bool:
    """Download a Libgen file and write it to disk.

    Handles Libgen redirects and ensures proper file download by:
    - Following all redirects (default behavior)
    - Sending a browser User-Agent and a Referer header with each request
    - Validating that we're downloading binary content, not HTML
    - Attempting alternative download method if HTML is returned

    Args:
        mirror_url: URL to fetch.
        output_path: Destination file; missing parent directories are created.
        log_info: Optional callback receiving progress messages.
        log_error: Optional callback receiving error messages.
        session: Optional ``requests`` session to reuse. Its headers are not
            modified. A session created here is closed before returning.

    Returns:
        ``True`` when the file was written completely, ``False`` otherwise.
        Failures are reported through ``log_error`` and never raised. A
        partially written file is removed.
    """
    owns_session = session is None
    if owns_session:
        session = requests.Session()

    response = None
    wrote_file = False
    try:
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        _call(log_info, f"[download] Downloading from mirror: {mirror_url}")

        # Headers are passed per request so a caller's session is not mutated.
        request_headers = {"Referer": "https://libgen.gs/"}
        # requests sessions always carry a default "python-requests/x" agent,
        # so a "header missing" check never fires. Replace a missing or
        # default agent; keep a caller's custom one.
        current_agent = str(session.headers.get("User-Agent") or "")
        if not current_agent or current_agent.lower().startswith("python-requests"):
            request_headers["User-Agent"] = (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            )

        response = session.get(
            mirror_url,
            stream=True,
            timeout=30,
            allow_redirects=True,
            headers=request_headers,
        )
        response.raise_for_status()

        # Check if we got HTML instead of a file (common Libgen issue)
        if _libgen_response_is_html(response):
            _call(log_error, "[download] Server returned HTML. Trying alternative method...")
            try:
                from bs4 import BeautifulSoup

                soup = BeautifulSoup(response.text, "html.parser")
                page_url = getattr(response, "url", "") or mirror_url
                download_link = _libgen_find_download_link(soup, page_url)
                if not download_link:
                    _call(log_error, "[download] Could not extract alternative download link from HTML")
                    return False

                _call(log_info, f"[download] Using alternative download method: {download_link[:100]}")
                retry = session.get(
                    download_link,
                    stream=True,
                    timeout=30,
                    allow_redirects=True,
                    headers=request_headers,
                )
                response.close()  # release the landing-page connection
                response = retry
                response.raise_for_status()
            except Exception as alt_error:
                _call(log_error, f"[download] Alternative method failed: {alt_error}")
                return False

            # Never save a second HTML page as if it were the requested file.
            if _libgen_response_is_html(response):
                _call(log_error, "[download] Alternative link returned HTML instead of a file")
                return False

        try:
            total_size = int(response.headers.get("content-length", 0))
        except (TypeError, ValueError):
            total_size = 0  # malformed header: download without progress output
        downloaded = 0

        wrote_file = True
        with open(output_path, "wb") as handle:
            for chunk in response.iter_content(chunk_size=8192):
                if not chunk:
                    continue
                handle.write(chunk)
                downloaded += len(chunk)
                if total_size > 0:
                    percent = downloaded / total_size * 100
                    _call(
                        log_info,
                        f"[download] {percent:.1f}% - {downloaded // (1024*1024)}MB / {total_size // (1024*1024)}MB",
                    )

        _call(log_info, f"[download] Downloaded successfully to: {output_path}")
        return True
    except Exception as exc:  # pragma: no cover - defensive
        _call(log_error, f"[download] Error: {exc}")
        if wrote_file:
            # Do not leave a truncated file behind.
            try:
                Path(output_path).unlink()
            except OSError:
                pass  # nothing to clean up, or the file cannot be removed
        return False
    finally:
        if response is not None:
            response.close()
        if owns_session:
            session.close()
|
||||
Reference in New Issue
Block a user