Replace the old libgen backend with a robust scraper based on libgen-api-enhanced logic
"""Shared Library Genesis search and download helpers."""
|
||||
"""Shared Library Genesis search and download helpers.
|
||||
|
||||
Replaces the old libgen backend with a robust scraper based on libgen-api-enhanced logic.
|
||||
Targets libgen.is/rs/st mirrors and parses the results table directly.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, Iterable, List, Optional
|
||||
import logging
|
||||
import re
|
||||
import requests
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
from urllib.parse import quote, urljoin
|
||||
|
||||
from libgen import search_sync, LibgenError
|
||||
# Optional dependencies
|
||||
try:
|
||||
from bs4 import BeautifulSoup
|
||||
except ImportError:
|
||||
BeautifulSoup = None
|
||||
|
||||
LogFn = Optional[Callable[[str], None]]
|
||||
ErrorFn = Optional[Callable[[str], None]]
|
||||
|
||||
DEFAULT_TIMEOUT = 10.0
|
||||
DEFAULT_TIMEOUT = 20.0
|
||||
DEFAULT_LIMIT = 50
|
||||
|
||||
logging.getLogger(__name__).setLevel(logging.WARNING)
|
||||
# Mirrors to try in order
|
||||
MIRRORS = [
|
||||
"https://libgen.is",
|
||||
"https://libgen.rs",
|
||||
"https://libgen.st",
|
||||
"http://libgen.is",
|
||||
"http://libgen.rs",
|
||||
"http://libgen.st",
|
||||
"https://libgen.li", # Different structure, fallback
|
||||
"http://libgen.li",
|
||||
"https://libgen.gl", # Different structure, fallback
|
||||
"http://libgen.gl",
|
||||
]
|
||||
|
||||
logging.getLogger(__name__).setLevel(logging.INFO)
|
||||
|
||||
|
||||
def _call(logger: LogFn, message: str) -> None:
|
||||
@@ -23,168 +46,248 @@ def _call(logger: LogFn, message: str) -> None:
|
||||
logger(message)
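
# The LogFn/ErrorFn callbacks let callers plug in any sink; a sketch (illustrative):
#     search_libgen("some query", log_info=print, log_error=print)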


class LibgenSearch:
    """Robust LibGen searcher that walks MIRRORS in order."""

    def __init__(self, session: Optional[requests.Session] = None):
        self.session = session or requests.Session()
        self.session.headers.update({
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            )
        })

    def search(self, query: str, limit: int = DEFAULT_LIMIT) -> List[Dict[str, Any]]:
        """Search LibGen mirrors, returning results from the first one that works."""
        if not BeautifulSoup:
            logging.error("BeautifulSoup not installed. Cannot search LibGen.")
            return []

        for mirror in MIRRORS:
            try:
                # libgen.li / libgen.gl use a different page structure.
                if "libgen.li" in mirror or "libgen.gl" in mirror:
                    results = self._search_libgen_li(mirror, query, limit)
                else:
                    results = self._search_libgen_rs(mirror, query, limit)

                if results:
                    return results
            except Exception as e:
                logging.debug(f"Mirror {mirror} failed: {e}")
                continue

        return []
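
    # A minimal usage sketch (query and limit are illustrative):
    #
    #     searcher = LibgenSearch()
    #     hits = searcher.search("distributed systems", limit=10)
    #     titles = [h["title"] for h in hits]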

    def _search_libgen_rs(self, mirror: str, query: str, limit: int) -> List[Dict[str, Any]]:
        """Search libgen.rs/is/st style mirrors."""
        # Search URL: /search.php?req=QUERY&res=100&column=def
        url = f"{mirror}/search.php"
        params = {
            "req": query,
            "res": 100,  # Request more than we need; trimmed by `limit` below
            "column": "def",
            "open": 0,
            "view": "simple",
            "phrase": 1,
        }

        resp = self.session.get(url, params=params, timeout=DEFAULT_TIMEOUT)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")

        # Find the results table, usually class 'c'.
        table = soup.find("table", {"class": "c"})
        if not table:
            # Fall back to structure: any table with many rows.
            for t in soup.find_all("table"):
                if len(t.find_all("tr")) > 5:
                    table = t
                    break

        if not table:
            return []

        results = []
        # Skip the header row.
        rows = table.find_all("tr")[1:]

        for row in rows:
            cols = row.find_all("td")
            if len(cols) < 9:
                continue

            # Columns:
            # 0: ID          5: Pages
            # 1: Author(s)   6: Language
            # 2: Title       7: Size
            # 3: Publisher   8: Extension
            # 4: Year        9+: Mirrors
            try:
                libgen_id = cols[0].get_text(strip=True)
                authors = [a.get_text(strip=True) for a in cols[1].find_all("a")]
                if not authors:
                    authors = [cols[1].get_text(strip=True)]

                title_tag = cols[2].find("a")
                title = title_tag.get_text(strip=True) if title_tag else cols[2].get_text(strip=True)

                # Extract the MD5 from the title link if possible;
                # the href is usually 'book/index.php?md5=...'.
                md5 = ""
                if title_tag and title_tag.has_attr("href"):
                    href = title_tag["href"]
                    match = re.search(r"md5=([a-fA-F0-9]{32})", href)
                    if match:
                        md5 = match.group(1)

                publisher = cols[3].get_text(strip=True)
                year = cols[4].get_text(strip=True)
                pages = cols[5].get_text(strip=True)
                language = cols[6].get_text(strip=True)
                size = cols[7].get_text(strip=True)
                extension = cols[8].get_text(strip=True)

                # Mirror links; usually col 9 is http://library.lol/main/MD5.
                mirror_links = []
                for i in range(9, len(cols)):
                    a = cols[i].find("a")
                    if a and a.has_attr("href"):
                        mirror_links.append(a["href"])

                # Construct the direct download page link (library.lol).
                # With an MD5 we can build it directly: http://library.lol/main/{md5}
                if md5:
                    download_link = f"http://library.lol/main/{md5}"
                elif mirror_links:
                    download_link = mirror_links[0]
                else:
                    download_link = ""

                results.append({
                    "id": libgen_id,
                    "title": title,
                    "author": ", ".join(authors),
                    "publisher": publisher,
                    "year": year,
                    "pages": pages,
                    "language": language,
                    "filesize_str": size,
                    "extension": extension,
                    "md5": md5,
                    "mirror_url": download_link,
                    "cover": "",  # Could extract from hover markup if needed
                })

                if len(results) >= limit:
                    break

            except Exception as e:
                logging.debug(f"Error parsing row: {e}")
                continue

        return results
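
    # Shape of one parsed row (values illustrative):
    #     {"id": "1234", "title": "Example Title", "author": "A. Author",
    #      "md5": "0123456789abcdef0123456789abcdef",
    #      "mirror_url": "http://library.lol/main/0123456789abcdef0123456789abcdef",
    #      "extension": "pdf", "filesize_str": "3 MB", "cover": ""}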

    def _search_libgen_li(self, mirror: str, query: str, limit: int) -> List[Dict[str, Any]]:
        """Search libgen.li/gl style mirrors."""
        # Search URL: /index.php?req=QUERY&res=100&covers=on&filesuns=all
        url = f"{mirror}/index.php"
        params = {
            "req": query,
            "res": 100,
            "covers": "on",
            "filesuns": "all",
        }

        resp = self.session.get(url, params=params, timeout=DEFAULT_TIMEOUT)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")
        table = soup.find("table", {"id": "tablelibgen"})
        if not table:
            table = soup.find("table", {"class": "table table-striped"})

        if not table:
            return []

        results = []
        rows = table.find_all("tr")[1:]

        for row in rows:
            cols = row.find_all("td")
            if len(cols) < 9:
                continue

            try:
                # Column layout differs from the .rs/.is/.st mirrors:
                # 0: Cover, 1: Title (link to file.php?id=...), 2: Author,
                # 3: Publisher, 4: Year, 5: Language, 6: Pages, 7: Size,
                # 8: Extension, 9: Mirrors
                title_col = cols[1]
                title_link = title_col.find("a")
                title = title_link.get_text(strip=True) if title_link else title_col.get_text(strip=True)

                # Extract the numeric ID from the link; the href is usually
                # "file.php?id=..." or "edition.php?id=...".
                libgen_id = ""
                if title_link and title_link.has_attr("href"):
                    href = title_link["href"]
                    match = re.search(r"id=(\d+)", href)
                    if match:
                        libgen_id = match.group(1)

                authors = cols[2].get_text(strip=True)
                publisher = cols[3].get_text(strip=True)
                year = cols[4].get_text(strip=True)
                language = cols[5].get_text(strip=True)
                pages = cols[6].get_text(strip=True)
                size = cols[7].get_text(strip=True)
                extension = cols[8].get_text(strip=True)

                # Mirror link is usually in col 9 or the title link itself.
                mirror_url = ""
                if title_link and title_link.has_attr("href"):
                    href = title_link["href"]
                    if href.startswith("/"):
                        mirror_url = mirror + href
                    else:
                        mirror_url = urljoin(mirror, href)

                results.append({
                    "id": libgen_id,
                    "title": title,
                    "author": authors,
                    "publisher": publisher,
                    "year": year,
                    "pages": pages,
                    "language": language,
                    "filesize_str": size,
                    "extension": extension,
                    "md5": "",  # .li doesn't expose the MD5 in the results table
                    "mirror_url": mirror_url,
                })

                if len(results) >= limit:
                    break
            except Exception:
                continue

        return results


def search_libgen(
    query: str,
    limit: int = DEFAULT_LIMIT,
    *,
    log_info: LogFn = None,
    log_error: ErrorFn = None,
    session: Optional[requests.Session] = None,
) -> List[Dict[str, Any]]:
    """Search Libgen using the robust scraper."""
    searcher = LibgenSearch(session=session)
    try:
        _call(log_info, f"[search] Searching Libgen for: {query}")
        results = searcher.search(query, limit=limit)
        _call(log_info, f"[libgen] Found {len(results)} results")
        return results
    except Exception as e:
        _call(log_error, f"[libgen] Search failed: {e}")
        return []
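
# Usage sketch (illustrative; assumes network access to a live mirror):
#
#     hits = search_libgen("compilers", limit=5, log_info=print)
#     for h in hits:
#         print(h["title"], h["extension"], h["mirror_url"])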


def _resolve_download_url(
    session: requests.Session,
    url: str,
    log_info: LogFn = None,
) -> Optional[str]:
    """Resolve the final download URL by following the LibGen page chain."""
    current_url = url
    visited = set()

    # Cap the number of hops to prevent infinite loops.
    for _ in range(6):
        if current_url in visited:
            break
        visited.add(current_url)

        _call(log_info, f"[resolve] Checking: {current_url}")

        # Simple heuristic: if it already looks like a file, return it.
        if current_url.lower().endswith((".pdf", ".epub", ".mobi", ".djvu", ".azw3", ".cbz", ".cbr")):
            return current_url

        try:
            # HEAD would be cheaper, but some mirrors block it or return 405,
            # so GET with stream=True to peek at headers without downloading everything.
            with session.get(current_url, stream=True, timeout=30) as resp:
                resp.raise_for_status()
                ct = resp.headers.get("Content-Type", "").lower()

                if "text/html" not in ct:
                    # It's a binary file.
                    return current_url

                # It's HTML; read the content.
                content = resp.text
        except Exception as e:
            _call(log_info, f"[resolve] Failed to fetch {current_url}: {e}")
            return None

        soup = BeautifulSoup(content, "html.parser")

        # 1. Check for a "GET" link (library.lol / ads.php style),
        #    usually <h2>GET</h2> inside an <a>, or just the text "GET".
        get_link = soup.find("a", string=re.compile(r"^GET$", re.IGNORECASE))
        if not get_link:
            h2_get = soup.find("h2", string=re.compile(r"^GET$", re.IGNORECASE))
            if h2_get and h2_get.parent.name == "a":
                get_link = h2_get.parent

        if get_link and get_link.has_attr("href"):
            return urljoin(current_url, get_link["href"])

        # 2. series.php -> edition.php: follow the first edition link.
        if "series.php" in current_url:
            edition_link = soup.find("a", href=re.compile(r"edition\.php"))
            if edition_link:
                current_url = urljoin(current_url, edition_link["href"])
                continue

        # 3. edition.php -> file.php
        if "edition.php" in current_url:
            file_link = soup.find("a", href=re.compile(r"file\.php"))
            if file_link:
                current_url = urljoin(current_url, file_link["href"])
                continue

        # 4. file.php -> ads.php (the Libgen badge link)
        if "file.php" in current_url:
            libgen_link = soup.find("a", title="libgen")
            if not libgen_link:
                libgen_link = soup.find("a", string=re.compile(r"Libgen", re.IGNORECASE))

            if libgen_link and libgen_link.has_attr("href"):
                current_url = urljoin(current_url, libgen_link["href"])
                continue

        # 5. ads.php -> get.php (fallback if the GET link logic above failed)
        if "ads.php" in current_url:
            get_php_link = soup.find("a", href=re.compile(r"get\.php"))
            if get_php_link:
                return urljoin(current_url, get_php_link["href"])

        # 6. library.lol / generic gateway fallback
        for text in ["Cloudflare", "IPFS.io", "Infura"]:
            link = soup.find("a", string=re.compile(text, re.IGNORECASE))
            if link and link.has_attr("href"):
                return urljoin(current_url, link["href"])

        # Nothing new found; stop.
        break

    return None
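
# Typical hop chain the resolver walks on .li/.gl-style mirrors (URLs illustrative):
#     edition.php?id=123 -> file.php?id=456 -> ads.php?md5=... -> get.php?md5=...
# library.lol-style pages instead expose the direct link behind a single "GET" anchor.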


def download_from_mirror(
    mirror_url: str,
    output_path: Path,
    *,
    log_info: LogFn = None,
    log_error: ErrorFn = None,
    session: Optional[requests.Session] = None,
) -> bool:
    """Download a file from a LibGen mirror URL and write it to disk."""
    session = session or requests.Session()
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    try:
        _call(log_info, f"[download] Resolving download link from: {mirror_url}")

        download_url = _resolve_download_url(session, mirror_url, log_info)

        if not download_url:
            _call(log_error, "[download] Could not find direct download link")
            return False

        _call(log_info, f"[download] Downloading from: {download_url}")

        # Download the actual file.
        with session.get(download_url, stream=True, timeout=60) as r:
            r.raise_for_status()

            # Verify it's not HTML (an error page).
            ct = r.headers.get("content-type", "").lower()
            if "text/html" in ct:
                _call(log_error, "[download] Final URL returned HTML, not a file.")
                return False

            total_size = int(r.headers.get("content-length", 0))
            downloaded = 0

            with open(output_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        downloaded += len(chunk)
                        # Optional: progress logging from downloaded/total_size

        _call(log_info, f"[download] Saved to {output_path}")
        return True

    except Exception as e:
        _call(log_error, f"[download] Download failed: {e}")
        return False
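

# A minimal end-to-end sketch, assuming network access and at least one live
# mirror in MIRRORS (query and output filename are illustrative):
if __name__ == "__main__":
    found = search_libgen("python", limit=3, log_info=print)
    if found and found[0].get("mirror_url"):
        download_from_mirror(found[0]["mirror_url"], Path("example.pdf"), log_info=print)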