f
This commit is contained in:
@@ -12,7 +12,7 @@ from urllib.parse import urljoin, urlparse, unquote
|
|||||||
|
|
||||||
from ProviderCore.base import Provider, SearchResult
|
from ProviderCore.base import Provider, SearchResult
|
||||||
from SYS.utils import sanitize_filename
|
from SYS.utils import sanitize_filename
|
||||||
from SYS.logger import log
|
from SYS.logger import log, debug
|
||||||
from SYS.models import ProgressBar
|
from SYS.models import ProgressBar
|
||||||
|
|
||||||
# Optional dependency for HTML scraping fallbacks
|
# Optional dependency for HTML scraping fallbacks
|
||||||
@@ -786,6 +786,10 @@ class Libgen(Provider):
|
|||||||
md = getattr(result, "full_metadata", None)
|
md = getattr(result, "full_metadata", None)
|
||||||
if not isinstance(md, dict):
|
if not isinstance(md, dict):
|
||||||
md = {}
|
md = {}
|
||||||
|
try:
|
||||||
|
setattr(result, "full_metadata", md)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
title = str(getattr(result, "title", "") or "").strip()
|
title = str(getattr(result, "title", "") or "").strip()
|
||||||
md5 = str(md.get("md5") or "").strip()
|
md5 = str(md.get("md5") or "").strip()
|
||||||
@@ -860,7 +864,11 @@ class Libgen(Provider):
|
|||||||
last_progress_time[0] = now
|
last_progress_time[0] = now
|
||||||
|
|
||||||
ok, final_path = download_from_mirror(
|
ok, final_path = download_from_mirror(
|
||||||
target, out_path, progress_callback=progress_callback
|
target,
|
||||||
|
out_path,
|
||||||
|
progress_callback=progress_callback,
|
||||||
|
log_info=debug,
|
||||||
|
log_error=log,
|
||||||
)
|
)
|
||||||
progress_bar.finish()
|
progress_bar.finish()
|
||||||
if ok and final_path:
|
if ok and final_path:
|
||||||
@@ -868,15 +876,16 @@ class Libgen(Provider):
|
|||||||
# and then enrich tags via OpenLibrary/isbnsearch. This ensures enrichment never
|
# and then enrich tags via OpenLibrary/isbnsearch. This ensures enrichment never
|
||||||
# blocks the download itself.
|
# blocks the download itself.
|
||||||
try:
|
try:
|
||||||
if isinstance(target, str) and target.startswith("http"):
|
target_str = str(target)
|
||||||
low = target.lower()
|
if isinstance(target, str) and target_str.startswith("http"):
|
||||||
|
low = target_str.lower()
|
||||||
# Preferred: ads.php pages often embed a complete tag block.
|
# Preferred: ads.php pages often embed a complete tag block.
|
||||||
# Parse it post-download (best-effort) and do NOT perform external
|
# Parse it post-download (best-effort) and do NOT perform external
|
||||||
# enrichment (OpenLibrary/isbnsearch) unless the user later chooses to.
|
# enrichment (OpenLibrary/isbnsearch) unless the user later chooses to.
|
||||||
if ("/ads.php" in low) or ("/get.php" in low):
|
if ("/ads.php" in low) or ("/get.php" in low):
|
||||||
ads_url = (
|
ads_url = (
|
||||||
target if "/ads.php" in low else
|
target_str if "/ads.php" in low else
|
||||||
_libgen_ads_url_for_target(target)
|
_libgen_ads_url_for_target(target_str)
|
||||||
)
|
)
|
||||||
if ads_url:
|
if ads_url:
|
||||||
html = _fetch_libgen_details_html(
|
html = _fetch_libgen_details_html(
|
||||||
@@ -889,6 +898,7 @@ class Libgen(Provider):
|
|||||||
extracted_title = str(meta.get("title")
|
extracted_title = str(meta.get("title")
|
||||||
or "").strip()
|
or "").strip()
|
||||||
if extracted_title:
|
if extracted_title:
|
||||||
|
if md is not None:
|
||||||
md["title"] = extracted_title
|
md["title"] = extracted_title
|
||||||
result.tag.add(f"title:{extracted_title}")
|
result.tag.add(f"title:{extracted_title}")
|
||||||
if (not title) or title.startswith("http"):
|
if (not title) or title.startswith("http"):
|
||||||
@@ -906,16 +916,19 @@ class Libgen(Provider):
|
|||||||
|
|
||||||
publisher = str(meta.get("publisher") or "").strip()
|
publisher = str(meta.get("publisher") or "").strip()
|
||||||
if publisher:
|
if publisher:
|
||||||
|
if md is not None:
|
||||||
md["publisher"] = publisher
|
md["publisher"] = publisher
|
||||||
result.tag.add(f"publisher:{publisher}")
|
result.tag.add(f"publisher:{publisher}")
|
||||||
|
|
||||||
year = str(meta.get("year") or "").strip()
|
year = str(meta.get("year") or "").strip()
|
||||||
if year:
|
if year:
|
||||||
|
if md is not None:
|
||||||
md["year"] = year
|
md["year"] = year
|
||||||
result.tag.add(f"year:{year}")
|
result.tag.add(f"year:{year}")
|
||||||
|
|
||||||
language = str(meta.get("language") or "").strip()
|
language = str(meta.get("language") or "").strip()
|
||||||
if language:
|
if language:
|
||||||
|
if md is not None:
|
||||||
md["language"] = language
|
md["language"] = language
|
||||||
result.tag.add(f"language:{language}")
|
result.tag.add(f"language:{language}")
|
||||||
|
|
||||||
@@ -929,6 +942,7 @@ class Libgen(Provider):
|
|||||||
if str(x).strip()
|
if str(x).strip()
|
||||||
]
|
]
|
||||||
if isbns:
|
if isbns:
|
||||||
|
if md is not None:
|
||||||
md["isbn"] = isbns
|
md["isbn"] = isbns
|
||||||
for isbn_val in isbns:
|
for isbn_val in isbns:
|
||||||
result.tag.add(f"isbn:{isbn_val}")
|
result.tag.add(f"isbn:{isbn_val}")
|
||||||
@@ -971,12 +985,12 @@ class Libgen(Provider):
|
|||||||
# Legacy: edition/file/series details pages (title + ISBN) + external enrichment.
|
# Legacy: edition/file/series details pages (title + ISBN) + external enrichment.
|
||||||
if (("/edition.php" in low) or ("/file.php" in low)
|
if (("/edition.php" in low) or ("/file.php" in low)
|
||||||
or ("/series.php" in low)):
|
or ("/series.php" in low)):
|
||||||
html = _fetch_libgen_details_html(target)
|
html = _fetch_libgen_details_html(target_str)
|
||||||
if html:
|
if html:
|
||||||
meta = _parse_libgen_details_html(html)
|
meta = _parse_libgen_details_html(html)
|
||||||
|
|
||||||
if not meta.get("edition_id"):
|
if not meta.get("edition_id"):
|
||||||
eid = _libgen_id_from_url(target)
|
eid = _libgen_id_from_url(target_str)
|
||||||
if eid:
|
if eid:
|
||||||
meta["edition_id"] = eid
|
meta["edition_id"] = eid
|
||||||
|
|
||||||
@@ -992,9 +1006,11 @@ class Libgen(Provider):
|
|||||||
]
|
]
|
||||||
|
|
||||||
if extracted_title:
|
if extracted_title:
|
||||||
|
if md is not None:
|
||||||
md["title"] = extracted_title
|
md["title"] = extracted_title
|
||||||
result.tag.add(f"title:{extracted_title}")
|
result.tag.add(f"title:{extracted_title}")
|
||||||
if extracted_isbns:
|
if extracted_isbns:
|
||||||
|
if md is not None:
|
||||||
md["isbn"] = extracted_isbns
|
md["isbn"] = extracted_isbns
|
||||||
for isbn_val in extracted_isbns:
|
for isbn_val in extracted_isbns:
|
||||||
isbn_norm = str(isbn_val
|
isbn_norm = str(isbn_val
|
||||||
@@ -1003,6 +1019,7 @@ class Libgen(Provider):
|
|||||||
if isbn_norm:
|
if isbn_norm:
|
||||||
result.tag.add(f"isbn:{isbn_norm}")
|
result.tag.add(f"isbn:{isbn_norm}")
|
||||||
if meta.get("edition_id"):
|
if meta.get("edition_id"):
|
||||||
|
if md is not None:
|
||||||
md["edition_id"] = str(meta.get("edition_id"))
|
md["edition_id"] = str(meta.get("edition_id"))
|
||||||
|
|
||||||
preferred_isbn = _prefer_isbn(extracted_isbns)
|
preferred_isbn = _prefer_isbn(extracted_isbns)
|
||||||
@@ -1017,15 +1034,40 @@ class Libgen(Provider):
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
if enriched_source:
|
if enriched_source:
|
||||||
|
if md is not None:
|
||||||
md["metadata_enriched_from"] = enriched_source
|
md["metadata_enriched_from"] = enriched_source
|
||||||
|
|
||||||
if extracted_title and ((not title)
|
if extracted_title and ((not title)
|
||||||
or title.startswith("http")):
|
or title.startswith("http")):
|
||||||
title = extracted_title
|
title = extracted_title
|
||||||
except Exception:
|
except Exception as e:
|
||||||
pass
|
debug(f"[libgen] Post-download enrichment failed: {e}")
|
||||||
|
|
||||||
|
debug(f"[libgen] Returning downloaded path: {final_path}")
|
||||||
return Path(final_path)
|
return Path(final_path)
|
||||||
|
|
||||||
|
debug(f"[libgen] Download mirror failed (ok={ok}, path={final_path})")
|
||||||
|
return None
|
||||||
|
except Exception as exc:
|
||||||
|
debug(f"[libgen] Download exception: {exc}")
|
||||||
|
import traceback
|
||||||
|
debug(traceback.format_exc())
|
||||||
|
return None
|
||||||
|
|
||||||
|
def download_url(self, url: str, output_dir: Path) -> Optional[Path]:
|
||||||
|
"""Download a direct LibGen URL using the regular mirror logic."""
|
||||||
|
try:
|
||||||
|
from ProviderCore.base import SearchResult
|
||||||
|
sr = SearchResult(
|
||||||
|
table="libgen",
|
||||||
|
title="libgen",
|
||||||
|
path=url,
|
||||||
|
full_metadata={
|
||||||
|
"md5": _libgen_md5_from_url(url)
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return self.download(sr, output_dir)
|
||||||
|
except Exception:
|
||||||
return None
|
return None
|
||||||
except Exception:
|
except Exception:
|
||||||
return None
|
return None
|
||||||
@@ -1635,71 +1677,53 @@ def _resolve_download_url(
|
|||||||
# ads.php?md5=... -> get.php?md5=...
|
# ads.php?md5=... -> get.php?md5=...
|
||||||
# get.php?md5=... -> file response
|
# get.php?md5=... -> file response
|
||||||
|
|
||||||
|
# Use a more relaxed regex for href that handles spaces and missing quotes.
|
||||||
|
# Format: href [space] = [space] [quote] link [quote]
|
||||||
|
def _find_link(pattern: str) -> Optional[str]:
|
||||||
|
# This regex allows:
|
||||||
|
# href="link"
|
||||||
|
# href='link'
|
||||||
|
# href=link
|
||||||
|
# href = "link"
|
||||||
|
regex = r"href\s*=\s*['\"]?(" + pattern + r")['\"]?"
|
||||||
|
match = re.search(regex, html, flags=re.IGNORECASE)
|
||||||
|
if match:
|
||||||
|
u = str(match.group(1) or "").strip()
|
||||||
|
# Strip trailing quotes if the regex over-captured (e.g. unquoted link followed by space/quote)
|
||||||
|
u = u.split("'")[0].split('"')[0].split(">")[0].split(" ")[0].strip()
|
||||||
|
if u and not u.lower().startswith("javascript:"):
|
||||||
|
return urljoin(base_url, u)
|
||||||
|
return None
|
||||||
|
|
||||||
# Handle edition -> file links.
|
# Handle edition -> file links.
|
||||||
m = re.search(
|
found = _find_link(r'[^"\' >]*file\.php\?id=\d+[^"\' >]*')
|
||||||
r'href=["\']([^"\']*file\.php\?id=\d+[^"\']*)["\']',
|
if found:
|
||||||
html,
|
return found
|
||||||
flags=re.IGNORECASE
|
|
||||||
)
|
|
||||||
if m:
|
|
||||||
href = str(m.group(1) or "").strip()
|
|
||||||
if href and not href.lower().startswith("javascript:"):
|
|
||||||
return urljoin(base_url, href)
|
|
||||||
|
|
||||||
# Handle series -> edition links.
|
# Handle series -> edition links.
|
||||||
m = re.search(
|
found = _find_link(r'[^"\' >]*edition\.php\?id=\d+[^"\' >]*')
|
||||||
r'href=["\']([^"\']*edition\.php\?id=\d+[^"\']*)["\']',
|
if found:
|
||||||
html,
|
return found
|
||||||
flags=re.IGNORECASE
|
|
||||||
)
|
|
||||||
if m:
|
|
||||||
href = str(m.group(1) or "").strip()
|
|
||||||
if href and not href.lower().startswith("javascript:"):
|
|
||||||
return urljoin(base_url, href)
|
|
||||||
|
|
||||||
# Handle file -> ads/get links (sometimes present as the "Libgen" mirror).
|
# Handle file -> ads/get links (sometimes present as the "Libgen" mirror).
|
||||||
m = re.search(
|
found = _find_link(r'[^"\' >]*ads\.php\?md5=[a-fA-F0-9]{32}[^"\' >]*')
|
||||||
r'href=["\']([^"\']*ads\.php\?md5=[a-fA-F0-9]{32}[^"\']*)["\']',
|
if found:
|
||||||
html,
|
return found
|
||||||
flags=re.IGNORECASE,
|
|
||||||
)
|
|
||||||
if m:
|
|
||||||
href = str(m.group(1) or "").strip()
|
|
||||||
if href and not href.lower().startswith("javascript:"):
|
|
||||||
return urljoin(base_url, href)
|
|
||||||
|
|
||||||
# Prefer explicit get.php md5 links (most common successful chain).
|
# Prefer explicit get.php md5 links (most common successful chain).
|
||||||
m = re.search(
|
found = _find_link(r'[^"\' >]*get\.php\?md5=[a-fA-F0-9]{32}[^"\' >]*')
|
||||||
r'href=["\']([^"\']*get\.php\?md5=[a-fA-F0-9]{32}[^"\']*)["\']',
|
if found:
|
||||||
html,
|
return found
|
||||||
flags=re.IGNORECASE,
|
|
||||||
)
|
|
||||||
if m:
|
|
||||||
href = str(m.group(1) or "").strip()
|
|
||||||
if href and not href.lower().startswith("javascript:"):
|
|
||||||
return urljoin(base_url, href)
|
|
||||||
|
|
||||||
# Next: library.lol main links.
|
# Next: library.lol main links.
|
||||||
m = re.search(
|
found = _find_link(r'[^"\' >]*library\.lol[^"\' >]*')
|
||||||
r'href=["\']([^"\']*library\.lol[^"\']*)["\']',
|
if found:
|
||||||
html,
|
return found
|
||||||
flags=re.IGNORECASE
|
|
||||||
)
|
|
||||||
if m:
|
|
||||||
href = str(m.group(1) or "").strip()
|
|
||||||
if href and not href.lower().startswith("javascript:"):
|
|
||||||
return urljoin(base_url, href)
|
|
||||||
|
|
||||||
# Finally: any direct file extension link.
|
# Finally: any direct file extension link.
|
||||||
m = re.search(
|
found = _find_link(r'[^"\' >]+\.(?:pdf|epub|mobi|djvu|azw3|cbz|cbr)(?:\?[^"\' >]*)?')
|
||||||
r'href=["\']([^"\']+\.(?:pdf|epub|mobi|djvu|azw3|cbz|cbr)(?:\?[^"\']*)?)["\']',
|
if found:
|
||||||
html,
|
return found
|
||||||
flags=re.IGNORECASE,
|
|
||||||
)
|
|
||||||
if m:
|
|
||||||
href = str(m.group(1) or "").strip()
|
|
||||||
if href and not href.lower().startswith("javascript:"):
|
|
||||||
return urljoin(base_url, href)
|
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -1713,12 +1737,12 @@ def _resolve_download_url(
|
|||||||
return href
|
return href
|
||||||
return None
|
return None
|
||||||
|
|
||||||
for _ in range(6):
|
for idx in range(10):
|
||||||
if current_url in visited:
|
if current_url in visited:
|
||||||
break
|
break
|
||||||
visited.add(current_url)
|
visited.add(current_url)
|
||||||
|
|
||||||
_call(log_info, f"[resolve] Checking: {current_url}")
|
_call(log_info, f"[resolve] Loop {idx+1} Checking: {current_url}")
|
||||||
|
|
||||||
if current_url.lower().endswith((".pdf",
|
if current_url.lower().endswith((".pdf",
|
||||||
".epub",
|
".epub",
|
||||||
@@ -1727,14 +1751,16 @@ def _resolve_download_url(
|
|||||||
".azw3",
|
".azw3",
|
||||||
".cbz",
|
".cbz",
|
||||||
".cbr")):
|
".cbr")):
|
||||||
|
_call(log_info, f"[resolve] URL looks like direct file: {current_url}")
|
||||||
return current_url
|
return current_url
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with session.get(current_url, stream=True, timeout=30) as resp:
|
with session.get(current_url, stream=True, timeout=30) as resp:
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
ct = resp.headers.get("Content-Type", "").lower()
|
ct = str(resp.headers.get("Content-Type", "")).lower()
|
||||||
|
|
||||||
if "text/html" not in ct:
|
if "text/html" not in ct:
|
||||||
|
_call(log_info, f"[resolve] URL returned non-HTML ({ct}): {current_url}")
|
||||||
return current_url
|
return current_url
|
||||||
|
|
||||||
content = resp.text
|
content = resp.text
|
||||||
@@ -1823,12 +1849,20 @@ def _guess_filename_extension(download_url: str,
|
|||||||
parsed = urlparse(download_url)
|
parsed = urlparse(download_url)
|
||||||
suffix = Path(parsed.path).suffix
|
suffix = Path(parsed.path).suffix
|
||||||
if suffix:
|
if suffix:
|
||||||
return suffix.lstrip(".")
|
ext = suffix.lstrip(".").lower()
|
||||||
|
if ext not in {"php",
|
||||||
|
"php3",
|
||||||
|
"html",
|
||||||
|
"htm",
|
||||||
|
"aspx",
|
||||||
|
"asp"}:
|
||||||
|
return ext
|
||||||
|
|
||||||
content_type = headers.get("content-type", "").lower()
|
content_type = headers.get("content-type", "").lower()
|
||||||
mime_map = {
|
mime_map = {
|
||||||
"application/pdf": "pdf",
|
"application/pdf": "pdf",
|
||||||
"application/epub+zip": "epub",
|
"application/epub+zip": "epub",
|
||||||
|
"application/epub": "epub",
|
||||||
"application/x-mobipocket-ebook": "mobi",
|
"application/x-mobipocket-ebook": "mobi",
|
||||||
"application/x-cbr": "cbr",
|
"application/x-cbr": "cbr",
|
||||||
"application/x-cbz": "cbz",
|
"application/x-cbz": "cbz",
|
||||||
@@ -1879,6 +1913,18 @@ def download_from_mirror(
|
|||||||
Optional[Path]]:
|
Optional[Path]]:
|
||||||
"""Download file from a LibGen mirror URL with optional progress tracking."""
|
"""Download file from a LibGen mirror URL with optional progress tracking."""
|
||||||
session = session or requests.Session()
|
session = session or requests.Session()
|
||||||
|
# Ensure a modern browser User-Agent is used for downloads to avoid mirror blocks.
|
||||||
|
if not any(
|
||||||
|
k.lower() == "user-agent"
|
||||||
|
for k in (session.headers or {})
|
||||||
|
):
|
||||||
|
session.headers.update(
|
||||||
|
{
|
||||||
|
"User-Agent":
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
output_path = Path(output_path)
|
output_path = Path(output_path)
|
||||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
@@ -1891,7 +1937,7 @@ def download_from_mirror(
|
|||||||
_call(log_error, "[download] Could not find direct download link")
|
_call(log_error, "[download] Could not find direct download link")
|
||||||
return False, None
|
return False, None
|
||||||
|
|
||||||
_call(log_info, f"[download] Downloading from: {download_url}")
|
_call(log_info, f"[download] Resolved final file URL: {download_url}")
|
||||||
|
|
||||||
downloaded = 0
|
downloaded = 0
|
||||||
total_size = 0
|
total_size = 0
|
||||||
@@ -1908,6 +1954,7 @@ def download_from_mirror(
|
|||||||
return False, None
|
return False, None
|
||||||
|
|
||||||
total_size = int(headers.get("content-length", 0) or 0)
|
total_size = int(headers.get("content-length", 0) or 0)
|
||||||
|
_call(log_info, f"[download] Starting transfer ({total_size} bytes)")
|
||||||
|
|
||||||
with open(output_path, "wb") as f:
|
with open(output_path, "wb") as f:
|
||||||
for chunk in r.iter_content(chunk_size=8192):
|
for chunk in r.iter_content(chunk_size=8192):
|
||||||
@@ -1923,7 +1970,7 @@ def download_from_mirror(
|
|||||||
if progress_callback and total_size > 0:
|
if progress_callback and total_size > 0:
|
||||||
progress_callback(downloaded, total_size)
|
progress_callback(downloaded, total_size)
|
||||||
|
|
||||||
_call(log_info, f"[download] Saved to {final_path}")
|
_call(log_info, f"[download] Successfully saved to {final_path}")
|
||||||
return True, final_path
|
return True, final_path
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@@ -401,22 +401,11 @@ class Download_File(Cmdlet):
|
|||||||
|
|
||||||
downloaded_path = provider_obj.download(sr, output_dir)
|
downloaded_path = provider_obj.download(sr, output_dir)
|
||||||
provider_sr = sr
|
provider_sr = sr
|
||||||
|
debug(f"[download-file] Provider download result: {downloaded_path}")
|
||||||
|
|
||||||
if downloaded_path is None:
|
# Fallback: if we have a direct HTTP URL and no provider successfully handled it
|
||||||
# Some providers might work via callback 'download_items', mostly legacy.
|
if (downloaded_path is None and not attempted_provider_download
|
||||||
# If relevant, check for it.
|
and isinstance(target, str) and target.startswith("http")):
|
||||||
download_items = getattr(provider_obj, "download_items", None)
|
|
||||||
if callable(download_items):
|
|
||||||
pass # We can implement generic callback support if needed,
|
|
||||||
# but pure download() is preferred.
|
|
||||||
|
|
||||||
# Fallback: if we have a direct HTTP URL, download it directly
|
|
||||||
if (downloaded_path is None and isinstance(target,
|
|
||||||
str)
|
|
||||||
and target.startswith("http")):
|
|
||||||
|
|
||||||
# Generic guard for known "not-a-file" URLs could go here or in a helper,
|
|
||||||
# but for now we rely on user or provider.
|
|
||||||
|
|
||||||
debug(
|
debug(
|
||||||
f"[download-file] Provider item looks like direct URL, downloading: {target}"
|
f"[download-file] Provider item looks like direct URL, downloading: {target}"
|
||||||
|
|||||||
Reference in New Issue
Block a user