diff --git a/Provider/libgen.py b/Provider/libgen.py index bc6e6a8..40b1abe 100644 --- a/Provider/libgen.py +++ b/Provider/libgen.py @@ -12,7 +12,7 @@ from urllib.parse import urljoin, urlparse, unquote from ProviderCore.base import Provider, SearchResult from SYS.utils import sanitize_filename -from SYS.logger import log +from SYS.logger import log, debug from SYS.models import ProgressBar # Optional dependency for HTML scraping fallbacks @@ -786,6 +786,10 @@ class Libgen(Provider): md = getattr(result, "full_metadata", None) if not isinstance(md, dict): md = {} + try: + setattr(result, "full_metadata", md) + except Exception: + pass title = str(getattr(result, "title", "") or "").strip() md5 = str(md.get("md5") or "").strip() @@ -860,7 +864,11 @@ class Libgen(Provider): last_progress_time[0] = now ok, final_path = download_from_mirror( - target, out_path, progress_callback=progress_callback + target, + out_path, + progress_callback=progress_callback, + log_info=debug, + log_error=log, ) progress_bar.finish() if ok and final_path: @@ -868,15 +876,16 @@ class Libgen(Provider): # and then enrich tags via OpenLibrary/isbnsearch. This ensures enrichment never # blocks the download itself. try: - if isinstance(target, str) and target.startswith("http"): - low = target.lower() + target_str = str(target) + if isinstance(target, str) and target_str.startswith("http"): + low = target_str.lower() # Preferred: ads.php pages often embed a complete tag block. # Parse it post-download (best-effort) and do NOT perform external # enrichment (OpenLibrary/isbnsearch) unless the user later chooses to. if ("/ads.php" in low) or ("/get.php" in low): ads_url = ( - target if "/ads.php" in low else - _libgen_ads_url_for_target(target) + target_str if "/ads.php" in low else + _libgen_ads_url_for_target(target_str) ) if ads_url: html = _fetch_libgen_details_html( @@ -889,7 +898,8 @@ class Libgen(Provider): extracted_title = str(meta.get("title") or "").strip() if extracted_title: - md["title"] = extracted_title + if md is not None: + md["title"] = extracted_title result.tag.add(f"title:{extracted_title}") if (not title) or title.startswith("http"): title = extracted_title @@ -906,17 +916,20 @@ class Libgen(Provider): publisher = str(meta.get("publisher") or "").strip() if publisher: - md["publisher"] = publisher + if md is not None: + md["publisher"] = publisher result.tag.add(f"publisher:{publisher}") year = str(meta.get("year") or "").strip() if year: - md["year"] = year + if md is not None: + md["year"] = year result.tag.add(f"year:{year}") language = str(meta.get("language") or "").strip() if language: - md["language"] = language + if md is not None: + md["language"] = language result.tag.add(f"language:{language}") isbns = ( @@ -929,7 +942,8 @@ class Libgen(Provider): if str(x).strip() ] if isbns: - md["isbn"] = isbns + if md is not None: + md["isbn"] = isbns for isbn_val in isbns: result.tag.add(f"isbn:{isbn_val}") @@ -971,12 +985,12 @@ class Libgen(Provider): # Legacy: edition/file/series details pages (title + ISBN) + external enrichment. if (("/edition.php" in low) or ("/file.php" in low) or ("/series.php" in low)): - html = _fetch_libgen_details_html(target) + html = _fetch_libgen_details_html(target_str) if html: meta = _parse_libgen_details_html(html) if not meta.get("edition_id"): - eid = _libgen_id_from_url(target) + eid = _libgen_id_from_url(target_str) if eid: meta["edition_id"] = eid @@ -992,10 +1006,12 @@ class Libgen(Provider): ] if extracted_title: - md["title"] = extracted_title + if md is not None: + md["title"] = extracted_title result.tag.add(f"title:{extracted_title}") if extracted_isbns: - md["isbn"] = extracted_isbns + if md is not None: + md["isbn"] = extracted_isbns for isbn_val in extracted_isbns: isbn_norm = str(isbn_val ).strip().replace("-", @@ -1003,7 +1019,8 @@ class Libgen(Provider): if isbn_norm: result.tag.add(f"isbn:{isbn_norm}") if meta.get("edition_id"): - md["edition_id"] = str(meta.get("edition_id")) + if md is not None: + md["edition_id"] = str(meta.get("edition_id")) preferred_isbn = _prefer_isbn(extracted_isbns) if preferred_isbn: @@ -1017,15 +1034,40 @@ class Libgen(Provider): except Exception: pass if enriched_source: - md["metadata_enriched_from"] = enriched_source + if md is not None: + md["metadata_enriched_from"] = enriched_source - if extracted_title and ((not title) + if extracted_title and ((not title) or title.startswith("http")): title = extracted_title - except Exception: - pass + except Exception as e: + debug(f"[libgen] Post-download enrichment failed: {e}") + debug(f"[libgen] Returning downloaded path: {final_path}") return Path(final_path) + + debug(f"[libgen] Download mirror failed (ok={ok}, path={final_path})") + return None + except Exception as exc: + debug(f"[libgen] Download exception: {exc}") + import traceback + debug(traceback.format_exc()) + return None + + def download_url(self, url: str, output_dir: Path) -> Optional[Path]: + """Download a direct LibGen URL using the regular mirror logic.""" + try: + from ProviderCore.base import SearchResult + sr = SearchResult( + table="libgen", + title="libgen", + path=url, + full_metadata={ + "md5": _libgen_md5_from_url(url) + } + ) + return self.download(sr, output_dir) + except Exception: return None except Exception: return None @@ -1635,71 +1677,53 @@ def _resolve_download_url( # ads.php?md5=... -> get.php?md5=... # get.php?md5=... -> file response + # Use a more relaxed regex for href that handles spaces and missing quotes. + # Format: href [space] = [space] [quote] link [quote] + def _find_link(pattern: str) -> Optional[str]: + # This regex allows: + # href="link" + # href='link' + # href=link + # href = "link" + regex = r"href\s*=\s*['\"]?(" + pattern + r")['\"]?" + match = re.search(regex, html, flags=re.IGNORECASE) + if match: + u = str(match.group(1) or "").strip() + # Strip trailing quotes if the regex over-captured (e.g. unquoted link followed by space/quote) + u = u.split("'")[0].split('"')[0].split(">")[0].split(" ")[0].strip() + if u and not u.lower().startswith("javascript:"): + return urljoin(base_url, u) + return None + # Handle edition -> file links. - m = re.search( - r'href=["\']([^"\']*file\.php\?id=\d+[^"\']*)["\']', - html, - flags=re.IGNORECASE - ) - if m: - href = str(m.group(1) or "").strip() - if href and not href.lower().startswith("javascript:"): - return urljoin(base_url, href) + found = _find_link(r'[^"\' >]*file\.php\?id=\d+[^"\' >]*') + if found: + return found # Handle series -> edition links. - m = re.search( - r'href=["\']([^"\']*edition\.php\?id=\d+[^"\']*)["\']', - html, - flags=re.IGNORECASE - ) - if m: - href = str(m.group(1) or "").strip() - if href and not href.lower().startswith("javascript:"): - return urljoin(base_url, href) + found = _find_link(r'[^"\' >]*edition\.php\?id=\d+[^"\' >]*') + if found: + return found # Handle file -> ads/get links (sometimes present as the "Libgen" mirror). - m = re.search( - r'href=["\']([^"\']*ads\.php\?md5=[a-fA-F0-9]{32}[^"\']*)["\']', - html, - flags=re.IGNORECASE, - ) - if m: - href = str(m.group(1) or "").strip() - if href and not href.lower().startswith("javascript:"): - return urljoin(base_url, href) + found = _find_link(r'[^"\' >]*ads\.php\?md5=[a-fA-F0-9]{32}[^"\' >]*') + if found: + return found # Prefer explicit get.php md5 links (most common successful chain). - m = re.search( - r'href=["\']([^"\']*get\.php\?md5=[a-fA-F0-9]{32}[^"\']*)["\']', - html, - flags=re.IGNORECASE, - ) - if m: - href = str(m.group(1) or "").strip() - if href and not href.lower().startswith("javascript:"): - return urljoin(base_url, href) + found = _find_link(r'[^"\' >]*get\.php\?md5=[a-fA-F0-9]{32}[^"\' >]*') + if found: + return found # Next: library.lol main links. - m = re.search( - r'href=["\']([^"\']*library\.lol[^"\']*)["\']', - html, - flags=re.IGNORECASE - ) - if m: - href = str(m.group(1) or "").strip() - if href and not href.lower().startswith("javascript:"): - return urljoin(base_url, href) + found = _find_link(r'[^"\' >]*library\.lol[^"\' >]*') + if found: + return found # Finally: any direct file extension link. - m = re.search( - r'href=["\']([^"\']+\.(?:pdf|epub|mobi|djvu|azw3|cbz|cbr)(?:\?[^"\']*)?)["\']', - html, - flags=re.IGNORECASE, - ) - if m: - href = str(m.group(1) or "").strip() - if href and not href.lower().startswith("javascript:"): - return urljoin(base_url, href) + found = _find_link(r'[^"\' >]+\.(?:pdf|epub|mobi|djvu|azw3|cbz|cbr)(?:\?[^"\' >]*)?') + if found: + return found return None @@ -1713,12 +1737,12 @@ def _resolve_download_url( return href return None - for _ in range(6): + for idx in range(10): if current_url in visited: break visited.add(current_url) - _call(log_info, f"[resolve] Checking: {current_url}") + _call(log_info, f"[resolve] Loop {idx+1} Checking: {current_url}") if current_url.lower().endswith((".pdf", ".epub", @@ -1727,14 +1751,16 @@ def _resolve_download_url( ".azw3", ".cbz", ".cbr")): + _call(log_info, f"[resolve] URL looks like direct file: {current_url}") return current_url try: with session.get(current_url, stream=True, timeout=30) as resp: resp.raise_for_status() - ct = resp.headers.get("Content-Type", "").lower() + ct = str(resp.headers.get("Content-Type", "")).lower() if "text/html" not in ct: + _call(log_info, f"[resolve] URL returned non-HTML ({ct}): {current_url}") return current_url content = resp.text @@ -1823,12 +1849,20 @@ def _guess_filename_extension(download_url: str, parsed = urlparse(download_url) suffix = Path(parsed.path).suffix if suffix: - return suffix.lstrip(".") + ext = suffix.lstrip(".").lower() + if ext not in {"php", + "php3", + "html", + "htm", + "aspx", + "asp"}: + return ext content_type = headers.get("content-type", "").lower() mime_map = { "application/pdf": "pdf", "application/epub+zip": "epub", + "application/epub": "epub", "application/x-mobipocket-ebook": "mobi", "application/x-cbr": "cbr", "application/x-cbz": "cbz", @@ -1879,6 +1913,18 @@ def download_from_mirror( Optional[Path]]: """Download file from a LibGen mirror URL with optional progress tracking.""" session = session or requests.Session() + # Ensure a modern browser User-Agent is used for downloads to avoid mirror blocks. + if not any( + k.lower() == "user-agent" + for k in (session.headers or {}) + ): + session.headers.update( + { + "User-Agent": + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + } + ) + output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) @@ -1891,7 +1937,7 @@ def download_from_mirror( _call(log_error, "[download] Could not find direct download link") return False, None - _call(log_info, f"[download] Downloading from: {download_url}") + _call(log_info, f"[download] Resolved final file URL: {download_url}") downloaded = 0 total_size = 0 @@ -1908,6 +1954,7 @@ def download_from_mirror( return False, None total_size = int(headers.get("content-length", 0) or 0) + _call(log_info, f"[download] Starting transfer ({total_size} bytes)") with open(output_path, "wb") as f: for chunk in r.iter_content(chunk_size=8192): @@ -1923,7 +1970,7 @@ def download_from_mirror( if progress_callback and total_size > 0: progress_callback(downloaded, total_size) - _call(log_info, f"[download] Saved to {final_path}") + _call(log_info, f"[download] Successfully saved to {final_path}") return True, final_path except Exception as e: diff --git a/cmdlet/download_file.py b/cmdlet/download_file.py index 02f44b2..1a1a3b2 100644 --- a/cmdlet/download_file.py +++ b/cmdlet/download_file.py @@ -401,22 +401,11 @@ class Download_File(Cmdlet): downloaded_path = provider_obj.download(sr, output_dir) provider_sr = sr + debug(f"[download-file] Provider download result: {downloaded_path}") - if downloaded_path is None: - # Some providers might work via callback 'download_items', mostly legacy. - # If relevant, check for it. - download_items = getattr(provider_obj, "download_items", None) - if callable(download_items): - pass # We can implement generic callback support if needed, - # but pure download() is preferred. - - # Fallback: if we have a direct HTTP URL, download it directly - if (downloaded_path is None and isinstance(target, - str) - and target.startswith("http")): - - # Generic guard for known "not-a-file" URLs could go here or in a helper, - # but for now we rely on user or provider. + # Fallback: if we have a direct HTTP URL and no provider successfully handled it + if (downloaded_path is None and not attempted_provider_download + and isinstance(target, str) and target.startswith("http")): debug( f"[download-file] Provider item looks like direct URL, downloading: {target}"