f

2026-01-14 19:23:22 -08:00
parent 838008a933
commit 4b324b1e8e
2 changed files with 131 additions and 95 deletions
@@ -12,7 +12,7 @@ from urllib.parse import urljoin, urlparse, unquote
 from ProviderCore.base import Provider, SearchResult
 from SYS.utils import sanitize_filename
-from SYS.logger import log
+from SYS.logger import log, debug
 from SYS.models import ProgressBar
 # Optional dependency for HTML scraping fallbacks
@@ -786,6 +786,10 @@ class Libgen(Provider):
            md = getattr(result, "full_metadata", None)
            if not isinstance(md, dict):
                md = {}
                try:
                    setattr(result, "full_metadata", md)
                except Exception:
                    pass
            title = str(getattr(result, "title", "") or "").strip()
            md5 = str(md.get("md5") or "").strip()
@@ -860,7 +864,11 @@ class Libgen(Provider):
                last_progress_time[0] = now
            ok, final_path = download_from_mirror(
-                target, out_path, progress_callback=progress_callback
+                target,
                out_path,
                progress_callback=progress_callback,
                log_info=debug,
                log_error=log,
            )
            progress_bar.finish()
            if ok and final_path:
@@ -868,15 +876,16 @@ class Libgen(Provider):
                # and then enrich tags via OpenLibrary/isbnsearch. This ensures enrichment never
                # blocks the download itself.
                try:
-                    if isinstance(target, str) and target.startswith("http"):
+                    target_str = str(target)
-                        low = target.lower()
+                    if isinstance(target, str) and target_str.startswith("http"):
                        low = target_str.lower()
                        # Preferred: ads.php pages often embed a complete tag block.
                        # Parse it post-download (best-effort) and do NOT perform external
                        # enrichment (OpenLibrary/isbnsearch) unless the user later chooses to.
                        if ("/ads.php" in low) or ("/get.php" in low):
                            ads_url = (
-                                target if "/ads.php" in low else
+                                target_str if "/ads.php" in low else
-                                _libgen_ads_url_for_target(target)
+                                _libgen_ads_url_for_target(target_str)
                            )
                            if ads_url:
                                html = _fetch_libgen_details_html(
@@ -889,6 +898,7 @@ class Libgen(Provider):
                                    extracted_title = str(meta.get("title")
                                                          or "").strip()
                                    if extracted_title:
                                        if md is not None:
                                            md["title"] = extracted_title
                                        result.tag.add(f"title:{extracted_title}")
                                        if (not title) or title.startswith("http"):
@@ -906,16 +916,19 @@ class Libgen(Provider):
                                    publisher = str(meta.get("publisher") or "").strip()
                                    if publisher:
                                        if md is not None:
                                            md["publisher"] = publisher
                                        result.tag.add(f"publisher:{publisher}")
                                    year = str(meta.get("year") or "").strip()
                                    if year:
                                        if md is not None:
                                            md["year"] = year
                                        result.tag.add(f"year:{year}")
                                    language = str(meta.get("language") or "").strip()
                                    if language:
                                        if md is not None:
                                            md["language"] = language
                                        result.tag.add(f"language:{language}")
@@ -929,6 +942,7 @@ class Libgen(Provider):
                                        if str(x).strip()
                                    ]
                                    if isbns:
                                        if md is not None:
                                            md["isbn"] = isbns
                                        for isbn_val in isbns:
                                            result.tag.add(f"isbn:{isbn_val}")
@@ -971,12 +985,12 @@ class Libgen(Provider):
                        # Legacy: edition/file/series details pages (title + ISBN) + external enrichment.
                        if (("/edition.php" in low) or ("/file.php" in low)
                                or ("/series.php" in low)):
-                            html = _fetch_libgen_details_html(target)
+                            html = _fetch_libgen_details_html(target_str)
                            if html:
                                meta = _parse_libgen_details_html(html)
                                if not meta.get("edition_id"):
-                                    eid = _libgen_id_from_url(target)
+                                    eid = _libgen_id_from_url(target_str)
                                    if eid:
                                        meta["edition_id"] = eid
@@ -992,9 +1006,11 @@ class Libgen(Provider):
                                ]
                                if extracted_title:
                                    if md is not None:
                                        md["title"] = extracted_title
                                    result.tag.add(f"title:{extracted_title}")
                                if extracted_isbns:
                                    if md is not None:
                                        md["isbn"] = extracted_isbns
                                    for isbn_val in extracted_isbns:
                                        isbn_norm = str(isbn_val
@@ -1003,6 +1019,7 @@ class Libgen(Provider):
                                        if isbn_norm:
                                            result.tag.add(f"isbn:{isbn_norm}")
                                if meta.get("edition_id"):
                                    if md is not None:
                                        md["edition_id"] = str(meta.get("edition_id"))
                                preferred_isbn = _prefer_isbn(extracted_isbns)
@@ -1017,15 +1034,40 @@ class Libgen(Provider):
                                        except Exception:
                                            pass
                                    if enriched_source:
                                        if md is not None:
                                            md["metadata_enriched_from"] = enriched_source
                if extracted_title and ((not title)
                                                        or title.startswith("http")):
                                    title = extracted_title
-                except Exception:
+                except Exception as e:
-                    pass
+                    debug(f"[libgen] Post-download enrichment failed: {e}")
                debug(f"[libgen] Returning downloaded path: {final_path}")
                return Path(final_path)
            debug(f"[libgen] Download mirror failed (ok={ok}, path={final_path})")
            return None
        except Exception as exc:
            debug(f"[libgen] Download exception: {exc}")
            import traceback
            debug(traceback.format_exc())
            return None
    def download_url(self, url: str, output_dir: Path) -> Optional[Path]:
        """Download a direct LibGen URL using the regular mirror logic.

        The URL is wrapped in a minimal ``SearchResult`` (table ``"libgen"``,
        placeholder title ``"libgen"``, md5 derived from the URL via
        ``_libgen_md5_from_url``) and handed to :meth:`download`.

        Args:
            url: The LibGen URL to download.
            output_dir: Directory passed through unchanged to :meth:`download`.

        Returns:
            Whatever :meth:`download` returns, or ``None`` if building the
            search result or running the download raised an exception.
        """
        try:
            # SearchResult is already imported at module level from
            # ProviderCore.base, so no function-local import is needed.
            sr = SearchResult(
                table="libgen",
                title="libgen",
                path=url,
                full_metadata={
                    "md5": _libgen_md5_from_url(url)
                }
            )
            return self.download(sr, output_dir)
        except Exception as exc:
            # Best-effort contract: never propagate to the caller, but leave a
            # trace (same style as download()) instead of failing silently.
            # A second identical `except Exception` clause used to follow this
            # one; it was unreachable and has been dropped.
            debug(f"[libgen] download_url failed: {exc}")
            return None
@@ -1635,71 +1677,53 @@ def _resolve_download_url(
        #   ads.php?md5=...     -> get.php?md5=...
        #   get.php?md5=...     -> file response
        # Use a more relaxed regex for href that handles spaces and missing quotes.
        # Format: href [space] = [space] [quote] link [quote]
        def _find_link(pattern: str) -> Optional[str]:
            """Return the first href in ``html`` matching ``pattern``, made absolute.

            The href matcher is deliberately lenient and accepts all of:
              href="link"
              href='link'
              href=link
              href = "link"

            ``pattern`` is spliced into the regex as the capture group. The
            captured text is trimmed at the first quote, ``>`` or space in case
            the lenient match over-captured (e.g. an unquoted link followed by
            another attribute). ``javascript:`` links and empty captures yield
            ``None``; otherwise the link is resolved against ``base_url``.
            """
            href_regex = r"href\s*=\s*['\"]?(" + pattern + r")['\"]?"
            hit = re.search(href_regex, html, flags=re.IGNORECASE)
            if not hit:
                return None

            candidate = str(hit.group(1) or "").strip()
            # Cut at each delimiter in turn, keeping only the leading piece.
            for delimiter in ("'", '"', ">", " "):
                candidate = candidate.split(delimiter)[0]
            candidate = candidate.strip()

            if not candidate:
                return None
            if candidate.lower().startswith("javascript:"):
                return None
            return urljoin(base_url, candidate)
        # Handle edition -> file links.
-        m = re.search(
+        found = _find_link(r'[^"\' >]*file\.php\?id=\d+[^"\' >]*')
-            r'href=["\']([^"\']*file\.php\?id=\d+[^"\']*)["\']',
+        if found:
-            html,
+            return found
            flags=re.IGNORECASE
        )
        if m:
            href = str(m.group(1) or "").strip()
            if href and not href.lower().startswith("javascript:"):
                return urljoin(base_url, href)
        # Handle series -> edition links.
-        m = re.search(
+        found = _find_link(r'[^"\' >]*edition\.php\?id=\d+[^"\' >]*')
-            r'href=["\']([^"\']*edition\.php\?id=\d+[^"\']*)["\']',
+        if found:
-            html,
+            return found
            flags=re.IGNORECASE
        )
        if m:
            href = str(m.group(1) or "").strip()
            if href and not href.lower().startswith("javascript:"):
                return urljoin(base_url, href)
        # Handle file -> ads/get links (sometimes present as the "Libgen" mirror).
-        m = re.search(
+        found = _find_link(r'[^"\' >]*ads\.php\?md5=[a-fA-F0-9]{32}[^"\' >]*')
-            r'href=["\']([^"\']*ads\.php\?md5=[a-fA-F0-9]{32}[^"\']*)["\']',
+        if found:
-            html,
+            return found
            flags=re.IGNORECASE,
        )
        if m:
            href = str(m.group(1) or "").strip()
            if href and not href.lower().startswith("javascript:"):
                return urljoin(base_url, href)
        # Prefer explicit get.php md5 links (most common successful chain).
-        m = re.search(
+        found = _find_link(r'[^"\' >]*get\.php\?md5=[a-fA-F0-9]{32}[^"\' >]*')
-            r'href=["\']([^"\']*get\.php\?md5=[a-fA-F0-9]{32}[^"\']*)["\']',
+        if found:
-            html,
+            return found
            flags=re.IGNORECASE,
        )
        if m:
            href = str(m.group(1) or "").strip()
            if href and not href.lower().startswith("javascript:"):
                return urljoin(base_url, href)
        # Next: library.lol main links.
-        m = re.search(
+        found = _find_link(r'[^"\' >]*library\.lol[^"\' >]*')
-            r'href=["\']([^"\']*library\.lol[^"\']*)["\']',
+        if found:
-            html,
+            return found
            flags=re.IGNORECASE
        )
        if m:
            href = str(m.group(1) or "").strip()
            if href and not href.lower().startswith("javascript:"):
                return urljoin(base_url, href)
        # Finally: any direct file extension link.
-        m = re.search(
+        found = _find_link(r'[^"\' >]+\.(?:pdf|epub|mobi|djvu|azw3|cbz|cbr)(?:\?[^"\' >]*)?')
-            r'href=["\']([^"\']+\.(?:pdf|epub|mobi|djvu|azw3|cbz|cbr)(?:\?[^"\']*)?)["\']',
+        if found:
-            html,
+            return found
            flags=re.IGNORECASE,
        )
        if m:
            href = str(m.group(1) or "").strip()
            if href and not href.lower().startswith("javascript:"):
                return urljoin(base_url, href)
        return None
@@ -1713,12 +1737,12 @@ def _resolve_download_url(
                    return href
        return None
-    for _ in range(6):
+    for idx in range(10):
        if current_url in visited:
            break
        visited.add(current_url)
-        _call(log_info, f"[resolve] Checking: {current_url}")
+        _call(log_info, f"[resolve] Loop {idx+1} Checking: {current_url}")
        if current_url.lower().endswith((".pdf",
                                         ".epub",
@@ -1727,14 +1751,16 @@ def _resolve_download_url(
                                         ".azw3",
                                         ".cbz",
                                         ".cbr")):
            _call(log_info, f"[resolve] URL looks like direct file: {current_url}")
            return current_url
        try:
            with session.get(current_url, stream=True, timeout=30) as resp:
                resp.raise_for_status()
-                ct = resp.headers.get("Content-Type", "").lower()
+                ct = str(resp.headers.get("Content-Type", "")).lower()
                if "text/html" not in ct:
                    _call(log_info, f"[resolve] URL returned non-HTML ({ct}): {current_url}")
                    return current_url
                content = resp.text
@@ -1823,12 +1849,20 @@ def _guess_filename_extension(download_url: str,
    parsed = urlparse(download_url)
    suffix = Path(parsed.path).suffix
    if suffix:
-        return suffix.lstrip(".")
+        ext = suffix.lstrip(".").lower()
        if ext not in {"php",
                       "php3",
                       "html",
                       "htm",
                       "aspx",
                       "asp"}:
            return ext
    content_type = headers.get("content-type", "").lower()
    mime_map = {
        "application/pdf": "pdf",
        "application/epub+zip": "epub",
        "application/epub": "epub",
        "application/x-mobipocket-ebook": "mobi",
        "application/x-cbr": "cbr",
        "application/x-cbz": "cbz",
@@ -1879,6 +1913,18 @@ def download_from_mirror(
           Optional[Path]]:
    """Download file from a LibGen mirror URL with optional progress tracking."""
    session = session or requests.Session()
    # Ensure a modern browser User-Agent is used for downloads to avoid mirror blocks.
    if not any(
            k.lower() == "user-agent"
            for k in (session.headers or {})
    ):
        session.headers.update(
            {
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            }
        )
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
@@ -1891,7 +1937,7 @@ def download_from_mirror(
            _call(log_error, "[download] Could not find direct download link")
            return False, None
-        _call(log_info, f"[download] Downloading from: {download_url}")
+        _call(log_info, f"[download] Resolved final file URL: {download_url}")
        downloaded = 0
        total_size = 0
@@ -1908,6 +1954,7 @@ def download_from_mirror(
                return False, None
            total_size = int(headers.get("content-length", 0) or 0)
            _call(log_info, f"[download] Starting transfer ({total_size} bytes)")
            with open(output_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
@@ -1923,7 +1970,7 @@ def download_from_mirror(
        if progress_callback and total_size > 0:
            progress_callback(downloaded, total_size)
-        _call(log_info, f"[download] Saved to {final_path}")
+        _call(log_info, f"[download] Successfully saved to {final_path}")
        return True, final_path
    except Exception as e:
@@ -401,22 +401,11 @@ class Download_File(Cmdlet):
                        downloaded_path = provider_obj.download(sr, output_dir)
                        provider_sr = sr
                        debug(f"[download-file] Provider download result: {downloaded_path}")
-                        if downloaded_path is None:
+                # Fallback: if we have a direct HTTP URL and no provider successfully handled it
-                             # Some providers might work via callback 'download_items', mostly legacy.
+                if (downloaded_path is None and not attempted_provider_download 
-                             # If relevant, check for it.
+                        and isinstance(target, str) and target.startswith("http")):
                             download_items = getattr(provider_obj, "download_items", None)
                             if callable(download_items):
                                pass # We can implement generic callback support if needed, 
                                     # but pure download() is preferred.
                # Fallback: if we have a direct HTTP URL, download it directly
                if (downloaded_path is None and isinstance(target,
                                                           str)
                        and target.startswith("http")):
                    # Generic guard for known "not-a-file" URLs could go here or in a helper,
                    # but for now we rely on user or provider.
                    debug(
                        f"[download-file] Provider item looks like direct URL, downloading: {target}"