hh

2026-01-14 21:53:07 -08:00
parent 4b324b1e8e
commit 5d63777dee
5 changed files with 146 additions and 116 deletions
@@ -202,6 +202,8 @@ class API_folder_store:

    DB_NAME = "medios-macina.db"
    SCHEMA_VERSION = 4
+    # Global lock across all instances to prevent 'database is locked' during concurrent operations.
+    _shared_db_lock = RLock()

    def __init__(self, library_root: Path):
        """Initialize the database at the library root.
@@ -212,10 +214,8 @@ class API_folder_store:
        self.library_root = expand_path(library_root).resolve()
        self.db_path = self.library_root / self.DB_NAME
        self.connection: Optional[sqlite3.Connection] = None
-        # sqlite3 connections are not safe for concurrent use across threads.
-        # We intentionally keep a single connection per API_folder_store instance,
-        # so we must serialize all DB operations on that connection.
-        self._db_lock = RLock()
+        # Use the class-level lock (_shared_db_lock) shared by every instance so DB operations are serialized across instances.
+        self._db_lock = self._shared_db_lock
        self._init_db()

    def _normalize_input_path(self, file_path: Path) -> Path:
@@ -1037,7 +1037,7 @@ class Libgen(Provider):
                                        if md is not None:
                                            md["metadata_enriched_from"] = enriched_source

-                if extracted_title and ((not title)
+                                if extracted_title and ((not title)
                                                        or title.startswith("http")):
                                    title = extracted_title
                except Exception as e:
@@ -1069,8 +1069,6 @@ class Libgen(Provider):
            return self.download(sr, output_dir)
        except Exception:
            return None
-        except Exception:
-            return None


 LogFn = Optional[Callable[[str], None]]
@@ -1656,9 +1654,10 @@ def _resolve_download_url(
    session: requests.Session,
    url: str,
    log_info: LogFn = None,
-) -> Optional[str]:
+) -> Tuple[Optional[str], Optional[str]]:
    """Resolve the final download URL by following the LibGen chain."""
    current_url = url
+    referer = None
    visited = set()

    def _resolve_html_links_regex(base_url: str, html: str) -> Optional[str]:
@@ -1670,57 +1669,44 @@ def _resolve_download_url(
        if not html:
            return None

-        # LibGen chain helpers (for environments without lxml).
-        # Typical chain:
-        #   edition.php?id=...  -> file.php?id=...
-        #   file.php?id=...     -> ads.php?md5=... (or get.php?md5=...)
-        #   ads.php?md5=...     -> get.php?md5=...
-        #   get.php?md5=...     -> file response
-
        # Use a more relaxed regex for href that handles spaces and missing quotes.
-        # Format: href [space] = [space] [quote] link [quote]
        def _find_link(pattern: str) -> Optional[str]:
-            # This regex allows:
-            #   href="link"
-            #   href='link'
-            #   href=link
-            #   href = "link"
            regex = r"href\s*=\s*['\"]?(" + pattern + r")['\"]?"
            match = re.search(regex, html, flags=re.IGNORECASE)
            if match:
                u = str(match.group(1) or "").strip()
-                # Strip trailing quotes if the regex over-captured (e.g. unquoted link followed by space/quote)
                u = u.split("'")[0].split('"')[0].split(">")[0].split(" ")[0].strip()
                if u and not u.lower().startswith("javascript:"):
                    return urljoin(base_url, u)
            return None

-        # Handle edition -> file links.
-        found = _find_link(r'[^"\' >]*file\.php\?id=\d+[^"\' >]*')
+        # Priority patterns for LibGen mirrors (e.g., library.lol, libgen.li)
+        # 1. library.lol "GET" link or direct /main/
+        found = _find_link(r'[^"\' >]*/main/\d+/[^"\' >]*')
        if found:
            return found

-        # Handle series -> edition links.
-        found = _find_link(r'[^"\' >]*edition\.php\?id=\d+[^"\' >]*')
-        if found:
-            return found
-
-        # Handle file -> ads/get links (sometimes present as the "Libgen" mirror).
-        found = _find_link(r'[^"\' >]*ads\.php\?md5=[a-fA-F0-9]{32}[^"\' >]*')
-        if found:
-            return found
-
-        # Prefer explicit get.php md5 links (most common successful chain).
+        # 2. get.php md5 links
        found = _find_link(r'[^"\' >]*get\.php\?md5=[a-fA-F0-9]{32}[^"\' >]*')
        if found:
            return found

-        # Next: library.lol main links.
-        found = _find_link(r'[^"\' >]*library\.lol[^"\' >]*')
+        # 3. ads.php md5 links
+        found = _find_link(r'[^"\' >]*ads\.php\?md5=[a-fA-F0-9]{32}[^"\' >]*')
        if found:
            return found

-        # Finally: any direct file extension link.
+        # 4. file.php id links
+        found = _find_link(r'[^"\' >]*file\.php\?id=\d+[^"\' >]*')
+        if found:
+            return found
+
+        # 5. edition.php id links
+        found = _find_link(r'[^"\' >]*edition\.php\?id=\d+[^"\' >]*')
+        if found:
+            return found
+
+        # 6. Direct file extensions
        found = _find_link(r'[^"\' >]+\.(?:pdf|epub|mobi|djvu|azw3|cbz|cbr)(?:\?[^"\' >]*)?')
        if found:
            return found
@@ -1744,30 +1730,33 @@ def _resolve_download_url(

        _call(log_info, f"[resolve] Loop {idx+1} Checking: {current_url}")

-        if current_url.lower().endswith((".pdf",
-                                         ".epub",
-                                         ".mobi",
-                                         ".djvu",
-                                         ".azw3",
-                                         ".cbz",
-                                         ".cbr")):
+        if current_url.lower().split("?")[0].split("#")[0].endswith(
+            (".pdf", ".epub", ".mobi", ".djvu", ".azw3", ".cbz", ".cbr")
+        ):
            _call(log_info, f"[resolve] URL looks like direct file: {current_url}")
-            return current_url
+            return current_url, referer

        try:
-            with session.get(current_url, stream=True, timeout=30) as resp:
+            # Pass Referer to stay in the mirror's good graces
+            headers = {}
+            if referer:
+                headers["Referer"] = referer
+
+            with session.get(current_url, stream=True, timeout=30, headers=headers) as resp:
                resp.raise_for_status()
                ct = str(resp.headers.get("Content-Type", "")).lower()

                if "text/html" not in ct:
                    _call(log_info, f"[resolve] URL returned non-HTML ({ct}): {current_url}")
-                    return current_url
+                    return current_url, referer

+                # HTML response: read the full body so the next link can be extracted (note: no size limit is enforced here).
                content = resp.text
        except Exception as e:
            _call(log_info, f"[resolve] Failed to fetch {current_url}: {e}")
-            return None
+            return None, None

+        next_url = None
        doc = None
        if lxml_html is not None:
            try:
@@ -1775,58 +1764,58 @@ def _resolve_download_url(
            except Exception:
                doc = None

-        if doc is None:
+        if doc is not None:
+            # Try to find common mirror links via XPath
+            get_href = _find_href_by_text(doc, r"^GET$")
+            if get_href:
+                next_url = urljoin(current_url, get_href)
+
+            if not next_url:
+                # Mirror-specific patterns
+                if "series.php" in current_url:
+                    hrefs = doc.xpath("//a[contains(@href,'edition.php')]/@href")
+                    if hrefs:
+                        next_url = urljoin(current_url, str(hrefs[0] or ""))
+                elif "edition.php" in current_url:
+                    hrefs = doc.xpath("//a[contains(@href,'file.php')]/@href")
+                    if hrefs:
+                        next_url = urljoin(current_url, str(hrefs[0] or ""))
+                elif "file.php" in current_url:
+                    libgen_href = None
+                    for a in doc.xpath("//a[@href]"):
+                        if str(a.get("title") or "").strip().lower() == "libgen":
+                            libgen_href = str(a.get("href") or "").strip()
+                            break
+                    if not libgen_href:
+                        libgen_href = _find_href_by_text(doc, r"Libgen")
+                    if libgen_href:
+                        next_url = urljoin(current_url, libgen_href)
+                elif "ads.php" in current_url:
+                    hrefs = doc.xpath("//a[contains(@href,'get.php')]/@href")
+                    if hrefs:
+                        next_url = urljoin(current_url, str(hrefs[0] or ""))
+
+            if not next_url:
+                # General provider links
+                for text in ["Cloudflare", "IPFS.io", "Infura"]:
+                    href = _find_href_by_text(doc, re.escape(text))
+                    if href:
+                        next_url = urljoin(current_url, href)
+                        break
+
+        # Fallback to regex if XPath failed or lxml is missing
+        if not next_url:
            next_url = _resolve_html_links_regex(current_url, content)
-            if next_url:
-                current_url = next_url
-                continue
-            _call(
-                log_info,
-                "[resolve] lxml not available and regex resolver found no links"
-            )
-            return None

-        get_href = _find_href_by_text(doc, r"^GET$")
-        if get_href:
-            return urljoin(current_url, get_href)
-
-        if "series.php" in current_url:
-            hrefs = doc.xpath("//a[contains(@href,'edition.php')]/@href")
-            if hrefs:
-                current_url = urljoin(current_url, str(hrefs[0] or ""))
-                continue
-
-        if "edition.php" in current_url:
-            hrefs = doc.xpath("//a[contains(@href,'file.php')]/@href")
-            if hrefs:
-                current_url = urljoin(current_url, str(hrefs[0] or ""))
-                continue
-
-        if "file.php" in current_url:
-            libgen_href = None
-            for a in doc.xpath("//a[@href]"):
-                if str(a.get("title") or "").strip().lower() == "libgen":
-                    libgen_href = str(a.get("href") or "").strip()
-                    break
-            if not libgen_href:
-                libgen_href = _find_href_by_text(doc, r"Libgen")
-            if libgen_href:
-                current_url = urljoin(current_url, libgen_href)
-                continue
-
-        if "ads.php" in current_url:
-            hrefs = doc.xpath("//a[contains(@href,'get.php')]/@href")
-            if hrefs:
-                return urljoin(current_url, str(hrefs[0] or ""))
-
-        for text in ["Cloudflare", "IPFS.io", "Infura"]:
-            href = _find_href_by_text(doc, re.escape(text))
-            if href:
-                return urljoin(current_url, href)
+        if next_url:
+            referer = current_url
+            current_url = next_url
+            continue

+        _call(log_info, "[resolve] No further links found in content")
        break

-    return None
+    return None, None


 def _guess_filename_extension(download_url: str,
@@ -1931,7 +1920,7 @@ def download_from_mirror(
    try:
        _call(log_info, f"[download] Resolving download link from: {mirror_url}")

-        download_url = _resolve_download_url(session, mirror_url, log_info)
+        download_url, referer = _resolve_download_url(session, mirror_url, log_info)

        if not download_url:
            _call(log_error, "[download] Could not find direct download link")
@@ -1944,7 +1933,11 @@ def download_from_mirror(
        headers: Dict[str,
                      str] = {}

-        with session.get(download_url, stream=True, timeout=60) as r:
+        req_headers = {}
+        if referer:
+            req_headers["Referer"] = referer
+
+        with session.get(download_url, stream=True, timeout=60, headers=req_headers) as r:
            r.raise_for_status()
            headers = dict(r.headers)

@@ -288,14 +288,20 @@ class ZeroTier(Store):
            url += f"{sep}api_key={self._api_key}"
        return url

-    def download_to_temp(self, file_hash: str, temp_root: Optional[Path] = None, suffix: Optional[str] = None) -> Optional[Path]:
+    def download_to_temp(
+        self,
+        file_hash: str,
+        temp_root: Optional[Path] = None,
+        suffix: Optional[str] = None,
+        progress_callback: Optional[Callable[[int, int], None]] = None,
+    ) -> Optional[Path]:
        """Download a file from the remote peer to a local temporary file."""
        import os
        import httpx
        import tempfile

        if self._service == "hydrus":
-             return None
+            return None

        url = self.get_file(file_hash)
        if not url or not isinstance(url, str) or not url.startswith("http"):
@@ -314,21 +320,32 @@ class ZeroTier(Store):
                fd, tmp_path = tempfile.mkstemp(dir=str(temp_root), suffix=suffix)
            else:
                fd, tmp_path = tempfile.mkstemp(suffix=suffix)
-            
-            os_fd = os.fdopen(fd, 'wb')
-            
+
+            os_fd = os.fdopen(fd, "wb")
+
            headers = {}
            if self._api_key:
                headers["X-API-Key"] = self._api_key

+            downloaded = 0
+            total = 0
            with httpx.stream("GET", url, headers=headers, timeout=self._timeout) as r:
                r.raise_for_status()
-                for chunk in r.iter_bytes():
-                    os_fd.write(chunk)
-            
+                total = int(r.headers.get("Content-Length", 0))
+                # Use a larger chunk size for ZeroTier/P2P efficiency
+                for chunk in r.iter_bytes(chunk_size=128 * 1024):
+                    if chunk:
+                        os_fd.write(chunk)
+                        downloaded += len(chunk)
+                        if progress_callback:
+                            try:
+                                progress_callback(downloaded, total)
+                            except Exception:
+                                pass
+
            os_fd.close()
            return Path(tmp_path)
-            
+
        except Exception as exc:
            debug(f"ZeroTier download_to_temp failed for {file_hash}: {exc}")
            return None
@@ -504,9 +504,15 @@ class Add_File(Cmdlet):
        # When add-file -store is the last stage, always show a final search-file table.
        # This is especially important for multi-item ingests (e.g., multi-clip downloads)
        # so the user always gets a selectable ResultTable.
+        live_progress = None
+        try:
+            live_progress = ctx.get_live_progress()
+        except Exception:
+            live_progress = None
+
        want_final_search_file = (
            bool(is_last_stage) and bool(is_storage_backend_location)
-            and bool(location)
+            and bool(location) and bool(live_progress)
        )
        auto_search_file_after_add = False

@@ -994,15 +1000,27 @@ class Add_File(Cmdlet):
                    suffix = metadata.get("ext")
            
            tmp_dir = Path(tempfile.mkdtemp(prefix="add-file-src-"))
-            
-            # Pass suffix to downloader if it supports it
+
+            # Introspect downloader to pass supported args (suffix, progress_callback)
            import inspect
+
            sig = inspect.signature(downloader)
+            kwargs = {"temp_root": tmp_dir}
            if "suffix" in sig.parameters:
-                downloaded = downloader(str(file_hash), temp_root=tmp_dir, suffix=suffix)
-            else:
-                downloaded = downloader(str(file_hash), temp_root=tmp_dir)
-                
+                kwargs["suffix"] = suffix
+
+            # Hook into global PipelineProgress if available
+            pp = PipelineProgress.get()
+            if pp and "progress_callback" in sig.parameters:
+
+                def _cb(done, total):
+                    # Show fetch progress instead of just 'resolving'
+                    pp.update(downloaded=done, total=total, label="peer transfer")
+
+                kwargs["progress_callback"] = _cb
+
+            downloaded = downloader(str(file_hash), **kwargs)
+
            if isinstance(downloaded, Path) and downloaded.exists():
                pipe_obj.is_temp = True
                return downloaded, tmp_dir
@@ -229,9 +229,11 @@ class Download_File(Cmdlet):
                                
                    except Exception as e:
                        log(f"Provider {provider_name} error handling {url}: {e}", file=sys.stderr)
-                        # Fallthrough to direct download? 
-                        # If a provider explicitly claimed it but failed, we'll try direct download as a last resort.
                        pass
+                    
+                    if not handled:
+                        debug(f"Provider {provider_name} matched URL but failed to download. Skipping direct fallback to avoid landing pages.")
+                        continue

                # Direct Download Fallback
                result_obj = _download_direct_file(