diff --git a/API/folder.py b/API/folder.py
index 9dc32f8..c46319f 100644
--- a/API/folder.py
+++ b/API/folder.py
@@ -202,6 +202,8 @@ class API_folder_store:
 
     DB_NAME = "medios-macina.db"
     SCHEMA_VERSION = 4
+    # Global lock across all instances to prevent 'database is locked' during concurrent operations.
+    _shared_db_lock = RLock()
 
     def __init__(self, library_root: Path):
         """Initialize the database at the library root.
@@ -212,10 +214,8 @@ class API_folder_store:
         self.library_root = expand_path(library_root).resolve()
         self.db_path = self.library_root / self.DB_NAME
         self.connection: Optional[sqlite3.Connection] = None
-        # sqlite3 connections are not safe for concurrent use across threads.
-        # We intentionally keep a single connection per API_folder_store instance,
-        # so we must serialize all DB operations on that connection.
-        self._db_lock = RLock()
+        # Use the shared lock
+        self._db_lock = self._shared_db_lock
         self._init_db()
 
     def _normalize_input_path(self, file_path: Path) -> Path:
diff --git a/Provider/libgen.py b/Provider/libgen.py
index 40b1abe..0e2e046 100644
--- a/Provider/libgen.py
+++ b/Provider/libgen.py
@@ -1037,7 +1037,7 @@ class Libgen(Provider):
             if md is not None:
                 md["metadata_enriched_from"] = enriched_source
 
-            if extracted_title and ((not title)
+            if extracted_title and ((not title) or title.startswith("http")):
                 title = extracted_title
 
         except Exception as e:
@@ -1069,8 +1069,6 @@ class Libgen(Provider):
                     return self.download(sr, output_dir)
                 except Exception:
                     return None
-        except Exception:
-            return None
 
 
 LogFn = Optional[Callable[[str], None]]
@@ -1656,9 +1654,10 @@ def _resolve_download_url(
     session: requests.Session,
     url: str,
     log_info: LogFn = None,
-) -> Optional[str]:
+) -> Tuple[Optional[str], Optional[str]]:
    """Resolve the final download URL by following the LibGen chain."""
     current_url = url
+    referer = None
     visited = set()
 
     def _resolve_html_links_regex(base_url: str, html: str) -> Optional[str]:
@@ -1670,57 +1669,44 @@ def _resolve_download_url(
         if not html:
             return None
 
-        # LibGen chain helpers (for environments without lxml).
-        # Typical chain:
-        #   edition.php?id=... -> file.php?id=...
-        #   file.php?id=... -> ads.php?md5=... (or get.php?md5=...)
-        #   ads.php?md5=... -> get.php?md5=...
-        #   get.php?md5=... -> file response
-        # Use a more relaxed regex for href that handles spaces and missing quotes.
-        # Format: href [space] = [space] [quote] link [quote]
         def _find_link(pattern: str) -> Optional[str]:
-            # This regex allows:
-            #   href="link"
-            #   href='link'
-            #   href=link
-            #   href = "link"
             regex = r"href\s*=\s*['\"]?(" + pattern + r")['\"]?"
             match = re.search(regex, html, flags=re.IGNORECASE)
             if match:
                 u = str(match.group(1) or "").strip()
-                # Strip trailing quotes if the regex over-captured (e.g. unquoted link followed by space/quote)
                 u = u.split("'")[0].split('"')[0].split(">")[0].split(" ")[0].strip()
                 if u and not u.lower().startswith("javascript:"):
                     return urljoin(base_url, u)
             return None
 
-        # Handle edition -> file links.
-        found = _find_link(r'[^"\' >]*file\.php\?id=\d+[^"\' >]*')
+        # Priority patterns for LibGen mirrors (e.g., library.lol, libgen.li)
+        # 1. library.lol "GET" link or direct /main/
+        found = _find_link(r'[^"\' >]*/main/\d+/[^"\' >]*')
         if found:
             return found
 
-        # Handle series -> edition links.
-        found = _find_link(r'[^"\' >]*edition\.php\?id=\d+[^"\' >]*')
-        if found:
-            return found
-
-        # Handle file -> ads/get links (sometimes present as the "Libgen" mirror).
-        found = _find_link(r'[^"\' >]*ads\.php\?md5=[a-fA-F0-9]{32}[^"\' >]*')
-        if found:
-            return found
-
-        # Prefer explicit get.php md5 links (most common successful chain).
+        # 2. get.php md5 links
         found = _find_link(r'[^"\' >]*get\.php\?md5=[a-fA-F0-9]{32}[^"\' >]*')
         if found:
             return found
 
-        # Next: library.lol main links.
-        found = _find_link(r'[^"\' >]*library\.lol[^"\' >]*')
+        # 3. ads.php md5 links
+        found = _find_link(r'[^"\' >]*ads\.php\?md5=[a-fA-F0-9]{32}[^"\' >]*')
         if found:
             return found
 
-        # Finally: any direct file extension link.
+        # 4. file.php id links
+        found = _find_link(r'[^"\' >]*file\.php\?id=\d+[^"\' >]*')
+        if found:
+            return found
+
+        # 5. edition.php id links
+        found = _find_link(r'[^"\' >]*edition\.php\?id=\d+[^"\' >]*')
+        if found:
+            return found
+
+        # 6. Direct file extensions
         found = _find_link(r'[^"\' >]+\.(?:pdf|epub|mobi|djvu|azw3|cbz|cbr)(?:\?[^"\' >]*)?')
         if found:
             return found
 
@@ -1744,30 +1730,33 @@ def _resolve_download_url(
 
         _call(log_info, f"[resolve] Loop {idx+1} Checking: {current_url}")
 
-        if current_url.lower().endswith((".pdf",
-                                         ".epub",
-                                         ".mobi",
-                                         ".djvu",
-                                         ".azw3",
-                                         ".cbz",
-                                         ".cbr")):
+        if current_url.lower().split("?")[0].split("#")[0].endswith(
+            (".pdf", ".epub", ".mobi", ".djvu", ".azw3", ".cbz", ".cbr")
+        ):
             _call(log_info, f"[resolve] URL looks like direct file: {current_url}")
-            return current_url
+            return current_url, referer
 
         try:
-            with session.get(current_url, stream=True, timeout=30) as resp:
+            # Pass Referer to stay in the mirror's good graces
+            headers = {}
+            if referer:
+                headers["Referer"] = referer
+
+            with session.get(current_url, stream=True, timeout=30, headers=headers) as resp:
                 resp.raise_for_status()
                 ct = str(resp.headers.get("Content-Type", "")).lower()
                 if "text/html" not in ct:
                     _call(log_info, f"[resolve] URL returned non-HTML ({ct}): {current_url}")
-                    return current_url
+                    return current_url, referer
+                # Only read if it's small enough to be a landing page
                 content = resp.text
         except Exception as e:
             _call(log_info, f"[resolve] Failed to fetch {current_url}: {e}")
-            return None
+            return None, None
 
+        next_url = None
         doc = None
         if lxml_html is not None:
             try:
@@ -1775,58 +1764,58 @@ def _resolve_download_url(
             except Exception:
                 doc = None
 
-        if doc is None:
+        if doc is not None:
+            # Try to find common mirror links via XPath
+            get_href = _find_href_by_text(doc, r"^GET$")
+            if get_href:
+                next_url = urljoin(current_url, get_href)
+
+            if not next_url:
+                # Mirror-specific patterns
+                if "series.php" in current_url:
+                    hrefs = doc.xpath("//a[contains(@href,'edition.php')]/@href")
+                    if hrefs:
+                        next_url = urljoin(current_url, str(hrefs[0] or ""))
+                elif "edition.php" in current_url:
+                    hrefs = doc.xpath("//a[contains(@href,'file.php')]/@href")
+                    if hrefs:
+                        next_url = urljoin(current_url, str(hrefs[0] or ""))
+                elif "file.php" in current_url:
+                    libgen_href = None
+                    for a in doc.xpath("//a[@href]"):
+                        if str(a.get("title") or "").strip().lower() == "libgen":
+                            libgen_href = str(a.get("href") or "").strip()
+                            break
+                    if not libgen_href:
+                        libgen_href = _find_href_by_text(doc, r"Libgen")
+                    if libgen_href:
+                        next_url = urljoin(current_url, libgen_href)
+                elif "ads.php" in current_url:
+                    hrefs = doc.xpath("//a[contains(@href,'get.php')]/@href")
+                    if hrefs:
+                        next_url = urljoin(current_url, str(hrefs[0] or ""))
+
+            if not next_url:
+                # General provider links
+                for text in ["Cloudflare", "IPFS.io", "Infura"]:
+                    href = _find_href_by_text(doc, re.escape(text))
+                    if href:
+                        next_url = urljoin(current_url, href)
+                        break
+
+        # Fallback to regex if XPath failed or lxml is missing
+        if not next_url:
             next_url = _resolve_html_links_regex(current_url, content)
-            if next_url:
-                current_url = next_url
-                continue
-            _call(
-                log_info,
-                "[resolve] lxml not available and regex resolver found no links"
-            )
-            return None
 
-        get_href = _find_href_by_text(doc, r"^GET$")
-        if get_href:
-            return urljoin(current_url, get_href)
-
-        if "series.php" in current_url:
-            hrefs = doc.xpath("//a[contains(@href,'edition.php')]/@href")
-            if hrefs:
-                current_url = urljoin(current_url, str(hrefs[0] or ""))
-                continue
-
-        if "edition.php" in current_url:
-            hrefs = doc.xpath("//a[contains(@href,'file.php')]/@href")
-            if hrefs:
-                current_url = urljoin(current_url, str(hrefs[0] or ""))
-                continue
-
-        if "file.php" in current_url:
-            libgen_href = None
-            for a in doc.xpath("//a[@href]"):
-                if str(a.get("title") or "").strip().lower() == "libgen":
-                    libgen_href = str(a.get("href") or "").strip()
-                    break
-            if not libgen_href:
-                libgen_href = _find_href_by_text(doc, r"Libgen")
-            if libgen_href:
-                current_url = urljoin(current_url, libgen_href)
-                continue
-
-        if "ads.php" in current_url:
-            hrefs = doc.xpath("//a[contains(@href,'get.php')]/@href")
-            if hrefs:
-                return urljoin(current_url, str(hrefs[0] or ""))
-
-        for text in ["Cloudflare", "IPFS.io", "Infura"]:
-            href = _find_href_by_text(doc, re.escape(text))
-            if href:
-                return urljoin(current_url, href)
+        if next_url:
+            referer = current_url
+            current_url = next_url
+            continue
 
+        _call(log_info, "[resolve] No further links found in content")
         break
 
-    return None
+    return None, None
 
 
 def _guess_filename_extension(download_url: str,
@@ -1931,7 +1920,7 @@ def download_from_mirror(
     try:
         _call(log_info, f"[download] Resolving download link from: {mirror_url}")
 
-        download_url = _resolve_download_url(session, mirror_url, log_info)
+        download_url, referer = _resolve_download_url(session, mirror_url, log_info)
 
         if not download_url:
             _call(log_error, "[download] Could not find direct download link")
@@ -1944,7 +1933,11 @@ def download_from_mirror(
 
         headers: Dict[str, str] = {}
 
-        with session.get(download_url, stream=True, timeout=60) as r:
+        req_headers = {}
+        if referer:
+            req_headers["Referer"] = referer
+
+        with session.get(download_url, stream=True, timeout=60, headers=req_headers) as r:
             r.raise_for_status()
             headers = dict(r.headers)
diff --git a/Store/ZeroTier.py b/Store/ZeroTier.py
index b82dac1..054d862 100644
--- a/Store/ZeroTier.py
+++ b/Store/ZeroTier.py
@@ -288,14 +288,20 @@ class ZeroTier(Store):
             url += f"{sep}api_key={self._api_key}"
         return url
 
-    def download_to_temp(self, file_hash: str, temp_root: Optional[Path] = None, suffix: Optional[str] = None) -> Optional[Path]:
+    def download_to_temp(
+        self,
+        file_hash: str,
+        temp_root: Optional[Path] = None,
+        suffix: Optional[str] = None,
+        progress_callback: Optional[Callable[[int, int], None]] = None,
+    ) -> Optional[Path]:
         """Download a file from the remote peer to a local temporary file."""
         import os
         import httpx
         import tempfile
 
         if self._service == "hydrus":
-                return None
+            return None
 
         url = self.get_file(file_hash)
         if not url or not isinstance(url, str) or not url.startswith("http"):
@@ -314,21 +320,32 @@ class ZeroTier(Store):
                 fd, tmp_path = tempfile.mkstemp(dir=str(temp_root), suffix=suffix)
             else:
                 fd, tmp_path = tempfile.mkstemp(suffix=suffix)
-            
-            os_fd = os.fdopen(fd, 'wb')
-            
+
+            os_fd = os.fdopen(fd, "wb")
+
             headers = {}
             if self._api_key:
                 headers["X-API-Key"] = self._api_key
 
+            downloaded = 0
+            total = 0
             with httpx.stream("GET", url, headers=headers, timeout=self._timeout) as r:
                 r.raise_for_status()
-                for chunk in r.iter_bytes():
-                    os_fd.write(chunk)
-            
+                total = int(r.headers.get("Content-Length", 0))
+                # Use a larger chunk size for ZeroTier/P2P efficiency
+                for chunk in r.iter_bytes(chunk_size=128 * 1024):
+                    if chunk:
+                        os_fd.write(chunk)
+                        downloaded += len(chunk)
+                        if progress_callback:
+                            try:
+                                progress_callback(downloaded, total)
+                            except Exception:
+                                pass
+
             os_fd.close()
             return Path(tmp_path)
-            
+
         except Exception as exc:
             debug(f"ZeroTier download_to_temp failed for {file_hash}: {exc}")
             return None
diff --git a/cmdlet/add_file.py b/cmdlet/add_file.py
index 5747493..4ee7b1f 100644
--- a/cmdlet/add_file.py
+++ b/cmdlet/add_file.py
@@ -504,9 +504,15 @@ class Add_File(Cmdlet):
         # When add-file -store is the last stage, always show a final search-file table.
         # This is especially important for multi-item ingests (e.g., multi-clip downloads)
         # so the user always gets a selectable ResultTable.
+        live_progress = None
+        try:
+            live_progress = ctx.get_live_progress()
+        except Exception:
+            live_progress = None
+
         want_final_search_file = (
             bool(is_last_stage) and bool(is_storage_backend_location)
-            and bool(location)
+            and bool(location) and bool(live_progress)
         )
 
         auto_search_file_after_add = False
@@ -994,15 +1000,27 @@ class Add_File(Cmdlet):
             suffix = metadata.get("ext")
 
             tmp_dir = Path(tempfile.mkdtemp(prefix="add-file-src-"))
-            
-            # Pass suffix to downloader if it supports it
+
+            # Introspect downloader to pass supported args (suffix, progress_callback)
             import inspect
+
             sig = inspect.signature(downloader)
+            kwargs = {"temp_root": tmp_dir}
             if "suffix" in sig.parameters:
-                downloaded = downloader(str(file_hash), temp_root=tmp_dir, suffix=suffix)
-            else:
-                downloaded = downloader(str(file_hash), temp_root=tmp_dir)
-            
+                kwargs["suffix"] = suffix
+
+            # Hook into global PipelineProgress if available
+            pp = PipelineProgress.get()
+            if pp and "progress_callback" in sig.parameters:
+
+                def _cb(done, total):
+                    # Show fetch progress instead of just 'resolving'
+                    pp.update(downloaded=done, total=total, label="peer transfer")
+
+                kwargs["progress_callback"] = _cb
+
+            downloaded = downloader(str(file_hash), **kwargs)
+
             if isinstance(downloaded, Path) and downloaded.exists():
                 pipe_obj.is_temp = True
                 return downloaded, tmp_dir
diff --git a/cmdlet/download_file.py b/cmdlet/download_file.py
index 1a1a3b2..7d994fa 100644
--- a/cmdlet/download_file.py
+++ b/cmdlet/download_file.py
@@ -229,9 +229,11 @@ class Download_File(Cmdlet):
                 except Exception as e:
                     log(f"Provider {provider_name} error handling {url}: {e}", file=sys.stderr)
-                    # Fallthrough to direct download?
-                    # If a provider explicitly claimed it but failed, we'll try direct download as a last resort.
                     pass
+
+            if not handled:
+                debug(f"Provider {provider_name} matched URL but failed to download. Skipping direct fallback to avoid landing pages.")
+                continue
 
         # Direct Download Fallback
         result_obj = _download_direct_file(