diff --git a/API/HTTP.py b/API/HTTP.py
index 650db19..1c8e350 100644
--- a/API/HTTP.py
+++ b/API/HTTP.py
@@ -244,6 +244,8 @@ class HTTPClient:
         self,
         method: str,
         url: str,
+        raise_for_status: bool = True,
+        log_http_errors: bool = True,
         **kwargs
     ) -> httpx.Response:
         """
@@ -273,7 +275,8 @@ class HTTPClient:
         for attempt in range(self.retries):
             try:
                 response = self._client.request(method, url, **kwargs)
-                response.raise_for_status()
+                if raise_for_status:
+                    response.raise_for_status()
                 return response
             except httpx.TimeoutException as e:
                 last_exception = e
@@ -287,7 +290,8 @@ class HTTPClient:
                     response_text = e.response.text[:500]
                 except:
                     response_text = ""
-                logger.error(f"HTTP {e.response.status_code} from {url}: {response_text}")
+                if log_http_errors:
+                    logger.error(f"HTTP {e.response.status_code} from {url}: {response_text}")
                 raise
             last_exception = e
             try:
diff --git a/API/HydrusNetwork.py b/API/HydrusNetwork.py
index 494d951..6668e2e 100644
--- a/API/HydrusNetwork.py
+++ b/API/HydrusNetwork.py
@@ -71,6 +71,7 @@ class HydrusNetwork:
     url: str
     access_key: str = ""
     timeout: float = 60.0
+    instance_name: str = ""  # Optional store name (e.g., 'home') for namespaced logs
     scheme: str = field(init=False)
     hostname: str = field(init=False)
@@ -90,6 +91,12 @@ class HydrusNetwork:
         self.port = parsed.port or (443 if self.scheme == "https" else 80)
         self.base_path = parsed.path.rstrip("/")
         self.access_key = self.access_key or ""
+        self.instance_name = str(self.instance_name or "").strip()
+
+    def _log_prefix(self) -> str:
+        if self.instance_name:
+            return f"[hydrusnetwork:{self.instance_name}]"
+        return f"[hydrusnetwork:{self.hostname}:{self.port}]"

     # ------------------------------------------------------------------
     # low-level helpers
@@ -120,7 +127,7 @@ class HydrusNetwork:
         url = f"{self.scheme}://{self.hostname}:{self.port}{path}"

         # Log request details
-        logger.debug(f"[Hydrus] {spec.method} {spec.endpoint} (auth: {'session_key' if self._session_key else 'access_key' if self.access_key else 'none'})")
+        logger.debug(f"{self._log_prefix()} {spec.method} {spec.endpoint} (auth: {'session_key' if self._session_key else 'access_key' if self.access_key else 'none'})")

         status = 0
         reason = ""
@@ -135,14 +142,14 @@ class HydrusNetwork:
                 file_path = Path(spec.file_path)
                 if not file_path.is_file():
                     error_msg = f"Upload file not found: {file_path}"
-                    logger.error(f"[Hydrus] {error_msg}")
+                    logger.error(f"{self._log_prefix()} {error_msg}")
                     raise FileNotFoundError(error_msg)

                 file_size = file_path.stat().st_size
                 headers["Content-Type"] = spec.content_type or "application/octet-stream"
                 headers["Content-Length"] = str(file_size)
-                logger.debug(f"[Hydrus] Uploading file {file_path.name} ({file_size} bytes)")
+                logger.debug(f"{self._log_prefix()} Uploading file {file_path.name} ({file_size} bytes)")

                 def file_gen():
                     with file_path.open("rb") as handle:
@@ -153,7 +160,9 @@ class HydrusNetwork:
                     spec.method,
                     url,
                     content=file_gen(),
-                    headers=headers
+                    headers=headers,
+                    raise_for_status=False,
+                    log_http_errors=False,
                 )
             else:
                 content = None
@@ -163,14 +172,16 @@ class HydrusNetwork:
                         content = spec.data
                     else:
                         json_data = spec.data
-                logger.debug(f"[Hydrus] Request body size: {len(content) if content else 'json'}")
+                logger.debug(f"{self._log_prefix()} Request body size: {len(content) if content else 'json'}")

                 response = client.request(
                     spec.method,
                     url,
                     content=content,
                     json=json_data,
-                    headers=headers
+                    headers=headers,
+                    raise_for_status=False,
+                    log_http_errors=False,
                 )

             status = response.status_code
@@ -178,20 +189,14 @@
class HydrusNetwork: body = response.content content_type = response.headers.get("Content-Type", "") or "" - logger.debug(f"[Hydrus] Response {status} {reason} ({len(body)} bytes)") + logger.debug(f"{self._log_prefix()} Response {status} {reason} ({len(body)} bytes)") except (httpx.ConnectError, httpx.TimeoutException, httpx.NetworkError) as exc: msg = f"Hydrus unavailable: {exc}" - logger.warning(f"[Hydrus] {msg}") + logger.warning(f"{self._log_prefix()} {msg}") raise HydrusConnectionError(msg) from exc - except httpx.HTTPStatusError as exc: - response = exc.response - status = response.status_code - reason = response.reason_phrase - body = response.content - content_type = response.headers.get("Content-Type", "") or "" except Exception as exc: - logger.error(f"[Hydrus] Connection error: {exc}", exc_info=True) + logger.error(f"{self._log_prefix()} Connection error: {exc}", exc_info=True) raise payload: Any @@ -219,19 +224,23 @@ class HydrusNetwork: message = payload else: message = reason or "HTTP error" - - logger.error(f"[Hydrus] HTTP {status}: {message}") + + # Some endpoints are naturally "missing" sometimes and should not spam logs. + if status == 404 and spec.endpoint.rstrip("/") == "/get_files/file_path": + return {} + + logger.error(f"{self._log_prefix()} HTTP {status}: {message}") # Handle expired session key (419) by clearing cache and retrying once if status == 419 and self._session_key and "session" in message.lower(): - logger.warning(f"[Hydrus] Session key expired, acquiring new one and retrying...") + logger.warning(f"{self._log_prefix()} Session key expired, acquiring new one and retrying...") self._session_key = "" # Clear expired session key try: self._acquire_session_key() # Retry the request with new session key return self._perform_request(spec) except Exception as retry_error: - logger.error(f"[Hydrus] Retry failed: {retry_error}", exc_info=True) + logger.error(f"{self._log_prefix()} Retry failed: {retry_error}", exc_info=True) # If retry fails, raise the original error raise HydrusRequestError(status, message, payload) from retry_error @@ -316,6 +325,16 @@ class HydrusNetwork: def add_file(self, file_path: Path) -> dict[str, Any]: return self._post("/add_files/add_file", file_path=file_path) + def undelete_files(self, hashes: Union[str, Iterable[str]]) -> dict[str, Any]: + """Restore files from Hydrus trash back into 'my files'. 
+ + Hydrus Client API: POST /add_files/undelete_files + Required JSON args: {"hashes": [, ...]} + """ + hash_list = self._ensure_hashes(hashes) + body = {"hashes": hash_list} + return self._post("/add_files/undelete_files", data=body) + def add_tag(self, hash: Union[str, Iterable[str]], tags: Iterable[str], service_name: str) -> dict[str, Any]: hash = self._ensure_hashes(hash) body = {"hashes": hash, "service_names_to_tags": {service_name: list(tags)}} diff --git a/CLI.py b/CLI.py index 7672226..df3753a 100644 --- a/CLI.py +++ b/CLI.py @@ -68,7 +68,7 @@ from typing import Callable from config import get_local_storage_path, load_config -from cmdlet.catalog import ( +from cmdlet_catalog import ( import_cmd_module as _catalog_import_cmd_module, list_cmdlet_metadata as _catalog_list_cmdlet_metadata, list_cmdlet_names as _catalog_list_cmdlet_names, @@ -305,8 +305,6 @@ def _get_table_title_for_command( 'add_file': 'Results', 'delete-file': 'Results', 'delete_file': 'Results', - 'check-file-status': 'Status', - 'check_file_status': 'Status', 'get-metadata': None, 'get_metadata': None, } @@ -843,10 +841,6 @@ def _create_cmdlet_cli(): # Load config config = _load_cli_config() - # Initialize cookies check for yt-dlp - from hydrus_health_check import initialize_cookies_check - initialize_cookies_check(config, emit_debug=False) - # Initialize debug logging if enabled if config: from SYS.logger import set_debug @@ -991,8 +985,6 @@ def _create_cmdlet_cli(): # Run startup checks and render table try: - from hydrus_health_check import initialize_cookies_check - # MPV availability is validated by MPV.MPV.__init__. try: from MPV.mpv_ipc import MPV @@ -1294,8 +1286,13 @@ def _create_cmdlet_cli(): # Cookies are used by yt-dlp; keep this centralized utility. try: - ok, detail = initialize_cookies_check(config, emit_debug=False) - _add_startup_check("FOUND" if ok else "MISSING", "Cookies", "N/A", detail or "Not found") + from tool.ytdlp import YtDlpTool + + cookiefile = YtDlpTool(config).resolve_cookiefile() + if cookiefile is not None: + _add_startup_check("FOUND", "Cookies", "N/A", str(cookiefile)) + else: + _add_startup_check("MISSING", "Cookies", "N/A", "Not found") except Exception as exc: _add_startup_check("ERROR", "Cookies", "N/A", str(exc)) @@ -1580,10 +1577,11 @@ def _execute_pipeline(tokens: list): hash_val = getattr(item, 'hash', getattr(item, 'hash_hex', 'N/A')) title_val = getattr(item, 'title', 'N/A') if hash_val != 'N/A': - hash_display = hash_val[:8] + '...' if len(str(hash_val)) > 8 else hash_val - print(f" -> hash={hash_display}, title={title_val}") + hash_display = str(hash_val) + title_display = str(title_val) + print(f" -> hash:{hash_display}, title:{title_display}") else: - print(f" -> title={title_val}") + print(f" -> title:{title_val}") else: print(" -> [source_index out of range]") if resolved_list is not None: @@ -2143,14 +2141,14 @@ def _execute_pipeline(tokens: list): display_only_commands = { 'get-note', 'get_note', 'get-relationship', 'get_relationship', 'get-file', 'get_file', - 'check-file-status', 'check_file_status' } # Commands that manage their own table/history state (e.g. 
get-tag) self_managing_commands = { 'get-tag', 'get_tag', 'tags', 'get-url', 'get_url', 'search-file', 'search_file', - 'search-provider', 'search_provider' + 'search-provider', 'search_provider', + 'search-store', 'search_store' } overlay_table = ctx.get_display_table() if hasattr(ctx, 'get_display_table') else None @@ -2382,7 +2380,7 @@ def _execute_cmdlet(cmd_name: str, args: list): # Ensure native commands (cmdnat) are loaded try: - from cmdlet.catalog import ensure_registry_loaded as _ensure_registry_loaded + from cmdlet_catalog import ensure_registry_loaded as _ensure_registry_loaded _ensure_registry_loaded() except Exception: pass @@ -2391,7 +2389,7 @@ def _execute_cmdlet(cmd_name: str, args: list): cmd_fn = REGISTRY.get(cmd_name) if not cmd_fn: # Attempt lazy import of the module and retry - from cmdlet.catalog import import_cmd_module as _catalog_import + from cmdlet_catalog import import_cmd_module as _catalog_import try: mod = _catalog_import(cmd_name) data = getattr(mod, "CMDLET", None) if mod else None @@ -2537,13 +2535,13 @@ def _execute_cmdlet(cmd_name: str, args: list): display_only_commands = { 'get-url', 'get_url', 'get-note', 'get_note', 'get-relationship', 'get_relationship', 'get-file', 'get_file', - 'check-file-status', 'check_file_status' } # Commands that manage their own table/history state (e.g. get-tag) self_managing_commands = { 'get-tag', 'get_tag', 'tags', 'search-file', 'search_file', - 'search-provider', 'search_provider' + 'search-provider', 'search_provider', + 'search-store', 'search_store' } if cmd_name in self_managing_commands: @@ -2596,7 +2594,6 @@ def _execute_cmdlet(cmd_name: str, args: list): display_only_commands = { 'get-url', 'get_url', 'get-note', 'get_note', 'get-relationship', 'get_relationship', 'get-file', 'get_file', - 'check-file-status', 'check_file_status' } self_managing_commands = { 'get-tag', 'get_tag', 'tags', diff --git a/Provider/libgen.py b/Provider/libgen.py index e83381d..240055c 100644 --- a/Provider/libgen.py +++ b/Provider/libgen.py @@ -15,11 +15,11 @@ from SYS.logger import log from models import ProgressBar -# Optional dependencies +# Optional dependency for HTML scraping fallbacks try: - from bs4 import BeautifulSoup + from lxml import html as lxml_html except ImportError: - BeautifulSoup = None + lxml_html = None class Libgen(SearchProvider): @@ -116,7 +116,7 @@ class Libgen(SearchProvider): return [] def validate(self) -> bool: - # JSON-based searching can work without BeautifulSoup; HTML parsing is a fallback. + # JSON-based searching can work without lxml; HTML parsing is a fallback. return True def download(self, result: SearchResult, output_dir: Path) -> Optional[Path]: @@ -342,8 +342,8 @@ class LibgenSearch: Uses a total time budget across mirrors to avoid long hangs. """ - # Prefer JSON API (no BeautifulSoup needed); HTML scraping is a fallback. - has_bs4 = BeautifulSoup is not None + # Prefer JSON API (no lxml needed); HTML scraping is a fallback. 
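Note: the mirror loop below shares a single wall-clock budget across all mirrors rather than giving each mirror its own full timeout. A minimal sketch of that pattern, under the assumption of a caller-supplied search callable (search_one, mirrors, and the 20-second budget are illustrative placeholders, not names from this codebase):

import time

def search_with_budget(mirrors, search_one, query, budget_seconds=20.0):
    # One deadline shared by every mirror; later mirrors only get the time that is left.
    started = time.monotonic()
    for mirror in mirrors:
        remaining = budget_seconds - (time.monotonic() - started)
        if remaining <= 0:
            break  # Budget exhausted; stop instead of hanging on slow mirrors.
        try:
            results = search_one(mirror, query, timeout=min(remaining, 10.0))
        except Exception:
            continue  # A failing mirror should not consume the whole budget.
        if results:
            return results
    return []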
+ has_lxml = lxml_html is not None started = time.monotonic() @@ -372,7 +372,7 @@ class LibgenSearch: results = [] if not results: - if not has_bs4: + if not has_lxml: continue if "libgen.li" in mirror or "libgen.gl" in mirror: @@ -417,57 +417,73 @@ class LibgenSearch: resp = self.session.get(url, params=params, timeout=timeout) resp.raise_for_status() - if BeautifulSoup is None: + if lxml_html is None: return [] - soup = BeautifulSoup(resp.text, "html.parser") - table = soup.find("table", {"class": "c"}) - if not table: - tables = soup.find_all("table") - for t in tables: - if len(t.find_all("tr")) > 5: + def _text(el: Any) -> str: + return " ".join([t.strip() for t in el.itertext() if t and str(t).strip()]).strip() + + try: + doc = lxml_html.fromstring(resp.content) + except Exception: + return [] + + table_nodes = doc.xpath( + "//table[contains(concat(' ', normalize-space(@class), ' '), ' c ')]" + ) + table = table_nodes[0] if table_nodes else None + if table is None: + for t in doc.xpath("//table"): + if len(t.xpath(".//tr")) > 5: table = t break - if not table: + if table is None: return [] results: List[Dict[str, Any]] = [] - rows = table.find_all("tr")[1:] + rows = table.xpath(".//tr")[1:] for row in rows: - cols = row.find_all("td") + cols = row.xpath("./td") if len(cols) < 9: continue try: - libgen_id = cols[0].get_text(strip=True) - authors = [a.get_text(strip=True) for a in cols[1].find_all("a")] - if not authors: - authors = [cols[1].get_text(strip=True)] + libgen_id = _text(cols[0]) - title_tag = cols[2].find("a") - title = title_tag.get_text(strip=True) if title_tag else cols[2].get_text(strip=True) + author_links = cols[1].xpath(".//a") + authors = [_text(a) for a in author_links if _text(a)] + if not authors: + authors = [_text(cols[1])] + + title_tag = None + title_links = cols[2].xpath(".//a") + if title_links: + title_tag = title_links[0] + title = _text(title_tag) if title_tag is not None else _text(cols[2]) md5 = "" - if title_tag and title_tag.has_attr("href"): + if title_tag is not None: href = str(title_tag.get("href") or "") match = re.search(r"md5=([a-fA-F0-9]{32})", href) if match: md5 = match.group(1) - publisher = cols[3].get_text(strip=True) - year = cols[4].get_text(strip=True) - pages = cols[5].get_text(strip=True) - language = cols[6].get_text(strip=True) - size = cols[7].get_text(strip=True) - extension = cols[8].get_text(strip=True) + publisher = _text(cols[3]) + year = _text(cols[4]) + pages = _text(cols[5]) + language = _text(cols[6]) + size = _text(cols[7]) + extension = _text(cols[8]) - mirror_links = [] + mirror_links: List[str] = [] for i in range(9, len(cols)): - a = cols[i].find("a") - if a and a.has_attr("href"): - mirror_links.append(a["href"]) + a_nodes = cols[i].xpath(".//a[@href]") + if a_nodes: + href = str(a_nodes[0].get("href") or "").strip() + if href: + mirror_links.append(href) if md5: download_link = f"http://library.lol/main/{md5}" @@ -476,24 +492,25 @@ class LibgenSearch: else: download_link = "" - results.append({ - "id": libgen_id, - "title": title, - "author": ", ".join(authors), - "publisher": publisher, - "year": year, - "pages": pages, - "language": language, - "filesize_str": size, - "extension": extension, - "md5": md5, - "mirror_url": download_link, - "cover": "", - }) + results.append( + { + "id": libgen_id, + "title": title, + "author": ", ".join([a for a in authors if a]) or "Unknown", + "publisher": publisher, + "year": year, + "pages": pages, + "language": language, + "filesize_str": size, + "extension": extension, + 
"md5": md5, + "mirror_url": download_link, + "cover": "", + } + ) if len(results) >= limit: break - except Exception as e: logging.debug(f"Error parsing row: {e}") continue @@ -521,21 +538,35 @@ class LibgenSearch: resp = self.session.get(url, params=params, timeout=timeout) resp.raise_for_status() - if BeautifulSoup is None: + if lxml_html is None: return [] - soup = BeautifulSoup(resp.text, "html.parser") - table = soup.find("table", {"id": "tablelibgen"}) - if not table: - table = soup.find("table", {"class": "table table-striped"}) - if not table: + def _text(el: Any) -> str: + return " ".join([t.strip() for t in el.itertext() if t and str(t).strip()]).strip() + + try: + doc = lxml_html.fromstring(resp.content) + except Exception: + return [] + + table_nodes = doc.xpath("//table[@id='tablelibgen']") + table = table_nodes[0] if table_nodes else None + if table is None: + # Common libgen.li/gl fallback + table_nodes = doc.xpath( + "//table[contains(concat(' ', normalize-space(@class), ' '), ' table ') and " + "contains(concat(' ', normalize-space(@class), ' '), ' table-striped ')]" + ) + table = table_nodes[0] if table_nodes else None + + if table is None: return [] results: List[Dict[str, Any]] = [] - rows = table.find_all("tr")[1:] + rows = table.xpath(".//tr")[1:] for row in rows: - cols = row.find_all("td") + cols = row.xpath("./td") if len(cols) < 9: continue @@ -543,26 +574,30 @@ class LibgenSearch: # Extract md5 (libgen.gl exposes /ads.php?md5=... in mirror column) md5 = "" mirror_url = "" - for a in row.find_all("a"): - href = a.get("href") + for a in row.xpath(".//a[@href]"): + href = str(a.get("href") or "") if not href: continue - m = re.search(r"md5=([a-fA-F0-9]{32})", str(href)) + m = re.search(r"md5=([a-fA-F0-9]{32})", href) if m: md5 = m.group(1) - if "ads.php" in str(href): - mirror_url = urljoin(mirror, str(href)) + if "ads.php" in href: + mirror_url = urljoin(mirror, href) break if not mirror_url and md5: mirror_url = urljoin(mirror, f"/ads.php?md5={md5}") # Extract numeric file id from /file.php?id=... libgen_id = "" - file_link = row.find("a", href=re.compile(r"/file\.php\?id=\d+")) - if file_link and file_link.get("href"): - m = re.search(r"id=(\d+)", str(file_link.get("href"))) - if m: - libgen_id = m.group(1) + for a in row.xpath(".//a[@href]"): + href = str(a.get("href") or "") + if not href: + continue + if re.search(r"/file\.php\?id=\d+", href): + m = re.search(r"id=(\d+)", href) + if m: + libgen_id = m.group(1) + break title = "" authors = "" @@ -585,7 +620,7 @@ class LibgenSearch: if offset is not None: meta_cell = cols[offset] - meta_text = " ".join([str(s).strip() for s in meta_cell.stripped_strings if str(s).strip()]) + meta_text = _text(meta_cell) # Extract ISBNs from meta cell (avoid using them as title) # Matches 10 or 13-digit ISBN with optional leading 978/979. @@ -601,11 +636,11 @@ class LibgenSearch: # Choose a "real" title from meta cell. # libgen.gl meta can include series/edition/isbn blobs; prefer text with letters. 
raw_candidates: List[str] = [] - for a in meta_cell.find_all("a"): - t = a.get_text(" ", strip=True) + for a in meta_cell.xpath(".//a"): + t = _text(a) if t: raw_candidates.append(t) - for s in meta_cell.stripped_strings: + for s in meta_cell.itertext(): t = str(s).strip() if t: raw_candidates.append(t) @@ -645,27 +680,27 @@ class LibgenSearch: best_score = score best_title = cand - title = best_title or meta_cell.get_text(" ", strip=True) + title = best_title or _text(meta_cell) - authors = cols[offset + 1].get_text(" ", strip=True) - publisher = cols[offset + 2].get_text(" ", strip=True) - year = cols[offset + 3].get_text(" ", strip=True) - language = cols[offset + 4].get_text(" ", strip=True) - pages = cols[offset + 5].get_text(" ", strip=True) - size = cols[offset + 6].get_text(" ", strip=True) - extension = cols[offset + 7].get_text(" ", strip=True) + authors = _text(cols[offset + 1]) + publisher = _text(cols[offset + 2]) + year = _text(cols[offset + 3]) + language = _text(cols[offset + 4]) + pages = _text(cols[offset + 5]) + size = _text(cols[offset + 6]) + extension = _text(cols[offset + 7]) else: # Older fallback structure title_col = cols[1] - title_link = title_col.find("a") - title = title_link.get_text(" ", strip=True) if title_link else title_col.get_text(" ", strip=True) - authors = cols[2].get_text(" ", strip=True) - publisher = cols[3].get_text(" ", strip=True) - year = cols[4].get_text(" ", strip=True) - language = cols[5].get_text(" ", strip=True) - pages = cols[6].get_text(" ", strip=True) - size = cols[7].get_text(" ", strip=True) - extension = cols[8].get_text(" ", strip=True) + title_links = title_col.xpath(".//a") + title = _text(title_links[0]) if title_links else _text(title_col) + authors = _text(cols[2]) + publisher = _text(cols[3]) + year = _text(cols[4]) + language = _text(cols[5]) + pages = _text(cols[6]) + size = _text(cols[7]) + extension = _text(cols[8]) title = (title or "").strip() or "Unknown" authors = (authors or "").strip() or "Unknown" @@ -729,15 +764,49 @@ def _resolve_download_url( current_url = url visited = set() - if BeautifulSoup is None: - _call(log_info, "[resolve] BeautifulSoup not available; cannot resolve HTML download chain") + def _resolve_html_links_regex(base_url: str, html: str) -> Optional[str]: + """Best-effort HTML link resolver without lxml. + + This is intentionally minimal: it primarily targets LibGen landing pages like + `/ads.php?md5=...` which contain a `get.php?md5=...` link. + """ + if not html: + return None + + # Prefer explicit get.php md5 links (most common successful chain). + m = re.search(r'href=["\']([^"\']*get\.php\?md5=[a-fA-F0-9]{32}[^"\']*)["\']', html, flags=re.IGNORECASE) + if m: + href = str(m.group(1) or "").strip() + if href and not href.lower().startswith("javascript:"): + return urljoin(base_url, href) + + # Next: library.lol main links. + m = re.search(r'href=["\']([^"\']*library\.lol[^"\']*)["\']', html, flags=re.IGNORECASE) + if m: + href = str(m.group(1) or "").strip() + if href and not href.lower().startswith("javascript:"): + return urljoin(base_url, href) + + # Finally: any direct file extension link. 
+ m = re.search( + r'href=["\']([^"\']+\.(?:pdf|epub|mobi|djvu|azw3|cbz|cbr)(?:\?[^"\']*)?)["\']', + html, + flags=re.IGNORECASE, + ) + if m: + href = str(m.group(1) or "").strip() + if href and not href.lower().startswith("javascript:"): + return urljoin(base_url, href) + return None - def _find_a_by_text(pattern: str) -> Optional[Any]: - for a in soup.find_all("a"): - t = a.get_text(" ", strip=True) + def _find_href_by_text(doc: Any, pattern: str) -> Optional[str]: + for a in doc.xpath("//a[@href]"): + t = " ".join([s.strip() for s in a.itertext() if s and str(s).strip()]).strip() if t and re.search(pattern, t, re.IGNORECASE): - return a + href = str(a.get("href") or "").strip() + if href and not href.lower().startswith("javascript:"): + return href return None for _ in range(6): @@ -763,42 +832,58 @@ def _resolve_download_url( _call(log_info, f"[resolve] Failed to fetch {current_url}: {e}") return None - soup = BeautifulSoup(content, "html.parser") + doc = None + if lxml_html is not None: + try: + doc = lxml_html.fromstring(content) + except Exception: + doc = None - get_link = _find_a_by_text(r"^GET$") - if get_link and get_link.has_attr("href"): - return urljoin(current_url, str(get_link.get("href") or "")) + if doc is None: + next_url = _resolve_html_links_regex(current_url, content) + if next_url: + current_url = next_url + continue + _call(log_info, "[resolve] lxml not available and regex resolver found no links") + return None + + get_href = _find_href_by_text(doc, r"^GET$") + if get_href: + return urljoin(current_url, get_href) if "series.php" in current_url: - edition_link = soup.find("a", href=re.compile(r"edition\.php")) - if edition_link: - current_url = urljoin(current_url, str(edition_link.get("href") or "")) + hrefs = doc.xpath("//a[contains(@href,'edition.php')]/@href") + if hrefs: + current_url = urljoin(current_url, str(hrefs[0] or "")) continue if "edition.php" in current_url: - file_link = soup.find("a", href=re.compile(r"file\.php")) - if file_link: - current_url = urljoin(current_url, str(file_link.get("href") or "")) + hrefs = doc.xpath("//a[contains(@href,'file.php')]/@href") + if hrefs: + current_url = urljoin(current_url, str(hrefs[0] or "")) continue if "file.php" in current_url: - libgen_link = soup.find("a", title="libgen") - if not libgen_link: - libgen_link = _find_a_by_text(r"Libgen") - - if libgen_link and libgen_link.has_attr("href"): - current_url = urljoin(current_url, str(libgen_link.get("href") or "")) + libgen_href = None + for a in doc.xpath("//a[@href]"): + if str(a.get("title") or "").strip().lower() == "libgen": + libgen_href = str(a.get("href") or "").strip() + break + if not libgen_href: + libgen_href = _find_href_by_text(doc, r"Libgen") + if libgen_href: + current_url = urljoin(current_url, libgen_href) continue if "ads.php" in current_url: - get_php_link = soup.find("a", href=re.compile(r"get\.php")) - if get_php_link: - return urljoin(current_url, str(get_php_link.get("href") or "")) + hrefs = doc.xpath("//a[contains(@href,'get.php')]/@href") + if hrefs: + return urljoin(current_url, str(hrefs[0] or "")) for text in ["Cloudflare", "IPFS.io", "Infura"]: - link = _find_a_by_text(re.escape(text)) - if link and link.has_attr("href"): - return urljoin(current_url, str(link.get("href") or "")) + href = _find_href_by_text(doc, re.escape(text)) + if href: + return urljoin(current_url, href) break diff --git a/Provider/openlibrary.py b/Provider/openlibrary.py index 35519ff..c7dadd1 100644 --- a/Provider/openlibrary.py +++ b/Provider/openlibrary.py 
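The libgen changes above swap BeautifulSoup for lxml and add a regex-only fallback for resolving download landing pages. A minimal sketch of how that fallback behaves on a typical ads.php page, assuming the _resolve_html_links_regex helper added in this diff is in scope (the URL, md5, and HTML snippet are made up for illustration):

from urllib.parse import urljoin

page_url = "https://libgen.gl/ads.php?md5=" + "0" * 32  # placeholder md5
page_html = '<a href="/get.php?md5=' + "0" * 32 + '&key=abc">GET</a>'

# Without lxml installed, the resolver falls back to regex extraction and
# returns the get.php link joined against the page URL.
resolved = _resolve_html_links_regex(page_url, page_html)
assert resolved == urljoin(page_url, "/get.php?md5=" + "0" * 32 + "&key=abc")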
@@ -1,6 +1,7 @@ from __future__ import annotations import base64 +import io from concurrent import futures import hashlib import json as json_module @@ -34,6 +35,53 @@ except ImportError: tqdm = None # type: ignore +def _image_paths_to_pdf_bytes(images: List[str]) -> Optional[bytes]: + if not images: + return None + try: + from PIL import Image # type: ignore + except Exception: + return None + + pil_images: List[Any] = [] + try: + for p in images: + img_path = Path(p) + if not img_path.is_file(): + continue + with Image.open(img_path) as im: # type: ignore[attr-defined] + # Ensure PDF-compatible mode. + if im.mode in {"RGBA", "LA", "P"}: + im = im.convert("RGB") + else: + im = im.convert("RGB") + pil_images.append(im.copy()) + except Exception: + for im in pil_images: + try: + im.close() + except Exception: + pass + return None + + if not pil_images: + return None + + buf = io.BytesIO() + first, rest = pil_images[0], pil_images[1:] + try: + first.save(buf, format="PDF", save_all=True, append_images=rest) + return buf.getvalue() + except Exception: + return None + finally: + for im in pil_images: + try: + im.close() + except Exception: + pass + + def _looks_like_isbn(text: str) -> bool: t = (text or "").replace("-", "").strip() return t.isdigit() and len(t) in (10, 13) @@ -941,32 +989,22 @@ class OpenLibrary(SearchProvider): try: images = self._archive_download(session=session, n_threads=10, directory=temp_dir, links=links, scale=3, book_id=archive_id) - try: - import img2pdf # type: ignore - - pdf_bytes = img2pdf.convert(images) if images else None - if not pdf_bytes: - log("[openlibrary] PDF conversion failed", file=sys.stderr) - try: - shutil.rmtree(temp_dir) - except Exception: - pass - return None - - pdf_path = unique_path(output_dir / f"{title}.pdf") - with open(pdf_path, "wb") as f: - f.write(pdf_bytes) - - try: - shutil.rmtree(temp_dir) - except Exception: - pass - return pdf_path - - except ImportError: - # Keep images folder. + pdf_bytes = _image_paths_to_pdf_bytes(images) + if not pdf_bytes: + # Keep images folder for manual conversion. 
+ log("[openlibrary] PDF conversion failed; keeping images folder", file=sys.stderr) return Path(temp_dir) + pdf_path = unique_path(output_dir / f"{title}.pdf") + with open(pdf_path, "wb") as f: + f.write(pdf_bytes) + + try: + shutil.rmtree(temp_dir) + except Exception: + pass + return pdf_path + except Exception: try: shutil.rmtree(temp_dir) diff --git a/SYS/download.py b/SYS/download.py index 9ad8bb0..3ba1831 100644 --- a/SYS/download.py +++ b/SYS/download.py @@ -282,15 +282,8 @@ def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]: if opts.cookies_path and opts.cookies_path.is_file(): base_options["cookiefile"] = str(opts.cookies_path) else: - # Check global cookies file lazily to avoid import cycles - from hydrus_health_check import get_cookies_file_path # local import - - global_cookies = get_cookies_file_path() - if global_cookies: - base_options["cookiefile"] = global_cookies - else: - # Fallback to browser cookies - base_options["cookiesfrombrowser"] = ("chrome",) + # Fallback to browser cookies + base_options["cookiesfrombrowser"] = ("chrome",) # Add no-playlist option if specified (for single video from playlist url) if opts.no_playlist: @@ -453,21 +446,40 @@ def _get_libgen_download_url(libgen_url: str) -> Optional[str]: # Try to find actual download link in the page try: - from bs4 import BeautifulSoup - soup = BeautifulSoup(response.content, 'html.parser') - - # Look for download links - LibGen typically has forms with download buttons - # Look for all links and forms that might lead to download - for link in soup.find_all('a'): - href = link.get('href') - if href and isinstance(href, str): - # Look for direct file links or get.php redirects - if 'get.php' in href.lower() or href.endswith(('.pdf', '.epub', '.djvu', '.mobi')): - download_url = href if href.startswith('http') else urljoin(final_url, href) + try: + from lxml import html as lxml_html + except ImportError: + lxml_html = None + + if lxml_html is not None: + doc = lxml_html.fromstring(response.content) + for a in doc.xpath("//a[@href]"): + href = str(a.get("href") or "").strip() + if not href: + continue + + href_lower = href.lower() + if "get.php" in href_lower or href_lower.endswith((".pdf", ".epub", ".djvu", ".mobi")): + download_url = href if href.startswith("http") else urljoin(final_url, href) debug(f"Found download link: {download_url}") return download_url - except ImportError: - pass # BeautifulSoup not available + else: + # Regex fallback + for m in re.finditer( + r"href=[\"\']([^\"\']+)[\"\']", + response.text or "", + flags=re.IGNORECASE, + ): + href = str(m.group(1) or "").strip() + if not href or href.lower().startswith("javascript:"): + continue + href_lower = href.lower() + if "get.php" in href_lower or href_lower.endswith((".pdf", ".epub", ".djvu", ".mobi")): + download_url = href if href.startswith("http") else urljoin(final_url, href) + debug(f"Found download link: {download_url}") + return download_url + except Exception: + pass # If we followed redirects successfully, return the final URL # This handles cases where libgen redirects to a direct download mirror @@ -708,12 +720,7 @@ def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) -> "noprogress": True, # No progress bars } - # Add cookies if available (lazy import to avoid circular dependency) - from hydrus_health_check import get_cookies_file_path # local import - - global_cookies = get_cookies_file_path() - if global_cookies: - ydl_opts["cookiefile"] = global_cookies + # Cookies are optional for probing; 
callers should pass cookiefile via DownloadOptions when needed. # Add no_playlist option if specified if no_playlist: diff --git a/Store/HydrusNetwork.py b/Store/HydrusNetwork.py index d5a4f31..6387cba 100644 --- a/Store/HydrusNetwork.py +++ b/Store/HydrusNetwork.py @@ -23,6 +23,10 @@ class HydrusNetwork(Store): Maintains its own HydrusClient. """ + def _log_prefix(self) -> str: + store_name = getattr(self, "NAME", None) or "unknown" + return f"[hydrusnetwork:{store_name}]" + def __new__(cls, *args: Any, **kwargs: Any) -> "HydrusNetwork": instance = super().__new__(cls) name = kwargs.get("NAME") @@ -109,7 +113,7 @@ class HydrusNetwork(Store): raise RuntimeError(f"Hydrus '{self.NAME}' unavailable: {err}") from exc # Create a persistent client for this instance (auth via access key by default). - self._client = HydrusClient(url=self.URL, access_key=self.API) + self._client = HydrusClient(url=self.URL, access_key=self.API, instance_name=self.NAME) # Best-effort total count (fast on Hydrus side; does not fetch IDs/hashes). try: @@ -129,7 +133,7 @@ class HydrusNetwork(Store): if isinstance(count_val, int): self.total_count = count_val except Exception as exc: - debug(f"Hydrus total count unavailable for '{self.NAME}': {exc}", file=sys.stderr) + debug(f"{self._log_prefix()} total count unavailable: {exc}", file=sys.stderr) def name(self) -> str: return self.NAME @@ -167,7 +171,7 @@ class HydrusNetwork(Store): try: # Compute file hash file_hash = sha256_file(file_path) - debug(f"File hash: {file_hash}") + debug(f"{self._log_prefix()} file hash: {file_hash}") # Use persistent client with session key client = self._client @@ -177,21 +181,42 @@ class HydrusNetwork(Store): # Check if file already exists in Hydrus file_exists = False try: - metadata = client.fetch_file_metadata(hashes=[file_hash]) + metadata = client.fetch_file_metadata( + hashes=[file_hash], + include_service_keys_to_tags=False, + include_file_url=False, + include_duration=False, + include_size=False, + include_mime=False, + ) if metadata and isinstance(metadata, dict): - files = metadata.get("metadata", []) - if files: - file_exists = True - log( - f"ℹ️ Duplicate detected - file already in Hydrus with hash: {file_hash}", - file=sys.stderr, - ) + metas = metadata.get("metadata", []) + if isinstance(metas, list) and metas: + # Hydrus returns placeholder rows for unknown hashes. + # Only treat as a real duplicate if it has a concrete file_id. + for meta in metas: + if isinstance(meta, dict) and meta.get("file_id") is not None: + file_exists = True + break + if file_exists: + log( + f"ℹ️ Duplicate detected - file already in Hydrus with hash: {file_hash}", + file=sys.stderr, + ) except Exception: pass + # If Hydrus reports an existing file, it may be in trash. Best-effort restore it to 'my files'. + # This keeps behavior aligned with user expectation: "use API only" and ensure it lands in my files. 
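A condensed sketch of the dedupe-then-undelete flow this hunk implements, with client and file_path in scope as in the surrounding method (the fetch_file_metadata kwargs are trimmed for brevity):

file_hash = sha256_file(file_path)  # 64-char sha256 hex digest

payload = client.fetch_file_metadata(hashes=[file_hash])
metas = payload.get("metadata", []) if isinstance(payload, dict) else []

# Hydrus returns placeholder rows for unknown hashes, so only a row with a
# concrete file_id counts as a real duplicate.
already_present = any(isinstance(m, dict) and m.get("file_id") is not None for m in metas)

if already_present:
    # The existing copy may sit in the trash; undelete restores it to 'my files'.
    client.undelete_files([file_hash])
else:
    client.add_file(file_path)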
+ if file_exists: + try: + client.undelete_files([file_hash]) + except Exception: + pass + # Upload file if not already present if not file_exists: - log(f"Uploading to Hydrus: {file_path.name}", file=sys.stderr) + log(f"{self._log_prefix()} Uploading: {file_path.name}", file=sys.stderr) response = client.add_file(file_path) # Extract hash from response @@ -207,7 +232,7 @@ class HydrusNetwork(Store): raise Exception(f"Hydrus response missing file hash: {response}") file_hash = hydrus_hash - log(f"Hydrus: {file_hash}", file=sys.stderr) + log(f"{self._log_prefix()} hash: {file_hash}", file=sys.stderr) # Add tags if provided (both for new and existing files) if tag_list: @@ -218,27 +243,27 @@ class HydrusNetwork(Store): service_name = "my tags" try: - debug(f"Adding {len(tag_list)} tag(s) to Hydrus: {tag_list}") + debug(f"{self._log_prefix()} Adding {len(tag_list)} tag(s): {tag_list}") client.add_tag(file_hash, tag_list, service_name) - log(f"Tags added via '{service_name}'", file=sys.stderr) + log(f"{self._log_prefix()} Tags added via '{service_name}'", file=sys.stderr) except Exception as exc: - log(f"⚠️ Failed to add tags: {exc}", file=sys.stderr) + log(f"{self._log_prefix()} ⚠️ Failed to add tags: {exc}", file=sys.stderr) # Associate url if provided (both for new and existing files) if url: - log(f"Associating {len(url)} URL(s) with file", file=sys.stderr) + log(f"{self._log_prefix()} Associating {len(url)} URL(s) with file", file=sys.stderr) for url in url: if url: try: client.associate_url(file_hash, str(url)) - debug(f"Associated URL: {url}") + debug(f"{self._log_prefix()} Associated URL: {url}") except Exception as exc: - log(f"⚠️ Failed to associate URL {url}: {exc}", file=sys.stderr) + log(f"{self._log_prefix()} ⚠️ Failed to associate URL {url}: {exc}", file=sys.stderr) return file_hash except Exception as exc: - log(f"❌ Hydrus upload failed: {exc}", file=sys.stderr) + log(f"{self._log_prefix()} ❌ upload failed: {exc}", file=sys.stderr) raise def search(self, query: str, **kwargs: Any) -> list[Dict[str, Any]]: @@ -262,7 +287,8 @@ class HydrusNetwork(Store): if client is None: raise Exception("Hydrus client unavailable") - debug(f"Searching Hydrus for: {query}") + prefix = self._log_prefix() + debug(f"{prefix} Searching for: {query}") def _extract_urls(meta_obj: Any) -> list[str]: if not isinstance(meta_obj, dict): @@ -446,7 +472,7 @@ class HydrusNetwork(Store): tags = [query_lower] if not tags: - debug(f"Found 0 result(s)") + debug(f"{prefix} 0 result(s)") return [] # Search files with the tags (unless url: search already produced metadata) @@ -465,7 +491,7 @@ class HydrusNetwork(Store): hashes = search_result.get("hashes", []) if isinstance(search_result, dict) else [] if not file_ids and not hashes: - debug(f"Found 0 result(s)") + debug(f"{prefix} 0 result(s)") return [] if file_ids: @@ -595,7 +621,7 @@ class HydrusNetwork(Store): "ext": ext, }) - debug(f"Found {len(results)} result(s)") + debug(f"{prefix} {len(results)} result(s)") return results[:limit] except Exception as exc: @@ -611,13 +637,13 @@ class HydrusNetwork(Store): Only explicit user actions (e.g. the get-file cmdlet) should open files. 
""" - debug(f"[HydrusNetwork.get_file] Starting for hash: {file_hash[:12]}...") + debug(f"{self._log_prefix()} get_file: start hash={file_hash[:12]}...") # Build browser URL with access key base_url = str(self.URL).rstrip('/') access_key = str(self.API) browser_url = f"{base_url}/get_files/file?hash={file_hash}&Hydrus-Client-API-Access-Key={access_key}" - debug(f"[HydrusNetwork.get_file] Returning URL: {browser_url}") + debug(f"{self._log_prefix()} get_file: url={browser_url}") return browser_url def get_metadata(self, file_hash: str, **kwargs: Any) -> Optional[Dict[str, Any]]: @@ -632,16 +658,27 @@ class HydrusNetwork(Store): try: client = self._client if not client: - debug("get_metadata: Hydrus client unavailable") + debug(f"{self._log_prefix()} get_metadata: client unavailable") return None - # Fetch file metadata - payload = client.fetch_file_metadata(hashes=[file_hash], include_service_keys_to_tags=True) + # Fetch file metadata with the fields we need for CLI display. + payload = client.fetch_file_metadata( + hashes=[file_hash], + include_service_keys_to_tags=True, + include_file_url=True, + include_duration=True, + include_size=True, + include_mime=True, + ) if not payload or not payload.get("metadata"): return None meta = payload["metadata"][0] + + # Hydrus can return placeholder metadata rows for unknown hashes. + if not isinstance(meta, dict) or meta.get("file_id") is None: + return None # Extract title from tags title = f"Hydrus_{file_hash[:12]}" @@ -660,33 +697,109 @@ class HydrusNetwork(Store): if title != f"Hydrus_{file_hash[:12]}": break - # Prefer Hydrus-provided extension (e.g. ".webm"); fall back to MIME map if needed. - mime_type = meta.get("mime", "") - ext_raw = meta.get("ext") - ext = str(ext_raw or "").strip().lstrip(".") - if not ext and mime_type: + # Hydrus may return mime as an int enum, or sometimes a human label. + mime_val = meta.get("mime") + filetype_human = meta.get("filetype_human") or meta.get("mime_human") or meta.get("mime_string") + + # Determine ext: prefer Hydrus metadata ext, then filetype_human (when it looks like an ext), + # then title suffix, then file path suffix. + ext = str(meta.get("ext") or "").strip().lstrip(".") + if not ext: + ft = str(filetype_human or "").strip().lstrip(".").lower() + if ft and ft != "unknown filetype" and ft.isalnum() and len(ft) <= 8: + # Treat simple labels like "mp4", "m4a", "webm" as extensions. + ext = ft + if not ext and isinstance(title, str) and "." in title: try: - from SYS.utils_constant import mime_maps - for category in mime_maps.values(): - for _ext_key, info in category.items(): - if mime_type in info.get("mimes", []): - ext = str(info.get("ext", "")).strip().lstrip(".") - break - if ext: - break + ext = Path(title).suffix.lstrip(".") except Exception: ext = "" + if not ext: + try: + path_payload = client.get_file_path(file_hash) + if isinstance(path_payload, dict): + p = path_payload.get("path") + if isinstance(p, str) and p.strip(): + ext = Path(p.strip()).suffix.lstrip(".") + except Exception: + ext = "" + + # If extension is still unknown, attempt a best-effort lookup from MIME. 
+ def _mime_from_ext(ext_value: str) -> str: + ext_clean = str(ext_value or "").strip().lstrip(".").lower() + if not ext_clean: + return "" + try: + for category in mime_maps.values(): + info = category.get(ext_clean) + if isinstance(info, dict): + mimes = info.get("mimes") + if isinstance(mimes, list) and mimes: + first = mimes[0] + return str(first) + except Exception: + return "" + return "" + + # Normalize to a MIME string for CLI output. + # Avoid passing through human labels like "unknown filetype". + mime_type = "" + if isinstance(mime_val, str): + candidate = mime_val.strip() + if "/" in candidate and candidate.lower() != "unknown filetype": + mime_type = candidate + if not mime_type and isinstance(filetype_human, str): + candidate = filetype_human.strip() + if "/" in candidate and candidate.lower() != "unknown filetype": + mime_type = candidate + if not mime_type: + mime_type = _mime_from_ext(ext) + + # Normalize size/duration to stable scalar types. + size_val = meta.get("size") + if size_val is None: + size_val = meta.get("size_bytes") + try: + size_int: int | None = int(size_val) if size_val is not None else None + except Exception: + size_int = None + + dur_val = meta.get("duration") + if dur_val is None: + dur_val = meta.get("duration_ms") + try: + dur_int: int | None = int(dur_val) if dur_val is not None else None + except Exception: + dur_int = None + raw_urls = ( + meta.get("known_urls") + or meta.get("urls") + or meta.get("url") + or [] + ) + url_list: list[str] = [] + if isinstance(raw_urls, str): + s = raw_urls.strip() + url_list = [s] if s else [] + elif isinstance(raw_urls, list): + url_list = [str(u).strip() for u in raw_urls if isinstance(u, str) and str(u).strip()] + return { "hash": file_hash, "title": title, "ext": ext, - "size": meta.get("size"), + "size": size_int, "mime": mime_type, + # Keep raw fields available for troubleshooting/other callers. 
+ "hydrus_mime": mime_val, + "filetype_human": filetype_human, + "duration_ms": dur_int, + "url": url_list, } except Exception as exc: - debug(f"Failed to get metadata from Hydrus: {exc}") + debug(f"{self._log_prefix()} get_metadata failed: {exc}") return None def get_tag(self, file_identifier: str, **kwargs: Any) -> Tuple[List[str], str]: @@ -705,13 +818,13 @@ class HydrusNetwork(Store): file_hash = str(file_identifier or "").strip().lower() if len(file_hash) != 64 or not all(ch in "0123456789abcdef" for ch in file_hash): - debug(f"get_tags: invalid file hash '{file_identifier}'") + debug(f"{self._log_prefix()} get_tags: invalid file hash '{file_identifier}'") return [], "unknown" # Get Hydrus client and service info client = self._client if not client: - debug("get_tags: Hydrus client unavailable") + debug(f"{self._log_prefix()} get_tags: client unavailable") return [], "unknown" # Fetch file metadata @@ -723,12 +836,12 @@ class HydrusNetwork(Store): items = payload.get("metadata") if isinstance(payload, dict) else None if not isinstance(items, list) or not items: - debug(f"get_tags: No metadata returned for hash {file_hash}") + debug(f"{self._log_prefix()} get_tags: no metadata for hash {file_hash}") return [], "unknown" meta = items[0] if isinstance(items[0], dict) else None if not isinstance(meta, dict) or meta.get("file_id") is None: - debug(f"get_tags: Invalid metadata for hash {file_hash}") + debug(f"{self._log_prefix()} get_tags: invalid metadata for hash {file_hash}") return [], "unknown" # Extract tags using service name @@ -741,7 +854,7 @@ class HydrusNetwork(Store): return tags, "hydrus" except Exception as exc: - debug(f"get_tags failed for Hydrus file: {exc}") + debug(f"{self._log_prefix()} get_tags failed: {exc}") return [], "unknown" def add_tag(self, file_identifier: str, tags: List[str], **kwargs: Any) -> bool: @@ -750,12 +863,12 @@ class HydrusNetwork(Store): try: client = self._client if client is None: - debug("add_tag: Hydrus client unavailable") + debug(f"{self._log_prefix()} add_tag: client unavailable") return False file_hash = str(file_identifier or "").strip().lower() if len(file_hash) != 64 or not all(ch in "0123456789abcdef" for ch in file_hash): - debug(f"add_tag: invalid file hash '{file_identifier}'") + debug(f"{self._log_prefix()} add_tag: invalid file hash '{file_identifier}'") return False service_name = kwargs.get("service_name") or "my tags" # Ensure tags is a list @@ -765,7 +878,7 @@ class HydrusNetwork(Store): client.add_tag(file_hash, tag_list, service_name) return True except Exception as exc: - debug(f"Hydrus add_tag failed: {exc}") + debug(f"{self._log_prefix()} add_tag failed: {exc}") return False def delete_tag(self, file_identifier: str, tags: List[str], **kwargs: Any) -> bool: @@ -774,12 +887,12 @@ class HydrusNetwork(Store): try: client = self._client if client is None: - debug("delete_tag: Hydrus client unavailable") + debug(f"{self._log_prefix()} delete_tag: client unavailable") return False file_hash = str(file_identifier or "").strip().lower() if len(file_hash) != 64 or not all(ch in "0123456789abcdef" for ch in file_hash): - debug(f"delete_tag: invalid file hash '{file_identifier}'") + debug(f"{self._log_prefix()} delete_tag: invalid file hash '{file_identifier}'") return False service_name = kwargs.get("service_name") or "my tags" tag_list = list(tags) if isinstance(tags, (list, tuple)) else [str(tags)] @@ -788,7 +901,7 @@ class HydrusNetwork(Store): client.delete_tag(file_hash, tag_list, service_name) return True except Exception as 
exc: - debug(f"Hydrus delete_tag failed: {exc}") + debug(f"{self._log_prefix()} delete_tag failed: {exc}") return False def get_url(self, file_identifier: str, **kwargs: Any) -> List[str]: @@ -797,7 +910,7 @@ class HydrusNetwork(Store): try: client = self._client if client is None: - debug("get_url: Hydrus client unavailable") + debug(f"{self._log_prefix()} get_url: client unavailable") return [] file_hash = str(file_identifier or "").strip().lower() @@ -830,7 +943,7 @@ class HydrusNetwork(Store): return out return [] except Exception as exc: - debug(f"Hydrus get_url failed: {exc}") + debug(f"{self._log_prefix()} get_url failed: {exc}") return [] def add_url(self, file_identifier: str, url: List[str], **kwargs: Any) -> bool: @@ -839,13 +952,13 @@ class HydrusNetwork(Store): try: client = self._client if client is None: - debug("add_url: Hydrus client unavailable") + debug(f"{self._log_prefix()} add_url: client unavailable") return False for u in url: client.associate_url(file_identifier, u) return True except Exception as exc: - debug(f"Hydrus add_url failed: {exc}") + debug(f"{self._log_prefix()} add_url failed: {exc}") return False def delete_url(self, file_identifier: str, url: List[str], **kwargs: Any) -> bool: @@ -854,13 +967,13 @@ class HydrusNetwork(Store): try: client = self._client if client is None: - debug("delete_url: Hydrus client unavailable") + debug(f"{self._log_prefix()} delete_url: client unavailable") return False for u in url: client.delete_url(file_identifier, u) return True except Exception as exc: - debug(f"Hydrus delete_url failed: {exc}") + debug(f"{self._log_prefix()} delete_url failed: {exc}") return False def get_note(self, file_identifier: str, **kwargs: Any) -> Dict[str, str]: @@ -868,7 +981,7 @@ class HydrusNetwork(Store): try: client = self._client if client is None: - debug("get_note: Hydrus client unavailable") + debug(f"{self._log_prefix()} get_note: client unavailable") return {} file_hash = str(file_identifier or "").strip().lower() @@ -889,7 +1002,7 @@ class HydrusNetwork(Store): return {} except Exception as exc: - debug(f"Hydrus get_note failed: {exc}") + debug(f"{self._log_prefix()} get_note failed: {exc}") return {} def set_note(self, file_identifier: str, name: str, text: str, **kwargs: Any) -> bool: @@ -897,7 +1010,7 @@ class HydrusNetwork(Store): try: client = self._client if client is None: - debug("set_note: Hydrus client unavailable") + debug(f"{self._log_prefix()} set_note: client unavailable") return False file_hash = str(file_identifier or "").strip().lower() @@ -912,7 +1025,7 @@ class HydrusNetwork(Store): client.set_notes(file_hash, {note_name: note_text}) return True except Exception as exc: - debug(f"Hydrus set_note failed: {exc}") + debug(f"{self._log_prefix()} set_note failed: {exc}") return False def delete_note(self, file_identifier: str, name: str, **kwargs: Any) -> bool: @@ -920,7 +1033,7 @@ class HydrusNetwork(Store): try: client = self._client if client is None: - debug("delete_note: Hydrus client unavailable") + debug(f"{self._log_prefix()} delete_note: client unavailable") return False file_hash = str(file_identifier or "").strip().lower() @@ -934,7 +1047,7 @@ class HydrusNetwork(Store): client.delete_notes(file_hash, [note_name]) return True except Exception as exc: - debug(f"Hydrus delete_note failed: {exc}") + debug(f"{self._log_prefix()} delete_note failed: {exc}") return False @staticmethod diff --git a/cmdlet/add_file.py b/cmdlet/add_file.py index 76bc9bf..abbaede 100644 --- a/cmdlet/add_file.py +++ b/cmdlet/add_file.py 
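With the Store/HydrusNetwork.get_metadata changes above, callers receive a normalized dict with stable scalar types. A small consumption sketch, assuming backend is a configured HydrusNetwork store instance and the hash is a placeholder; the keys match the return value built in that hunk:

meta = backend.get_metadata("a" * 64)  # placeholder sha256 hex
if meta is not None:
    size_mb = (meta["size"] or 0) / (1024 * 1024)  # "size" is an int or None
    seconds = (meta["duration_ms"] or 0) / 1000    # assuming milliseconds, as the key name suggests
    filename = f"{meta['title']}.{meta['ext']}" if meta["ext"] else meta["title"]
    print(filename, meta["mime"], f"{size_mb:.1f} MiB", meta["url"])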
@@ -6,6 +6,7 @@ import sys import shutil import tempfile import re +from urllib.parse import urlsplit, parse_qs import models import pipeline as ctx @@ -13,12 +14,20 @@ from API import HydrusNetwork as hydrus_wrapper from SYS.logger import log, debug from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS from Store import Store -from ._shared import ( - Cmdlet, CmdletArg, parse_cmdlet_args, SharedArgs, - extract_tag_from_result, extract_title_from_result, extract_url_from_result, - merge_sequences, extract_relationships, extract_duration, coerce_to_pipe_object -) -from ._shared import collapse_namespace_tag +from . import _shared as sh + +Cmdlet = sh.Cmdlet +CmdletArg = sh.CmdletArg +parse_cmdlet_args = sh.parse_cmdlet_args +SharedArgs = sh.SharedArgs +extract_tag_from_result = sh.extract_tag_from_result +extract_title_from_result = sh.extract_title_from_result +extract_url_from_result = sh.extract_url_from_result +merge_sequences = sh.merge_sequences +extract_relationships = sh.extract_relationships +extract_duration = sh.extract_duration +coerce_to_pipe_object = sh.coerce_to_pipe_object +collapse_namespace_tag = sh.collapse_namespace_tag from API.folder import read_sidecar, find_sidecar, write_sidecar, API_folder_store from SYS.utils import sha256_file, unique_path from metadata import write_metadata @@ -181,7 +190,7 @@ class Add_File(Cmdlet): downloaded_path = Path(downloaded) if downloaded_path.exists() and downloaded_path.is_dir(): log( - "[add-file] OpenLibrary download produced a directory (missing img2pdf?). Cannot ingest.", + "[add-file] OpenLibrary download produced a directory (PDF conversion failed). Cannot ingest.", file=sys.stderr, ) failures += 1 @@ -195,12 +204,32 @@ class Add_File(Cmdlet): if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith( ("http://", "https://", "magnet:", "torrent:") ): - code = self._delegate_to_download_data(item, media_path_or_url, location, provider_name, args, config) - if code == 0: - successes += 1 - else: - failures += 1 - continue + # Hydrus file URLs are direct file downloads and may require Hydrus auth headers. + # If the user provided a destination (-provider or -store), download now and continue. + if (provider_name or location) and isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(("http://", "https://")): + downloaded = self._try_download_hydrus_file_url( + file_url=str(media_path_or_url), + pipe_obj=pipe_obj, + config=config, + ) + if downloaded is not None: + downloaded_path, downloaded_temp_dir = downloaded + temp_dir_to_cleanup = downloaded_temp_dir + media_path_or_url = str(downloaded_path) + pipe_obj.path = str(downloaded_path) + pipe_obj.is_temp = True + delete_after_item = True + + # If it's still a URL target, fall back to the legacy delegate. + if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith( + ("http://", "https://", "magnet:", "torrent:") + ): + code = self._delegate_to_download_data(item, media_path_or_url, location, provider_name, args, config) + if code == 0: + successes += 1 + else: + failures += 1 + continue media_path = Path(media_path_or_url) if isinstance(media_path_or_url, str) else media_path_or_url @@ -767,6 +796,134 @@ class Add_File(Cmdlet): return True return False + @staticmethod + def _sanitize_filename(value: str) -> str: + # Minimal Windows-safe filename sanitization. 
+ text = str(value or "").strip() + if not text: + return "file" + invalid = '<>:"/\\|?*' + text = "".join("_" if (ch in invalid or ord(ch) < 32) else ch for ch in text) + text = re.sub(r"\s+", " ", text).strip(" .") + return text or "file" + + @staticmethod + def _parse_hydrus_file_url(file_url: str) -> Optional[str]: + """Return the sha256 hash from a Hydrus /get_files/file URL, or None.""" + try: + split = urlsplit(str(file_url)) + if split.scheme.lower() not in {"http", "https"}: + return None + path_lower = (split.path or "").lower() + if "/get_files/file" not in path_lower: + return None + params = parse_qs(split.query or "") + raw = None + if "hash" in params and params["hash"]: + raw = params["hash"][0] + if not raw: + return None + hash_val = str(raw).strip().lower() + if not re.fullmatch(r"[0-9a-f]{64}", hash_val): + return None + return hash_val + except Exception: + return None + + def _try_download_hydrus_file_url( + self, + *, + file_url: str, + pipe_obj: models.PipeObject, + config: Dict[str, Any], + ) -> Optional[tuple[Path, Path]]: + """If *file_url* is a Hydrus file URL, download it to temp and return (path, temp_dir).""" + file_hash = self._parse_hydrus_file_url(file_url) + if not file_hash: + return None + + # Resolve Hydrus backend for auth. + store_name = str(getattr(pipe_obj, "store", "") or "").strip() + if ":" in store_name: + store_name = store_name.split(":", 1)[-1].strip() + + backend = None + try: + store_registry = Store(config) + if store_name and store_registry.is_available(store_name): + candidate = store_registry[store_name] + if type(candidate).__name__.lower() == "hydrusnetwork": + backend = candidate + except Exception: + backend = None + + if backend is None: + try: + store_registry = Store(config) + target_prefix = str(file_url).split("/get_files/file", 1)[0].rstrip("/") + for backend_name in store_registry.list_backends(): + candidate = store_registry[backend_name] + if type(candidate).__name__.lower() != "hydrusnetwork": + continue + base_url = str(getattr(candidate, "URL", "") or "").rstrip("/") + if base_url and (target_prefix.lower() == base_url.lower() or target_prefix.lower().startswith(base_url.lower())): + backend = candidate + break + except Exception: + backend = None + + if backend is None: + debug("[add-file] Hydrus file URL detected but no Hydrus backend matched for auth") + return None + + api_key = str(getattr(backend, "API", "") or "").strip() + if not api_key: + debug(f"[add-file] Hydrus backend '{getattr(backend, 'NAME', '') or store_name}' missing API key") + return None + + # Best-effort filename from title + ext. 
+ ext = "" + try: + if isinstance(pipe_obj.extra, dict): + ext = str(pipe_obj.extra.get("ext") or "").strip().lstrip(".") + except Exception: + ext = "" + if not ext: + ext = "bin" + + title_hint = str(getattr(pipe_obj, "title", "") or "").strip() + base_name = self._sanitize_filename(title_hint) if title_hint else f"hydrus_{file_hash[:12]}" + + temp_dir = Path(tempfile.mkdtemp(prefix="medios_hydrus_")) + destination = unique_path(temp_dir / f"{base_name}.{ext}") + + headers = {"Hydrus-Client-API-Access-Key": api_key} + timeout = 60.0 + try: + client = getattr(backend, "_client", None) + timeout_val = getattr(client, "timeout", None) + if timeout_val is not None: + timeout = float(timeout_val) + except Exception: + timeout = 60.0 + + try: + log( + f"[add-file] Downloading Hydrus file via API ({getattr(backend, 'NAME', '') or store_name})", + file=sys.stderr, + ) + downloaded_bytes = hydrus_wrapper.download_hydrus_file(str(file_url), headers, destination, timeout) + if downloaded_bytes <= 0 and not destination.exists(): + return None + return destination, temp_dir + except Exception as exc: + log(f"[add-file] Hydrus download failed: {exc}", file=sys.stderr) + try: + shutil.rmtree(temp_dir, ignore_errors=True) + except Exception: + pass + return None + def _delegate_to_download_data( self, result: Any, @@ -883,6 +1040,61 @@ class Add_File(Cmdlet): except Exception: return None + @staticmethod + def _get_note_text(result: Any, pipe_obj: models.PipeObject, note_name: str) -> Optional[str]: + """Extract a named note text from a piped item. + + Supports: + - pipe_obj.extra["notes"][note_name] + - result["notes"][note_name] for dict results + - pipe_obj.extra[note_name] / result[note_name] as fallback + """ + + def _normalize(val: Any) -> Optional[str]: + if val is None: + return None + if isinstance(val, bytes): + try: + val = val.decode("utf-8", errors="ignore") + except Exception: + val = str(val) + if isinstance(val, str): + text = val.strip() + return text if text else None + try: + text = str(val).strip() + return text if text else None + except Exception: + return None + + note_key = str(note_name or "").strip() + if not note_key: + return None + + # Prefer notes dict on PipeObject.extra (common for cmdlet-emitted dicts) + try: + if isinstance(pipe_obj.extra, dict): + notes_val = pipe_obj.extra.get("notes") + if isinstance(notes_val, dict) and note_key in notes_val: + return _normalize(notes_val.get(note_key)) + if note_key in pipe_obj.extra: + return _normalize(pipe_obj.extra.get(note_key)) + except Exception: + pass + + # Fallback to raw result dict + if isinstance(result, dict): + try: + notes_val = result.get("notes") + if isinstance(notes_val, dict) and note_key in notes_val: + return _normalize(notes_val.get(note_key)) + if note_key in result: + return _normalize(result.get(note_key)) + except Exception: + pass + + return None + @staticmethod def _update_pipe_object_destination( pipe_obj: models.PipeObject, @@ -1451,6 +1663,26 @@ class Add_File(Cmdlet): except Exception: pass + # If a subtitle note was provided upstream (e.g., download-media writes notes.sub), + # persist it automatically like add-note would. 
+ sub_note = Add_File._get_note_text(result, pipe_obj, "sub") + if sub_note: + try: + setter = getattr(backend, "set_note", None) + if callable(setter): + setter(resolved_hash, "sub", sub_note) + except Exception: + pass + + chapters_note = Add_File._get_note_text(result, pipe_obj, "chapters") + if chapters_note: + try: + setter = getattr(backend, "set_note", None) + if callable(setter): + setter(resolved_hash, "chapters", chapters_note) + except Exception: + pass + meta: Dict[str, Any] = {} try: meta = backend.get_metadata(resolved_hash) or {} diff --git a/cmdlet/add_note.py b/cmdlet/add_note.py index 446ef02..9a97da0 100644 --- a/cmdlet/add_note.py +++ b/cmdlet/add_note.py @@ -7,15 +7,15 @@ import sys from SYS.logger import log import pipeline as ctx -from ._shared import ( - Cmdlet, - CmdletArg, - SharedArgs, - normalize_hash, - parse_cmdlet_args, - normalize_result_input, - should_show_help, -) +from . import _shared as sh + +Cmdlet = sh.Cmdlet +CmdletArg = sh.CmdletArg +SharedArgs = sh.SharedArgs +normalize_hash = sh.normalize_hash +parse_cmdlet_args = sh.parse_cmdlet_args +normalize_result_input = sh.normalize_result_input +should_show_help = sh.should_show_help from Store import Store from SYS.utils import sha256_file @@ -84,9 +84,9 @@ class Add_Note(Cmdlet): else: note_text = str(text_parts or "").strip() - if not note_text: - log("[add_note] Error: Empty note text", file=sys.stderr) - return 1 + # Note text can be omitted when upstream stages provide it (e.g. download-media --write-sub + # attaches notes.sub). In that case we resolve per-item below. + user_provided_text = bool(note_text) results = normalize_result_input(result) if not results: @@ -99,11 +99,56 @@ class Add_Note(Cmdlet): store_registry = Store(config) updated = 0 + # Optional global fallback for note text from pipeline values. + # Allows patterns like: ... | add-note sub + pipeline_default_text = None + if not user_provided_text: + try: + pipeline_default_text = ctx.load_value(note_name) + except Exception: + pipeline_default_text = None + if isinstance(pipeline_default_text, list): + pipeline_default_text = " ".join([str(x) for x in pipeline_default_text]).strip() + elif pipeline_default_text is not None: + pipeline_default_text = str(pipeline_default_text).strip() + for res in results: if not isinstance(res, dict): ctx.emit(res) continue + # Resolve note text for this item when not provided explicitly. + item_note_text = note_text + if not user_provided_text: + # Prefer item-scoped notes dict. + candidate = None + try: + notes = res.get("notes") + if isinstance(notes, dict): + candidate = notes.get(note_name) + except Exception: + candidate = None + + # Also allow direct field fallback: res["sub"], etc. 
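+ # e.g. `... | add-note sub` works when the piped dict carries either
+ # res["notes"]["sub"] or a top-level res["sub"] value.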
+ if candidate is None: + try: + candidate = res.get(note_name) + except Exception: + candidate = None + + if candidate is None: + candidate = pipeline_default_text + + if isinstance(candidate, list): + item_note_text = " ".join([str(x) for x in candidate]).strip() + else: + item_note_text = str(candidate or "").strip() + + if not item_note_text: + log(f"[add_note] Warning: No note text found for '{note_name}'; skipping", file=sys.stderr) + ctx.emit(res) + continue + store_name = str(store_override or res.get("store") or "").strip() raw_hash = res.get("hash") raw_path = res.get("path") @@ -130,7 +175,7 @@ class Add_Note(Cmdlet): ok = False try: - ok = bool(backend.set_note(resolved_hash, note_name, note_text, config=config)) + ok = bool(backend.set_note(resolved_hash, note_name, item_note_text, config=config)) except Exception as exc: log(f"[add_note] Error: Failed to set note: {exc}", file=sys.stderr) ok = False diff --git a/cmdlet/add_relationship.py b/cmdlet/add_relationship.py index fbaa51c..f70759f 100644 --- a/cmdlet/add_relationship.py +++ b/cmdlet/add_relationship.py @@ -11,7 +11,15 @@ from SYS.logger import log import pipeline as ctx from API import HydrusNetwork as hydrus_wrapper -from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, normalize_result_input, should_show_help, get_field +from . import _shared as sh + +Cmdlet = sh.Cmdlet +CmdletArg = sh.CmdletArg +SharedArgs = sh.SharedArgs +parse_cmdlet_args = sh.parse_cmdlet_args +normalize_result_input = sh.normalize_result_input +should_show_help = sh.should_show_help +get_field = sh.get_field from API.folder import read_sidecar, find_sidecar, API_folder_store from Store import Store diff --git a/cmdlet/add_tag.py b/cmdlet/add_tag.py index 91d477f..d63e8fa 100644 --- a/cmdlet/add_tag.py +++ b/cmdlet/add_tag.py @@ -8,19 +8,20 @@ from SYS.logger import log import models import pipeline as ctx -from ._shared import normalize_result_input, filter_results_by_temp -from ._shared import ( - Cmdlet, - CmdletArg, - SharedArgs, - normalize_hash, - parse_tag_arguments, - expand_tag_groups, - parse_cmdlet_args, - collapse_namespace_tag, - should_show_help, - get_field, -) +from . import _shared as sh + +normalize_result_input = sh.normalize_result_input +filter_results_by_temp = sh.filter_results_by_temp +Cmdlet = sh.Cmdlet +CmdletArg = sh.CmdletArg +SharedArgs = sh.SharedArgs +normalize_hash = sh.normalize_hash +parse_tag_arguments = sh.parse_tag_arguments +expand_tag_groups = sh.expand_tag_groups +parse_cmdlet_args = sh.parse_cmdlet_args +collapse_namespace_tag = sh.collapse_namespace_tag +should_show_help = sh.should_show_help +get_field = sh.get_field from Store import Store from SYS.utils import sha256_file diff --git a/cmdlet/add_tags.py b/cmdlet/add_tags.py index bc5a856..9fb85f8 100644 --- a/cmdlet/add_tags.py +++ b/cmdlet/add_tags.py @@ -8,19 +8,20 @@ from SYS.logger import log import models import pipeline as ctx -from ._shared import normalize_result_input, filter_results_by_temp -from ._shared import ( - Cmdlet, - CmdletArg, - SharedArgs, - normalize_hash, - parse_tag_arguments, - expand_tag_groups, - parse_cmdlet_args, - collapse_namespace_tags, - should_show_help, - get_field, -) +from . 
import _shared as sh + +normalize_result_input = sh.normalize_result_input +filter_results_by_temp = sh.filter_results_by_temp +Cmdlet = sh.Cmdlet +CmdletArg = sh.CmdletArg +SharedArgs = sh.SharedArgs +normalize_hash = sh.normalize_hash +parse_tag_arguments = sh.parse_tag_arguments +expand_tag_groups = sh.expand_tag_groups +parse_cmdlet_args = sh.parse_cmdlet_args +collapse_namespace_tags = sh.collapse_namespace_tags +should_show_help = sh.should_show_help +get_field = sh.get_field from Store import Store from SYS.utils import sha256_file diff --git a/cmdlet/add_url.py b/cmdlet/add_url.py index afccabe..f2d5bd5 100644 --- a/cmdlet/add_url.py +++ b/cmdlet/add_url.py @@ -4,12 +4,12 @@ from typing import Any, Dict, Sequence import sys import pipeline as ctx -from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash +from . import _shared as sh from SYS.logger import log from Store import Store -class Add_Url(Cmdlet): +class Add_Url(sh.Cmdlet): """Add URL associations to files via hash+store.""" def __init__(self) -> None: @@ -18,9 +18,9 @@ class Add_Url(Cmdlet): summary="Associate a URL with a file", usage="@1 | add-url ", arg=[ - SharedArgs.HASH, - SharedArgs.STORE, - CmdletArg("url", required=True, description="URL to associate"), + sh.SharedArgs.HASH, + sh.SharedArgs.STORE, + sh.CmdletArg("url", required=True, description="URL to associate"), ], detail=[ "- Associates URL with file identified by hash+store", @@ -32,11 +32,11 @@ class Add_Url(Cmdlet): def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: """Add URL to file via hash+store backend.""" - parsed = parse_cmdlet_args(args, self) + parsed = sh.parse_cmdlet_args(args, self) # Extract hash and store from result or args - file_hash = parsed.get("hash") or get_field(result, "hash") - store_name = parsed.get("store") or get_field(result, "store") + file_hash = parsed.get("hash") or sh.get_field(result, "hash") + store_name = parsed.get("store") or sh.get_field(result, "store") url_arg = parsed.get("url") if not file_hash: @@ -52,7 +52,7 @@ class Add_Url(Cmdlet): return 1 # Normalize hash - file_hash = normalize_hash(file_hash) + file_hash = sh.normalize_hash(file_hash) if not file_hash: log("Error: Invalid hash format") return 1 diff --git a/cmdlet/check_file_status.py b/cmdlet/check_file_status.py deleted file mode 100644 index ffacac4..0000000 --- a/cmdlet/check_file_status.py +++ /dev/null @@ -1,190 +0,0 @@ -from __future__ import annotations - -from typing import Any, Dict, Sequence -import json -import sys - -from SYS.logger import log - -from API import HydrusNetwork as hydrus_wrapper -from ._shared import Cmdlet, CmdletArg, SharedArgs, normalize_hash, should_show_help -from Store import Store - - -CMDLET = Cmdlet( - name="check-file-status", - summary="Check if a file is active, deleted, or corrupted in Hydrus.", - usage="check-file-status [-hash ] [-store ]", - arg=[ - SharedArgs.HASH, - SharedArgs.STORE, - ], - detail=[ - "- Shows whether file is active in Hydrus or marked as deleted", - "- Detects corrupted data (e.g., comma-separated url)", - "- Displays file metadata and service locations", - "- Note: Hydrus keeps deleted files for recovery. 
Use cleanup-corrupted for full removal.", - ], -) - - -def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: - # Help - if should_show_help(args): - log(f"Cmdlet: {CMDLET.name}\nSummary: {CMDLET.summary}\nUsage: {CMDLET.usage}") - return 0 - - # Parse arguments - override_hash: str | None = None - override_store: str | None = None - i = 0 - while i < len(args): - token = args[i] - low = str(token).lower() - if low in {"-hash", "--hash", "hash"} and i + 1 < len(args): - override_hash = str(args[i + 1]).strip() - i += 2 - continue - if low in {"-store", "--store", "store"} and i + 1 < len(args): - override_store = str(args[i + 1]).strip() - i += 2 - continue - i += 1 - - store_name: str | None = override_store - if not store_name: - if isinstance(result, dict): - store_name = str(result.get("store") or "").strip() or None - else: - store_name = str(getattr(result, "store", "") or "").strip() or None - - if override_hash: - hash_hex = normalize_hash(override_hash) - else: - if isinstance(result, dict): - hash_hex = normalize_hash(result.get("hash") or result.get("hash_hex")) - else: - hash_hex = normalize_hash(getattr(result, "hash", None) or getattr(result, "hash_hex", None)) - - if not hash_hex: - log("No hash provided and no result selected", file=sys.stderr) - return 1 - - try: - client = None - if store_name: - # Store specified: do not fall back to a global/default Hydrus client. - try: - store = Store(config) - backend = store[str(store_name)] - candidate = getattr(backend, "_client", None) - if candidate is not None and hasattr(candidate, "fetch_file_metadata"): - client = candidate - except Exception: - client = None - - if client is None: - log(f"Hydrus client unavailable for store '{store_name}'", file=sys.stderr) - return 1 - else: - client = hydrus_wrapper.get_client(config) - - if client is None: - log("Hydrus client unavailable", file=sys.stderr) - return 1 - except Exception as exc: - log(f"Hydrus client unavailable: {exc}", file=sys.stderr) - return 1 - - try: - result_data = client.fetch_file_metadata(hashes=[hash_hex]) - if not result_data.get("metadata"): - log(f"File not found: {hash_hex[:16]}...", file=sys.stderr) - return 1 - - file_info = result_data["metadata"][0] - - # Status summary - is_deleted = file_info.get("is_deleted", False) - is_local = file_info.get("is_local", False) - is_trashed = file_info.get("is_trashed", False) - - status_str = "DELETED" if is_deleted else ("TRASHED" if is_trashed else "ACTIVE") - log(f"File status: {status_str}", file=sys.stderr) - - # File info - log(f"\n📄 File Information:", file=sys.stderr) - log(f" Hash: {file_info['hash'][:16]}...", file=sys.stderr) - log(f" Size: {file_info['size']:,} bytes", file=sys.stderr) - log(f" MIME: {file_info['mime']}", file=sys.stderr) - log(f" Dimensions: {file_info.get('width', '?')}x{file_info.get('height', '?')}", file=sys.stderr) - - # Service status - file_services = file_info.get("file_services", {}) - current_services = file_services.get("current", {}) - deleted_services = file_services.get("deleted", {}) - - if current_services: - log(f"\n✓ In services ({len(current_services)}):", file=sys.stderr) - for service_key, service_info in current_services.items(): - sname = service_info.get("name", "unknown") - stype = service_info.get("type_pretty", "unknown") - log(f" - {sname} ({stype})", file=sys.stderr) - - if deleted_services: - log(f"\n✗ Deleted from services ({len(deleted_services)}):", file=sys.stderr) - for service_key, service_info in deleted_services.items(): - sname 
= service_info.get("name", "unknown") - stype = service_info.get("type_pretty", "unknown") - time_deleted = service_info.get("time_deleted", "?") - log(f" - {sname} ({stype}) - deleted at {time_deleted}", file=sys.stderr) - - # URL check - url = file_info.get("url", []) - log(f"\n🔗 url ({len(url)}):", file=sys.stderr) - - corrupted_count = 0 - for i, url in enumerate(url, 1): - if "," in url: - corrupted_count += 1 - log(f" [{i}] ⚠️ CORRUPTED (comma-separated): {url[:50]}...", file=sys.stderr) - else: - log(f" [{i}] {url[:70]}{'...' if len(url) > 70 else ''}", file=sys.stderr) - - if corrupted_count > 0: - log(f"\n⚠️ WARNING: Found {corrupted_count} corrupted URL(s)", file=sys.stderr) - - # Tags - tags_dict = file_info.get("tags", {}) - total_tags = 0 - for service_key, service_data in tags_dict.items(): - service_name = service_data.get("name", "unknown") - display_tags = service_data.get("display_tags", {}).get("0", []) - total_tags += len(display_tags) - - if total_tags > 0: - log(f"\n🏷️ Tags ({total_tags}):", file=sys.stderr) - for service_key, service_data in tags_dict.items(): - display_tags = service_data.get("display_tags", {}).get("0", []) - if display_tags: - service_name = service_data.get("name", "unknown") - log(f" {service_name}:", file=sys.stderr) - for tag in display_tags[:5]: # Show first 5 - log(f" - {tag}", file=sys.stderr) - if len(display_tags) > 5: - log(f" ... and {len(display_tags) - 5} more", file=sys.stderr) - - log("\n", file=sys.stderr) - return 0 - - except Exception as exc: - log(f"Error checking file status: {exc}", file=sys.stderr) - import traceback - traceback.print_exc(file=sys.stderr) - return 1 - - -# Register cmdlet (no legacy decorator) -CMDLET.exec = _run -CMDLET.alias = ["check-status", "file-status", "status"] -CMDLET.register() diff --git a/cmdlet/cleanup.py b/cmdlet/cleanup.py deleted file mode 100644 index f147b2a..0000000 --- a/cmdlet/cleanup.py +++ /dev/null @@ -1,105 +0,0 @@ -"""Cleanup cmdlet for removing temporary artifacts from pipeline. - -This cmdlet processes result lists and removes temporary files (marked with is_temp=True), -then emits the remaining non-temporary results for further pipeline stages. -""" - -from __future__ import annotations - -from typing import Any, Dict, Sequence -from pathlib import Path -import sys -import json - -from SYS.logger import log - -from ._shared import Cmdlet, CmdletArg, get_pipe_object_path, normalize_result_input, filter_results_by_temp, should_show_help -import models -import pipeline as pipeline_context - -def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: - """Remove temporary files from pipeline results. 
- - Accepts: - - Single result object with is_temp field - - List of result objects to clean up - - Process: - - Filters results by is_temp=True - - Deletes those files from disk - - Emits only non-temporary results - - Typical pipeline usage: - download-data url | screen-shot | add-tag -store local "tag" --all | cleanup - """ - - # Help - if should_show_help(args): - log(f"Cmdlet: {CMDLET.name}\nSummary: {CMDLET.summary}\nUsage: {CMDLET.usage}") - return 0 - - # Normalize input to list - results = normalize_result_input(result) - - if not results: - log("[cleanup] No results to process", file=sys.stderr) - return 1 - - # Separate temporary and permanent results - temp_results = pipeline_context.filter_results_by_temp(results, include_temp=True) - perm_results = pipeline_context.filter_results_by_temp(results, include_temp=False) - - # Delete temporary files - deleted_count = 0 - for temp_result in temp_results: - try: - file_path = get_pipe_object_path(temp_result) - - if file_path: - path_obj = Path(file_path) - if path_obj.exists(): - # Delete the file - path_obj.unlink() - log(f"[cleanup] Deleted temporary file: {path_obj.name}", file=sys.stderr) - deleted_count += 1 - - # Clean up any associated sidecar files - for ext in ['.tag', '.metadata']: - sidecar = path_obj.parent / (path_obj.name + ext) - if sidecar.exists(): - try: - sidecar.unlink() - log(f"[cleanup] Deleted sidecar: {sidecar.name}", file=sys.stderr) - except Exception as e: - log(f"[cleanup] Warning: Could not delete sidecar {sidecar.name}: {e}", file=sys.stderr) - else: - log(f"[cleanup] File does not exist: {file_path}", file=sys.stderr) - except Exception as e: - log(f"[cleanup] Error deleting file: {e}", file=sys.stderr) - - # Log summary - log(f"[cleanup] Deleted {deleted_count} temporary file(s), emitting {len(perm_results)} permanent result(s)", file=sys.stderr) - - # Emit permanent results for downstream processing - for perm_result in perm_results: - pipeline_context.emit(perm_result) - - return 0 - - -CMDLET = Cmdlet( - name="cleanup", - summary="Remove temporary artifacts from pipeline (marked with is_temp=True).", - usage="cleanup", - arg=[], - detail=[ - "- Accepts pipeline results that may contain temporary files (screenshots, intermediate artifacts)", - "- Deletes files marked with is_temp=True from disk", - "- Also cleans up associated sidecar files (.tag, .metadata)", - "- Emits only non-temporary results for further processing", - "- Typical usage at end of pipeline: ... | add-tag -store local \"tag\" --all | cleanup", - "- Exit code 0 if cleanup successful, 1 if no results to process", - ], - exec=_run, -).register() - diff --git a/cmdlet/delete_file.py b/cmdlet/delete_file.py index 22ebfd0..b4e6a95 100644 --- a/cmdlet/delete_file.py +++ b/cmdlet/delete_file.py @@ -8,12 +8,12 @@ from pathlib import Path from SYS.logger import debug, log from Store.Folder import Folder from Store import Store -from ._shared import Cmdlet, CmdletArg, normalize_hash, looks_like_hash, get_field, should_show_help +from . 
import _shared as sh from API import HydrusNetwork as hydrus_wrapper import pipeline as ctx -class Delete_File(Cmdlet): +class Delete_File(sh.Cmdlet): """Class-based delete-file cmdlet with self-registration.""" def __init__(self) -> None: @@ -23,10 +23,10 @@ class Delete_File(Cmdlet): usage="delete-file [-hash ] [-conserve ] [-lib-root ] [reason]", alias=["del-file"], arg=[ - CmdletArg("hash", description="Override the Hydrus file hash (SHA256) to target instead of the selected result."), - CmdletArg("conserve", description="Choose which copy to keep: 'local' or 'hydrus'."), - CmdletArg("lib-root", description="Path to local library root for database cleanup."), - CmdletArg("reason", description="Optional reason for deletion (free text)."), + sh.CmdletArg("hash", description="Override the Hydrus file hash (SHA256) to target instead of the selected result."), + sh.CmdletArg("conserve", description="Choose which copy to keep: 'local' or 'hydrus'."), + sh.CmdletArg("lib-root", description="Path to local library root for database cleanup."), + sh.CmdletArg("reason", description="Optional reason for deletion (free text)."), ], detail=[ "Default removes both the local file and Hydrus file.", @@ -45,24 +45,28 @@ class Delete_File(Cmdlet): if isinstance(item, dict): hash_hex_raw = item.get("hash_hex") or item.get("hash") target = item.get("target") or item.get("file_path") or item.get("path") + title_val = item.get("title") or item.get("name") else: - hash_hex_raw = get_field(item, "hash_hex") or get_field(item, "hash") - target = get_field(item, "target") or get_field(item, "file_path") or get_field(item, "path") + hash_hex_raw = sh.get_field(item, "hash_hex") or sh.get_field(item, "hash") + target = sh.get_field(item, "target") or sh.get_field(item, "file_path") or sh.get_field(item, "path") + title_val = sh.get_field(item, "title") or sh.get_field(item, "name") store = None if isinstance(item, dict): store = item.get("store") else: - store = get_field(item, "store") + store = sh.get_field(item, "store") store_lower = str(store).lower() if store else "" is_hydrus_store = bool(store_lower) and ("hydrus" in store_lower or store_lower in {"home", "work"}) + store_label = str(store) if store else "default" + hydrus_prefix = f"[hydrusnetwork:{store_label}]" # For Hydrus files, the target IS the hash if is_hydrus_store and not hash_hex_raw: hash_hex_raw = target - hash_hex = normalize_hash(override_hash) if override_hash else normalize_hash(hash_hex_raw) + hash_hex = sh.normalize_hash(override_hash) if override_hash else sh.normalize_hash(hash_hex_raw) local_deleted = False local_target = isinstance(target, str) and target.strip() and not str(target).lower().startswith(("http://", "https://")) @@ -156,19 +160,28 @@ class Delete_File(Cmdlet): try: client._post("/add_files/delete_files", data=payload) # type: ignore[attr-defined] hydrus_deleted = True - preview = hash_hex[:12] + ('…' if len(hash_hex) > 12 else '') - debug(f"Deleted from Hydrus: {preview}…", file=sys.stderr) + title_str = str(title_val).strip() if title_val else "" + if title_str: + debug(f"{hydrus_prefix} Deleted title:{title_str} hash:{hash_hex}", file=sys.stderr) + else: + debug(f"{hydrus_prefix} Deleted hash:{hash_hex}", file=sys.stderr) except Exception: # If it's not in Hydrus (e.g. 
404 or similar), that's fine if not local_deleted: return False if hydrus_deleted and hash_hex: - preview = hash_hex[:12] + ('…' if len(hash_hex) > 12 else '') + title_str = str(title_val).strip() if title_val else "" if reason: - ctx.emit(f"Deleted {preview} (reason: {reason}).") + if title_str: + ctx.emit(f"{hydrus_prefix} Deleted title:{title_str} hash:{hash_hex} (reason: {reason}).") + else: + ctx.emit(f"{hydrus_prefix} Deleted hash:{hash_hex} (reason: {reason}).") else: - ctx.emit(f"Deleted {preview}.") + if title_str: + ctx.emit(f"{hydrus_prefix} Deleted title:{title_str} hash:{hash_hex}.") + else: + ctx.emit(f"{hydrus_prefix} Deleted hash:{hash_hex}.") if hydrus_deleted or local_deleted: return True @@ -178,7 +191,7 @@ class Delete_File(Cmdlet): def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: """Execute delete-file command.""" - if should_show_help(args): + if sh.should_show_help(args): log(f"Cmdlet: {self.name}\nSummary: {self.summary}\nUsage: {self.usage}") return 0 diff --git a/cmdlet/delete_note.py b/cmdlet/delete_note.py index 1f52688..df81b3c 100644 --- a/cmdlet/delete_note.py +++ b/cmdlet/delete_note.py @@ -7,16 +7,16 @@ import sys from SYS.logger import log import pipeline as ctx -from ._shared import ( - Cmdlet, - CmdletArg, - SharedArgs, - normalize_hash, - parse_cmdlet_args, - normalize_result_input, - get_field, - should_show_help, -) +from . import _shared as sh + +Cmdlet = sh.Cmdlet +CmdletArg = sh.CmdletArg +SharedArgs = sh.SharedArgs +normalize_hash = sh.normalize_hash +parse_cmdlet_args = sh.parse_cmdlet_args +normalize_result_input = sh.normalize_result_input +get_field = sh.get_field +should_show_help = sh.should_show_help from Store import Store from SYS.utils import sha256_file diff --git a/cmdlet/delete_relationship.py b/cmdlet/delete_relationship.py index 80e0f53..cf2ba1e 100644 --- a/cmdlet/delete_relationship.py +++ b/cmdlet/delete_relationship.py @@ -10,7 +10,16 @@ import sys from SYS.logger import log import pipeline as ctx -from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, normalize_hash, normalize_result_input, get_field, should_show_help +from . import _shared as sh + +Cmdlet = sh.Cmdlet +CmdletArg = sh.CmdletArg +SharedArgs = sh.SharedArgs +parse_cmdlet_args = sh.parse_cmdlet_args +normalize_hash = sh.normalize_hash +normalize_result_input = sh.normalize_result_input +get_field = sh.get_field +should_show_help = sh.should_show_help from API.folder import API_folder_store from Store import Store from config import get_local_storage_path diff --git a/cmdlet/delete_tag.py b/cmdlet/delete_tag.py index 19bac7b..2ab2450 100644 --- a/cmdlet/delete_tag.py +++ b/cmdlet/delete_tag.py @@ -7,7 +7,15 @@ import sys import models import pipeline as ctx -from ._shared import Cmdlet, CmdletArg, SharedArgs, normalize_hash, parse_tag_arguments, should_show_help, get_field +from . import _shared as sh + +Cmdlet = sh.Cmdlet +CmdletArg = sh.CmdletArg +SharedArgs = sh.SharedArgs +normalize_hash = sh.normalize_hash +parse_tag_arguments = sh.parse_tag_arguments +should_show_help = sh.should_show_help +get_field = sh.get_field from SYS.logger import debug, log from Store import Store diff --git a/cmdlet/delete_url.py b/cmdlet/delete_url.py index 41c1b22..f66c1c4 100644 --- a/cmdlet/delete_url.py +++ b/cmdlet/delete_url.py @@ -4,7 +4,16 @@ from typing import Any, Dict, Sequence import sys import pipeline as ctx -from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash +from . 
import _shared as sh + +Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash = ( + sh.Cmdlet, + sh.CmdletArg, + sh.SharedArgs, + sh.parse_cmdlet_args, + sh.get_field, + sh.normalize_hash, +) from SYS.logger import log from Store import Store diff --git a/cmdlet/download_file.py b/cmdlet/download_file.py index a2412b1..8953dc5 100644 --- a/cmdlet/download_file.py +++ b/cmdlet/download_file.py @@ -17,15 +17,15 @@ from SYS.download import DownloadError, _download_direct_file from SYS.logger import log, debug import pipeline as pipeline_context -from ._shared import ( - Cmdlet, - CmdletArg, - SharedArgs, - parse_cmdlet_args, - register_url_with_local_library, - coerce_to_pipe_object, - get_field, -) +from . import _shared as sh + +Cmdlet = sh.Cmdlet +CmdletArg = sh.CmdletArg +SharedArgs = sh.SharedArgs +parse_cmdlet_args = sh.parse_cmdlet_args +register_url_with_local_library = sh.register_url_with_local_library +coerce_to_pipe_object = sh.coerce_to_pipe_object +get_field = sh.get_field class Download_File(Cmdlet): @@ -251,6 +251,13 @@ class Download_File(Cmdlet): # Fallback: if we have a direct HTTP URL, download it directly if downloaded_path is None and isinstance(target, str) and target.startswith("http"): + # Guard: provider landing pages (e.g. LibGen ads.php) are HTML, not files. + # Never download these as "files". + if str(table or "").lower() == "libgen": + low = target.lower() + if ("/ads.php" in low) or ("/file.php" in low) or ("/index.php" in low): + log("[download-file] Refusing to download LibGen landing page (expected provider to resolve file link)", file=sys.stderr) + continue debug(f"[download-file] Provider item looks like direct URL, downloading: {target}") result_obj = _download_direct_file(target, final_output_dir, quiet=quiet_mode) file_path = None diff --git a/cmdlet/download_media.py b/cmdlet/download_media.py index 0b7b1df..2e0fa86 100644 --- a/cmdlet/download_media.py +++ b/cmdlet/download_media.py @@ -38,7 +38,18 @@ from models import DownloadError, DownloadOptions, DownloadMediaResult, DebugLog import pipeline as pipeline_context from result_table import ResultTable -from ._shared import Cmdlet, CmdletArg, SharedArgs, create_pipe_object_result, parse_cmdlet_args, register_url_with_local_library, coerce_to_pipe_object +from tool.ytdlp import YtDlpTool + +from . import _shared as sh + +Cmdlet = sh.Cmdlet +CmdletArg = sh.CmdletArg +SharedArgs = sh.SharedArgs +create_pipe_object_result = sh.create_pipe_object_result +parse_cmdlet_args = sh.parse_cmdlet_args +register_url_with_local_library = sh.register_url_with_local_library +coerce_to_pipe_object = sh.coerce_to_pipe_object +get_field = sh.get_field # Minimal inlined helpers from helper/download.py (is_url_supported_by_ytdlp, list_formats) @@ -62,6 +73,136 @@ _EXTRACTOR_CACHE: List[Any] | None = None _YTDLP_PROGRESS_BAR = ProgressBar() +_SUBTITLE_EXTS = (".vtt", ".srt", ".ass", ".ssa", ".lrc") + + +def _format_chapters_note(info: Dict[str, Any]) -> Optional[str]: + """Format yt-dlp chapter metadata into a stable, note-friendly text. 
+ + Output is one chapter per line, e.g.: + 00:00 Intro + 01:23-02:10 Topic name + """ + try: + chapters = info.get("chapters") + except Exception: + chapters = None + + if not isinstance(chapters, list) or not chapters: + return None + + rows: List[tuple[int, Optional[int], str]] = [] + max_t = 0 + for ch in chapters: + if not isinstance(ch, dict): + continue + start_raw = ch.get("start_time") + end_raw = ch.get("end_time") + title_raw = ch.get("title") or ch.get("name") or ch.get("chapter") + + try: + start_s = int(float(start_raw)) + except Exception: + continue + + end_s: Optional[int] = None + try: + if end_raw is not None: + end_s = int(float(end_raw)) + except Exception: + end_s = None + + title = str(title_raw).strip() if title_raw is not None else "" + rows.append((start_s, end_s, title)) + try: + max_t = max(max_t, start_s, end_s or 0) + except Exception: + max_t = max(max_t, start_s) + + if not rows: + return None + + force_hours = bool(max_t >= 3600) + + def _tc(seconds: int) -> str: + total = max(0, int(seconds)) + minutes, secs = divmod(total, 60) + hours, minutes = divmod(minutes, 60) + if force_hours: + return f"{hours:02d}:{minutes:02d}:{secs:02d}" + return f"{minutes:02d}:{secs:02d}" + + lines: List[str] = [] + for start_s, end_s, title in sorted(rows, key=lambda r: (r[0], r[1] if r[1] is not None else 10**9, r[2])): + if end_s is not None and end_s > start_s: + prefix = f"{_tc(start_s)}-{_tc(end_s)}" + else: + prefix = _tc(start_s) + line = f"{prefix} {title}".strip() + if line: + lines.append(line) + + text = "\n".join(lines).strip() + return text or None + + +def _best_subtitle_sidecar(media_path: Path) -> Optional[Path]: + """Find the most likely subtitle sidecar file for a downloaded media file.""" + try: + base_dir = media_path.parent + stem = media_path.stem + if not stem: + return None + + candidates: List[Path] = [] + for p in base_dir.glob(stem + ".*"): + try: + if not p.is_file(): + continue + except Exception: + continue + if p.suffix.lower() in _SUBTITLE_EXTS: + candidates.append(p) + + if not candidates: + return None + + def _rank(path: Path) -> tuple[int, int, float, str]: + name = path.name.lower() + lang_rank = 0 if ".en." 
in name or name.endswith(".en" + path.suffix.lower()) else 1 + ext = path.suffix.lower() + ext_rank_map = {".vtt": 0, ".srt": 1, ".ass": 2, ".ssa": 3, ".lrc": 4} + ext_rank = ext_rank_map.get(ext, 9) + try: + mtime = float(path.stat().st_mtime) + except Exception: + mtime = 0.0 + return (lang_rank, ext_rank, -mtime, name) + + candidates.sort(key=_rank) + return candidates[0] + except Exception: + return None + + +def _read_text_file(path: Path, *, max_bytes: int = 1_500_000) -> Optional[str]: + try: + data = path.read_bytes() + except Exception: + return None + if not data: + return None + if len(data) > max_bytes: + data = data[:max_bytes] + try: + return data.decode("utf-8", errors="replace") + except Exception: + try: + return data.decode(errors="replace") + except Exception: + return None + + def _ensure_yt_dlp_ready() -> None: if yt_dlp is not None: return @@ -100,16 +241,26 @@ def list_formats(url: str, no_playlist: bool = False, playlist_items: Optional[s ydl_opts["noplaylist"] = True if playlist_items: ydl_opts["playlist_items"] = playlist_items + + debug(f"Fetching format list for: {url}") with yt_dlp.YoutubeDL(ydl_opts) as ydl: # type: ignore[arg-type] - debug(f"Fetching format list for: {url}") info = ydl.extract_info(url, download=False) - formats = info.get("formats", []) - if not formats: - log("No formats available", file=sys.stderr) - return None - result_formats = [] - for fmt in formats: - result_formats.append({ + + if not isinstance(info, dict): + log("No formats available", file=sys.stderr) + return None + + formats = info.get("formats") or [] + if not isinstance(formats, list) or not formats: + log("No formats available", file=sys.stderr) + return None + + result_formats: List[Dict[str, Any]] = [] + for fmt in formats: + if not isinstance(fmt, dict): + continue + result_formats.append( + { "format_id": fmt.get("format_id", ""), "format": fmt.get("format", ""), "ext": fmt.get("ext", ""), @@ -122,9 +273,11 @@ def list_formats(url: str, no_playlist: bool = False, playlist_items: Optional[s "filesize": fmt.get("filesize"), "abr": fmt.get("abr"), "tbr": fmt.get("tbr"), - }) - debug(f"Found {len(result_formats)} available formats") - return result_formats + } + ) + + debug(f"Found {len(result_formats)} available formats") + return result_formats or None except Exception as e: log(f"✗ Error fetching formats: {e}", file=sys.stderr) return None @@ -215,6 +368,31 @@ def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sect cmd = ["yt-dlp"] if ytdl_options.get("format"): cmd.extend(["-f", ytdl_options["format"]]) + if ytdl_options.get("merge_output_format"): + cmd.extend(["--merge-output-format", str(ytdl_options["merge_output_format"])]) + + # For CLI downloads, infer chapter/metadata embedding from either legacy flags + # or explicit FFmpegMetadata postprocessor entries. 
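+ # (Mapping used below: addmetadata/FFmpegMetadata -> --add-metadata,
+ # embedchapters/add_chapters -> --embed-chapters,
+ # writesubtitles -> --write-sub --write-auto-sub --sub-format vtt.)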
+ postprocessors = ytdl_options.get("postprocessors") + want_add_metadata = bool(ytdl_options.get("addmetadata")) + want_embed_chapters = bool(ytdl_options.get("embedchapters")) + if isinstance(postprocessors, list): + for pp in postprocessors: + if not isinstance(pp, dict): + continue + if str(pp.get("key") or "") == "FFmpegMetadata": + want_add_metadata = True + if bool(pp.get("add_chapters", True)): + want_embed_chapters = True + + if want_add_metadata: + cmd.append("--add-metadata") + if want_embed_chapters: + cmd.append("--embed-chapters") + if ytdl_options.get("writesubtitles"): + cmd.append("--write-sub") + cmd.append("--write-auto-sub") + cmd.extend(["--sub-format", "vtt"]) if ytdl_options.get("force_keyframes_at_cuts"): cmd.extend(["--force-keyframes-at-cuts"]) if ytdl_options.get("force_keyframes_at_cuts") else None cmd.extend(["-o", section_outtmpl]) @@ -258,11 +436,6 @@ def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]: if opts.cookies_path and opts.cookies_path.is_file(): base_options["cookiefile"] = str(opts.cookies_path) - else: - from hydrus_health_check import get_cookies_file_path # local import - global_cookies = get_cookies_file_path() - if global_cookies: - base_options["cookiefile"] = global_cookies if opts.no_playlist: base_options["noplaylist"] = True @@ -274,6 +447,37 @@ def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]: base_options["format"] = opts.ytdl_format or "bestvideo+bestaudio/best" base_options["format_sort"] = ["res:4320", "res:2880", "res:2160", "res:1440", "res:1080", "res:720", "res"] + # Optional yt-dlp features + if getattr(opts, "embed_chapters", False): + # Prefer explicit FFmpegMetadata PP so chapter embedding runs even when + # we already specified other postprocessors (e.g. FFmpegExtractAudio). + pps = base_options.get("postprocessors") + if not isinstance(pps, list): + pps = [] + already_has_metadata = any( + isinstance(pp, dict) and str(pp.get("key") or "") == "FFmpegMetadata" for pp in pps + ) + if not already_has_metadata: + pps.append( + { + "key": "FFmpegMetadata", + "add_metadata": True, + "add_chapters": True, + "add_infojson": "if_exists", + } + ) + base_options["postprocessors"] = pps + + # Chapter embedding is most reliable in mkv/mp4 containers. + # When merging separate video+audio streams, prefer mkv so mpv sees chapters. 
+ if opts.mode != "audio": + base_options.setdefault("merge_output_format", "mkv") + + if getattr(opts, "write_sub", False): + base_options["writesubtitles"] = True + base_options["writeautomaticsub"] = True + base_options["subtitlesformat"] = "vtt" + if opts.clip_sections: sections: List[str] = [] @@ -410,13 +614,27 @@ def _get_libgen_download_url(libgen_url: str) -> Optional[str]: response = session.get(libgen_url, timeout=10, allow_redirects=True) final_url = response.url try: - from bs4 import BeautifulSoup - soup = BeautifulSoup(response.content, 'html.parser') - for link in soup.find_all('a'): - href = link.get('href') - if href and 'get.php' in href: - return urljoin(libgen_url, href) - except ImportError: + try: + from lxml import html as lxml_html + except ImportError: + lxml_html = None + + if lxml_html is not None: + doc = lxml_html.fromstring(response.content) + for a in doc.xpath("//a[@href]"): + href = str(a.get("href") or "").strip() + if href and "get.php" in href.lower(): + return urljoin(final_url, href) + else: + for m in re.finditer( + r"href=[\"\']([^\"\']+)[\"\']", + response.text or "", + flags=re.IGNORECASE, + ): + href = str(m.group(1) or "").strip() + if href and "get.php" in href.lower(): + return urljoin(final_url, href) + except Exception: pass if final_url != libgen_url: debug(f"LibGen resolved to mirror: {final_url}") @@ -648,7 +866,7 @@ def _download_direct_file( raise DownloadError(f"Error downloading file: {exc}") from exc -def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) -> Optional[Dict[str, Any]]: +def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15, *, cookiefile: Optional[str] = None) -> Optional[Dict[str, Any]]: """Probe URL to extract metadata WITHOUT downloading. 
Args: @@ -686,12 +904,8 @@ def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) -> "noprogress": True, # No progress bars } - # Add cookies if available (lazy import to avoid circular dependency) - from hydrus_health_check import get_cookies_file_path # local import - - global_cookies = get_cookies_file_path() - if global_cookies: - ydl_opts["cookiefile"] = global_cookies + if cookiefile: + ydl_opts["cookiefile"] = str(cookiefile) # Add no_playlist option if specified if no_playlist: @@ -807,7 +1021,14 @@ def download_media( debug(f"Skipping probe for playlist (item selection: {opts.playlist_items}), proceeding with download") probe_result = {"url": opts.url} # Minimal probe result else: - probe_result = probe_url(opts.url, no_playlist=opts.no_playlist, timeout_seconds=15) + probe_cookiefile = None + try: + if opts.cookies_path and opts.cookies_path.is_file(): + probe_cookiefile = str(opts.cookies_path) + except Exception: + probe_cookiefile = None + + probe_result = probe_url(opts.url, no_playlist=opts.no_playlist, timeout_seconds=15, cookiefile=probe_cookiefile) if probe_result is None: if not opts.quiet: @@ -1182,6 +1403,8 @@ class Download_Media(Cmdlet): try: debug("Starting download-media") + ytdlp_tool = YtDlpTool(config) + # Parse arguments parsed = parse_cmdlet_args(args, self) @@ -1192,7 +1415,6 @@ class Download_Media(Cmdlet): # If no url provided via args, try to extract from piped result if not raw_url and result: - from ._shared import get_field # Handle single result or list of results results_to_check = result if isinstance(result, list) else [result] for item in results_to_check: @@ -1226,6 +1448,10 @@ class Download_Media(Cmdlet): # Get other options clip_spec = parsed.get("clip") + # Always enable chapters + subtitles so downstream pipes (e.g. mpv) can consume them. + embed_chapters = True + write_sub = True + mode = "audio" if parsed.get("audio") else "video" # Parse clip range(s) if specified @@ -1379,7 +1605,14 @@ class Download_Media(Cmdlet): if playlist_items: return str(requested_url) try: - pr = probe_url(requested_url, no_playlist=False, timeout_seconds=15) + cf = None + try: + cookie_path = ytdlp_tool.resolve_cookiefile() + if cookie_path is not None and cookie_path.is_file(): + cf = str(cookie_path) + except Exception: + cf = None + pr = probe_url(requested_url, no_playlist=False, timeout_seconds=15, cookiefile=cf) if isinstance(pr, dict): for key in ("webpage_url", "original_url", "url", "requested_url"): value = pr.get(key) @@ -1458,7 +1691,14 @@ class Download_Media(Cmdlet): - selected_urls: Optional[List[str]] (expanded per-entry urls when available) """ try: - pr = probe_url(url, no_playlist=False, timeout_seconds=15) + cf = None + try: + cookie_path = ytdlp_tool.resolve_cookiefile() + if cookie_path is not None and cookie_path.is_file(): + cf = str(cookie_path) + except Exception: + cf = None + pr = probe_url(url, no_playlist=False, timeout_seconds=15, cookiefile=cf) except Exception: pr = None if not isinstance(pr, dict): @@ -1685,6 +1925,15 @@ class Download_Media(Cmdlet): acodec = fmt.get("acodec", "none") filesize = fmt.get("filesize") format_id = fmt.get("format_id", "") + + # If the chosen format is video-only (no audio stream), automatically + # request best audio too so the resulting file has sound. 
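+ # e.g. a video-only format id "303" is offered for selection as "303+ba".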
+ selection_format_id = format_id + try: + if vcodec != "none" and acodec == "none" and format_id: + selection_format_id = f"{format_id}+ba" + except Exception: + selection_format_id = format_id # Format size size_str = "" @@ -1729,9 +1978,9 @@ class Download_Media(Cmdlet): "full_metadata": { "format_id": format_id, "url": url, - "item_selector": format_id, + "item_selector": selection_format_id, }, - "_selection_args": ["-format", format_id] + "_selection_args": ["-format", selection_format_id] } # Add to results list and table (don't emit - formats should wait for @N selection) @@ -1778,23 +2027,57 @@ class Download_Media(Cmdlet): actual_format = playlist_items actual_playlist_items = None - # Auto-pick best audio format when -audio is used and no explicit format is given. + # For -audio, default to yt-dlp's built-in bestaudio selector. + # This should *not* require interactive format picking. if mode == "audio" and not actual_format: - chosen = None - formats = list_formats(url, no_playlist=False, playlist_items=actual_playlist_items) - if formats: - chosen = _pick_best_audio_format_id(formats) - actual_format = chosen or "bestaudio/best" + actual_format = "bestaudio" + + # If no explicit format is provided for video mode, allow a config override. + if mode == "video" and not actual_format: + configured = (ytdlp_tool.default_format("video") or "").strip() + if configured and configured != "bestvideo+bestaudio/best": + actual_format = configured + + # If a single format id was chosen and it is video-only, auto-merge best audio. + if ( + actual_format + and isinstance(actual_format, str) + and mode != "audio" + and "+" not in actual_format + and "/" not in actual_format + and "[" not in actual_format + and actual_format not in {"best", "bv", "ba", "b"} + ): + try: + formats = list_formats(url, no_playlist=False, playlist_items=actual_playlist_items) + if formats: + fmt_match = next( + (f for f in formats if str(f.get("format_id", "")) == actual_format), + None, + ) + if fmt_match: + vcodec = str(fmt_match.get("vcodec", "none")) + acodec = str(fmt_match.get("acodec", "none")) + if vcodec != "none" and acodec == "none": + debug( + f"Selected video-only format {actual_format}; using {actual_format}+ba for audio" + ) + actual_format = f"{actual_format}+ba" + except Exception: + pass opts = DownloadOptions( url=url, mode=mode, output_dir=final_output_dir, ytdl_format=actual_format, + cookies_path=ytdlp_tool.resolve_cookiefile(), clip_sections=clip_sections_spec, playlist_items=actual_playlist_items, quiet=quiet_mode, no_playlist=False, + embed_chapters=embed_chapters, + write_sub=write_sub, ) # Use timeout wrapper to prevent hanging @@ -1838,7 +2121,40 @@ class Download_Media(Cmdlet): # Build PipeObjects first so we can attach cross-clip relationships. pipe_objects: List[Dict[str, Any]] = [] for downloaded in results_to_emit: - pipe_objects.append(self._build_pipe_object(downloaded, url, opts)) + po = self._build_pipe_object(downloaded, url, opts) + + # Attach chapter timestamps for downstream consumers (e.g., mpv scripts) + # even if container embedding fails. 
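+ # notes["chapters"] holds one "MM:SS[-MM:SS] Title" line per chapter
+ # (timestamps switch to HH:MM:SS when any chapter time reaches one hour).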
+ try: + info = downloaded.info if isinstance(getattr(downloaded, "info", None), dict) else {} + except Exception: + info = {} + chapters_text = _format_chapters_note(info) if embed_chapters else None + if chapters_text: + notes = po.get("notes") + if not isinstance(notes, dict): + notes = {} + notes.setdefault("chapters", chapters_text) + po["notes"] = notes + + if write_sub: + try: + media_path = Path(str(po.get("path") or "")) + except Exception: + media_path = None + + if media_path is not None and media_path.exists() and media_path.is_file(): + sub_path = _best_subtitle_sidecar(media_path) + if sub_path is not None: + sub_text = _read_text_file(sub_path) + if sub_text: + notes = po.get("notes") + if not isinstance(notes, dict): + notes = {} + notes["sub"] = sub_text + po["notes"] = notes + + pipe_objects.append(po) # If this is a clip download, decorate titles/tags so the title: tag is clip-based. # Relationship tags are only added when multiple clips exist. @@ -1868,6 +2184,95 @@ class Download_Media(Cmdlet): debug("✓ Downloaded and emitted") except DownloadError as e: + # Special-case yt-dlp format errors: show a selectable format list table so + # the user can pick a working format_id and continue the pipeline via @N. + cause = getattr(e, "__cause__", None) + detail = "" + try: + detail = str(cause or "") + except Exception: + detail = "" + + if "requested format is not available" in (detail or "").lower() and mode != "audio": + formats = list_formats(url, no_playlist=False, playlist_items=actual_playlist_items) + if formats: + formats_to_show = formats + + table = ResultTable() + table.title = f"Available formats for {url}" + table.set_source_command("download-media", [str(a) for a in (args or [])]) + + results_list: List[Dict[str, Any]] = [] + for idx, fmt in enumerate(formats_to_show, 1): + resolution = fmt.get("resolution", "") + ext = fmt.get("ext", "") + vcodec = fmt.get("vcodec", "none") + acodec = fmt.get("acodec", "none") + filesize = fmt.get("filesize") + format_id = fmt.get("format_id", "") + + selection_format_id = format_id + try: + if vcodec != "none" and acodec == "none" and format_id: + selection_format_id = f"{format_id}+ba" + except Exception: + selection_format_id = format_id + + size_str = "" + if filesize: + try: + size_mb = float(filesize) / (1024 * 1024) + size_str = f"{size_mb:.1f}MB" + except Exception: + size_str = "" + + desc_parts: List[str] = [] + if resolution and resolution != "audio only": + desc_parts.append(str(resolution)) + if ext: + desc_parts.append(str(ext).upper()) + if vcodec != "none": + desc_parts.append(f"v:{vcodec}") + if acodec != "none": + desc_parts.append(f"a:{acodec}") + if size_str: + desc_parts.append(size_str) + format_desc = " | ".join(desc_parts) + + format_dict: Dict[str, Any] = { + "table": "download-media", + "title": f"Format {format_id}", + "url": url, + "target": url, + "detail": format_desc, + "media_kind": "format", + "columns": [ + ("#", str(idx)), + ("ID", format_id), + ("Resolution", resolution or "N/A"), + ("Ext", ext), + ("Video", vcodec), + ("Audio", acodec), + ("Size", size_str or "N/A"), + ], + "full_metadata": { + "format_id": format_id, + "url": url, + "item_selector": selection_format_id, + }, + "_selection_args": ["-format", selection_format_id], + } + + results_list.append(format_dict) + table.add_result(format_dict) + + pipeline_context.set_current_stage_table(table) + pipeline_context.set_last_result_table(table, results_list) + + # Returning 0 with no emits lets the CLI pause the pipeline for @N 
selection. + log("Requested format is not available; select a working format with @N", file=sys.stderr) + return 0 + log(f"Download failed for {url}: {e}", file=sys.stderr) except Exception as e: log(f"Error processing {url}: {e}", file=sys.stderr) diff --git a/cmdlet/download_torrent.py b/cmdlet/download_torrent.py index 7e5e41c..bfdf051 100644 --- a/cmdlet/download_torrent.py +++ b/cmdlet/download_torrent.py @@ -15,9 +15,9 @@ from pathlib import Path from typing import Any, Dict, Optional, Sequence from SYS.logger import log -from ._shared import Cmdlet, CmdletArg, parse_cmdlet_args +from . import _shared as sh -class Download_Torrent(Cmdlet): +class Download_Torrent(sh.Cmdlet): """Class-based download-torrent cmdlet with self-registration.""" def __init__(self) -> None: @@ -27,10 +27,10 @@ class Download_Torrent(Cmdlet): usage="download-torrent [options]", alias=["torrent", "magnet"], arg=[ - CmdletArg(name="magnet", type="string", required=False, description="Magnet link or .torrent file/URL", variadic=True), - CmdletArg(name="output", type="string", description="Output directory for downloaded files"), - CmdletArg(name="wait", type="float", description="Wait time (seconds) for magnet processing timeout"), - CmdletArg(name="background", type="flag", alias="bg", description="Start download in background"), + sh.CmdletArg(name="magnet", type="string", required=False, description="Magnet link or .torrent file/URL", variadic=True), + sh.CmdletArg(name="output", type="string", description="Output directory for downloaded files"), + sh.CmdletArg(name="wait", type="float", description="Wait time (seconds) for magnet processing timeout"), + sh.CmdletArg(name="background", type="flag", alias="bg", description="Start download in background"), ], detail=["Download torrents/magnets via AllDebrid API."], exec=self.run, @@ -38,7 +38,7 @@ class Download_Torrent(Cmdlet): self.register() def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: - parsed = parse_cmdlet_args(args, self) + parsed = sh.parse_cmdlet_args(args, self) magnet_args = parsed.get("magnet", []) output_dir = Path(parsed.get("output") or Path.home() / "Downloads") wait_timeout = int(float(parsed.get("wait", 600))) diff --git a/cmdlet/get_file.py b/cmdlet/get_file.py index 3624742..ddfcce2 100644 --- a/cmdlet/get_file.py +++ b/cmdlet/get_file.py @@ -9,13 +9,13 @@ import subprocess import webbrowser import pipeline as ctx -from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash +from . 
import _shared as sh from SYS.logger import log, debug from Store import Store from config import resolve_output_dir -class Get_File(Cmdlet): +class Get_File(sh.Cmdlet): """Export files to local path via hash+store.""" def __init__(self) -> None: @@ -25,10 +25,10 @@ class Get_File(Cmdlet): summary="Export file to local path", usage="@1 | get-file -path C:\\Downloads", arg=[ - SharedArgs.HASH, - SharedArgs.STORE, - SharedArgs.PATH, - CmdletArg("name", description="Output filename (default: from metadata title)"), + sh.SharedArgs.HASH, + sh.SharedArgs.STORE, + sh.SharedArgs.PATH, + sh.CmdletArg("name", description="Output filename (default: from metadata title)"), ], detail=[ "- Exports file from storage backend to local path", @@ -42,12 +42,12 @@ class Get_File(Cmdlet): def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: """Export file via hash+store backend.""" debug(f"[get-file] run() called with result type: {type(result)}") - parsed = parse_cmdlet_args(args, self) + parsed = sh.parse_cmdlet_args(args, self) debug(f"[get-file] parsed args: {parsed}") # Extract hash and store from result or args - file_hash = parsed.get("hash") or get_field(result, "hash") - store_name = parsed.get("store") or get_field(result, "store") + file_hash = parsed.get("hash") or sh.get_field(result, "hash") + store_name = parsed.get("store") or sh.get_field(result, "store") output_path = parsed.get("path") output_name = parsed.get("name") @@ -62,7 +62,7 @@ class Get_File(Cmdlet): return 1 # Normalize hash - file_hash = normalize_hash(file_hash) + file_hash = sh.normalize_hash(file_hash) if not file_hash: log("Error: Invalid hash format") return 1 @@ -84,9 +84,9 @@ class Get_File(Cmdlet): def resolve_display_title() -> str: candidates = [ - get_field(result, "title"), - get_field(result, "name"), - get_field(result, "filename"), + sh.get_field(result, "title"), + sh.get_field(result, "name"), + sh.get_field(result, "filename"), (metadata.get("title") if isinstance(metadata, dict) else None), (metadata.get("name") if isinstance(metadata, dict) else None), (metadata.get("filename") if isinstance(metadata, dict) else None), diff --git a/cmdlet/get_metadata.py b/cmdlet/get_metadata.py index 0681c1e..617e6cb 100644 --- a/cmdlet/get_metadata.py +++ b/cmdlet/get_metadata.py @@ -7,7 +7,13 @@ import sys from SYS.logger import log from pathlib import Path -from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field +from . 
import _shared as sh + +Cmdlet = sh.Cmdlet +CmdletArg = sh.CmdletArg +SharedArgs = sh.SharedArgs +parse_cmdlet_args = sh.parse_cmdlet_args +get_field = sh.get_field import pipeline as ctx from result_table import ResultTable @@ -74,9 +80,15 @@ class Get_Metadata(Cmdlet): hash_value: Optional[str], pages: Optional[int] = None) -> Dict[str, Any]: """Build a table row dict with metadata fields.""" size_mb = None - if isinstance(size_bytes, int): + size_int: Optional[int] = None + if size_bytes is not None: try: - size_mb = int(size_bytes / (1024 * 1024)) + size_int = int(size_bytes) + except Exception: + size_int = None + if isinstance(size_int, int): + try: + size_mb = int(size_int / (1024 * 1024)) except Exception: size_mb = None @@ -105,7 +117,7 @@ class Get_Metadata(Cmdlet): "path": path, "store": store, "mime": mime, - "size_bytes": size_bytes, + "size_bytes": size_int, "duration_seconds": dur_int, "pages": pages_int, "imported_ts": imported_ts, @@ -237,8 +249,8 @@ class Get_Metadata(Cmdlet): pages=pages, ) - table_title = title - table = ResultTable(table_title).init_command("get-metadata", list(args)) + table_title = f"get-metadata: {title}" if title else "get-metadata" + table = ResultTable(table_title).init_command(table_title, "get-metadata", list(args)) self._add_table_body_row(table, row) ctx.set_last_result_table_overlay(table, [row], row) ctx.emit(row) diff --git a/cmdlet/get_note.py b/cmdlet/get_note.py index 2f9935b..4d85164 100644 --- a/cmdlet/get_note.py +++ b/cmdlet/get_note.py @@ -7,15 +7,15 @@ import sys from SYS.logger import log import pipeline as ctx -from ._shared import ( - Cmdlet, - CmdletArg, - SharedArgs, - normalize_hash, - parse_cmdlet_args, - normalize_result_input, - should_show_help, -) +from . import _shared as sh + +Cmdlet = sh.Cmdlet +CmdletArg = sh.CmdletArg +SharedArgs = sh.SharedArgs +normalize_hash = sh.normalize_hash +parse_cmdlet_args = sh.parse_cmdlet_args +normalize_result_input = sh.normalize_result_input +should_show_help = sh.should_show_help from Store import Store from SYS.utils import sha256_file diff --git a/cmdlet/get_relationship.py b/cmdlet/get_relationship.py index 1855eb5..ec097c8 100644 --- a/cmdlet/get_relationship.py +++ b/cmdlet/get_relationship.py @@ -10,7 +10,17 @@ from SYS.logger import log import models import pipeline as ctx from API import HydrusNetwork as hydrus_wrapper -from ._shared import Cmdlet, CmdletArg, SharedArgs, normalize_hash, fmt_bytes, get_hash_for_operation, fetch_hydrus_metadata, should_show_help, get_field +from . import _shared as sh + +Cmdlet = sh.Cmdlet +CmdletArg = sh.CmdletArg +SharedArgs = sh.SharedArgs +normalize_hash = sh.normalize_hash +fmt_bytes = sh.fmt_bytes +get_hash_for_operation = sh.get_hash_for_operation +fetch_hydrus_metadata = sh.fetch_hydrus_metadata +should_show_help = sh.should_show_help +get_field = sh.get_field from API.folder import API_folder_store from config import get_local_storage_path from result_table import ResultTable @@ -224,13 +234,14 @@ def _run(result: Any, _args: Sequence[str], config: Dict[str, Any]) -> int: try: client = None store_label = "hydrus" + backend_obj = None if store_name: # Store specified: do not fall back to a global/default Hydrus client. 
store_label = str(store_name) try: store = Store(config) - backend = store[str(store_name)] - candidate = getattr(backend, "_client", None) + backend_obj = store[str(store_name)] + candidate = getattr(backend_obj, "_client", None) if candidate is not None and hasattr(candidate, "get_file_relationships"): client = candidate except Exception: @@ -241,6 +252,74 @@ def _run(result: Any, _args: Sequence[str], config: Dict[str, Any]) -> int: else: client = hydrus_wrapper.get_client(config) + def _resolve_related_title(rel_hash: str) -> str: + """Best-effort resolve a Hydrus hash to a human title. + + Preference order: + - title: tag from the backend (fast path) + - Hydrus metadata tags via fetch_hydrus_metadata + - fallback to short hash + """ + h = normalize_hash(rel_hash) + if not h: + return str(rel_hash) + + # Prefer backend tag extraction when available. + if backend_obj is not None and hasattr(backend_obj, "get_tag"): + try: + tag_result = backend_obj.get_tag(h) + tags = tag_result[0] if isinstance(tag_result, tuple) and tag_result else tag_result + if isinstance(tags, list): + for t in tags: + if isinstance(t, str) and t.lower().startswith("title:"): + val = t.split(":", 1)[1].strip() + if val: + return val + except Exception: + pass + + # Fallback: fetch minimal metadata and scan for a title tag. + try: + meta, _ = fetch_hydrus_metadata( + config, + h, + store_name=store_label if store_name else None, + hydrus_client=client, + include_service_keys_to_tags=True, + include_file_url=False, + include_duration=False, + include_size=False, + include_mime=False, + ) + if isinstance(meta, dict): + tags_payload = meta.get("tags") + tag_candidates: list[str] = [] + if isinstance(tags_payload, dict): + for svc_data in tags_payload.values(): + if not isinstance(svc_data, dict): + continue + storage = svc_data.get("storage_tags") + if isinstance(storage, dict): + for group in storage.values(): + if isinstance(group, list): + tag_candidates.extend([str(x) for x in group if isinstance(x, str)]) + display = svc_data.get("display_tags") + if isinstance(display, list): + tag_candidates.extend([str(x) for x in display if isinstance(x, str)]) + flat = meta.get("tags_flat") + if isinstance(flat, list): + tag_candidates.extend([str(x) for x in flat if isinstance(x, str)]) + + for t in tag_candidates: + if isinstance(t, str) and t.lower().startswith("title:"): + val = t.split(":", 1)[1].strip() + if val: + return val + except Exception: + pass + + return h[:16] + "..." 
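+ # e.g. _resolve_related_title(king_hash) -> "Some Title" when a title: tag is
+ # found, otherwise the first 16 hex characters of the hash plus "...".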
+ if client: rel = client.get_file_relationships(hash_hex) if rel: @@ -274,7 +353,7 @@ def _run(result: Any, _args: Sequence[str], config: Dict[str, Any]) -> int: found_relationships.append({ "hash": king_hash, "type": "king", - "title": king_hash, + "title": _resolve_related_title(king_hash), "path": None, "store": store_label, }) @@ -292,7 +371,7 @@ def _run(result: Any, _args: Sequence[str], config: Dict[str, Any]) -> int: found_relationships.append({ "hash": rel_hash_norm, "type": rel_name, - "title": rel_hash_norm, # Can't resolve title easily without another API call + "title": _resolve_related_title(rel_hash_norm), "path": None, "store": store_label, }) @@ -304,7 +383,7 @@ def _run(result: Any, _args: Sequence[str], config: Dict[str, Any]) -> int: found_relationships.append({ "hash": rel_hash_norm, "type": rel_name, - "title": rel_hash_norm, + "title": _resolve_related_title(rel_hash_norm), "path": None, "store": store_label, }) diff --git a/cmdlet/get_tag.py b/cmdlet/get_tag.py index c3cd013..467bc52 100644 --- a/cmdlet/get_tag.py +++ b/cmdlet/get_tag.py @@ -27,7 +27,15 @@ from typing import Any, Dict, List, Optional, Sequence, Tuple import pipeline as ctx from API import HydrusNetwork from API.folder import read_sidecar, write_sidecar, find_sidecar, API_folder_store -from ._shared import normalize_hash, looks_like_hash, Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field +from . import _shared as sh + +normalize_hash = sh.normalize_hash +looks_like_hash = sh.looks_like_hash +Cmdlet = sh.Cmdlet +CmdletArg = sh.CmdletArg +SharedArgs = sh.SharedArgs +parse_cmdlet_args = sh.parse_cmdlet_args +get_field = sh.get_field from config import get_local_storage_path diff --git a/cmdlet/get_url.py b/cmdlet/get_url.py index d54e12a..c3fd857 100644 --- a/cmdlet/get_url.py +++ b/cmdlet/get_url.py @@ -5,7 +5,15 @@ from typing import Any, Dict, List, Sequence import sys import pipeline as ctx -from ._shared import Cmdlet, SharedArgs, parse_cmdlet_args, get_field, normalize_hash +from . import _shared as sh + +Cmdlet, SharedArgs, parse_cmdlet_args, get_field, normalize_hash = ( + sh.Cmdlet, + sh.SharedArgs, + sh.parse_cmdlet_args, + sh.get_field, + sh.normalize_hash, +) from SYS.logger import log from Store import Store diff --git a/cmdlet/merge_file.py b/cmdlet/merge_file.py index fef247f..b3894e6 100644 --- a/cmdlet/merge_file.py +++ b/cmdlet/merge_file.py @@ -12,17 +12,17 @@ import re as _re from config import resolve_output_dir -from ._shared import ( - Cmdlet, - CmdletArg, - create_pipe_object_result, - get_field, - get_pipe_object_hash, - get_pipe_object_path, - normalize_result_input, - parse_cmdlet_args, - should_show_help, -) +from . 
import _shared as sh + +Cmdlet = sh.Cmdlet +CmdletArg = sh.CmdletArg +create_pipe_object_result = sh.create_pipe_object_result +get_field = sh.get_field +get_pipe_object_hash = sh.get_pipe_object_hash +get_pipe_object_path = sh.get_pipe_object_path +normalize_result_input = sh.normalize_result_input +parse_cmdlet_args = sh.parse_cmdlet_args +should_show_help = sh.should_show_help import pipeline as ctx diff --git a/cmdlet/screen_shot.py b/cmdlet/screen_shot.py index 69aaf99..8b0fa7d 100644 --- a/cmdlet/screen_shot.py +++ b/cmdlet/screen_shot.py @@ -20,7 +20,16 @@ from urllib.parse import urlsplit, quote, urljoin from SYS.logger import log, debug from API.HTTP import HTTPClient from SYS.utils import ensure_directory, unique_path, unique_preserve_order -from ._shared import Cmdlet, CmdletArg, SharedArgs, create_pipe_object_result, normalize_result_input, should_show_help, get_field +from . import _shared as sh + +Cmdlet = sh.Cmdlet +CmdletArg = sh.CmdletArg +SharedArgs = sh.SharedArgs +create_pipe_object_result = sh.create_pipe_object_result +normalize_result_input = sh.normalize_result_input +should_show_help = sh.should_show_help +get_field = sh.get_field +parse_cmdlet_args = sh.parse_cmdlet_args import pipeline as pipeline_context # ============================================================================ @@ -33,20 +42,7 @@ import pipeline as pipeline_context # Playwright & Screenshot Dependencies # ============================================================================ -try: - from playwright.sync_api import ( - TimeoutError as PlaywrightTimeoutError, - sync_playwright, - ) - HAS_PLAYWRIGHT = True -except Exception: - HAS_PLAYWRIGHT = False - PlaywrightTimeoutError = TimeoutError # type: ignore - - def sync_playwright(*_args: Any, **_kwargs: Any) -> Any: # type: ignore - raise RuntimeError( - "playwright is required for screenshot capture; install with: pip install playwright; then: playwright install" - ) +from tool.playwright import HAS_PLAYWRIGHT, PlaywrightTimeoutError, PlaywrightTool try: from config import resolve_output_dir @@ -128,6 +124,7 @@ class ScreenshotOptions: prefer_platform_target: bool = False target_selectors: Optional[Sequence[str]] = None selector_timeout_ms: int = 10_000 + playwright_tool: Optional[PlaywrightTool] = None @dataclass(slots=True) @@ -324,142 +321,119 @@ def _prepare_output_path(options: ScreenshotOptions) -> Path: def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str]) -> None: """Capture screenshot using Playwright.""" debug(f"[_capture] Starting capture for {options.url} -> {destination}") - playwright = None - browser = None - context = None try: - debug("Starting Playwright...", flush=True) - playwright = sync_playwright().start() - log("Launching Chromium browser...", flush=True) + tool = options.playwright_tool or PlaywrightTool({}) + tool.debug_dump() + + log("Launching browser...", flush=True) format_name = _normalise_format(options.output_format) headless = options.headless or format_name == "pdf" debug(f"[_capture] Format: {format_name}, Headless: {headless}") if format_name == "pdf" and not options.headless: warnings.append("pdf output requires headless Chromium; overriding headless mode") - browser = playwright.chromium.launch( - headless=headless, - args=["--disable-blink-features=AutomationControlled"], - ) - log("Creating browser context...", flush=True) - context = browser.new_context( - user_agent=USER_AGENT, - viewport=DEFAULT_VIEWPORT, - ignore_https_errors=True, - ) - page = context.new_page() - 
log(f"Navigating to {options.url}...", flush=True) - try: - page.goto(options.url, timeout=90_000, wait_until="domcontentloaded") - log("Page loaded successfully", flush=True) - except PlaywrightTimeoutError: - warnings.append("navigation timeout; capturing current page state") - log("Navigation timeout; proceeding with current state", flush=True) - - # Skip article lookup by default (wait_for_article defaults to False) - if options.wait_for_article: + + with tool.open_page(headless=headless) as page: + log(f"Navigating to {options.url}...", flush=True) try: - log("Waiting for article element...", flush=True) - page.wait_for_selector("article", timeout=10_000) - log("Article element found", flush=True) + tool.goto(page, options.url) + log("Page loaded successfully", flush=True) except PlaywrightTimeoutError: - warnings.append("
selector not found; capturing fallback") - log("Article element not found; using fallback", flush=True) - - if options.wait_after_load > 0: - log(f"Waiting {options.wait_after_load}s for page stabilization...", flush=True) - time.sleep(min(10.0, max(0.0, options.wait_after_load))) - if options.replace_video_posters: - log("Replacing video elements with posters...", flush=True) - page.evaluate( - """ - document.querySelectorAll('video').forEach(v => { - if (v.poster) { - const img = document.createElement('img'); - img.src = v.poster; - img.style.maxWidth = '100%'; - img.style.borderRadius = '12px'; - v.replaceWith(img); - } - }); - """ - ) - # Attempt platform-specific target capture if requested (and not PDF) - element_captured = False - if options.prefer_platform_target and format_name != "pdf": - log("Attempting platform-specific content capture...", flush=True) - try: - _platform_preprocess(options.url, page, warnings) - except Exception as e: - debug(f"[_capture] Platform preprocess failed: {e}") - pass - selectors = list(options.target_selectors or []) - if not selectors: - selectors = _selectors_for_url(options.url) + warnings.append("navigation timeout; capturing current page state") + log("Navigation timeout; proceeding with current state", flush=True) - debug(f"[_capture] Trying selectors: {selectors}") - for sel in selectors: + # Skip article lookup by default (wait_for_article defaults to False) + if options.wait_for_article: try: - log(f"Trying selector: {sel}", flush=True) - el = page.wait_for_selector(sel, timeout=max(0, int(options.selector_timeout_ms))) + log("Waiting for article element...", flush=True) + page.wait_for_selector("article", timeout=10_000) + log("Article element found", flush=True) except PlaywrightTimeoutError: - log(f"Selector not found: {sel}", flush=True) - continue + warnings.append("
selector not found; capturing fallback") + log("Article element not found; using fallback", flush=True) + + if options.wait_after_load > 0: + log(f"Waiting {options.wait_after_load}s for page stabilization...", flush=True) + time.sleep(min(10.0, max(0.0, options.wait_after_load))) + if options.replace_video_posters: + log("Replacing video elements with posters...", flush=True) + page.evaluate( + """ + document.querySelectorAll('video').forEach(v => { + if (v.poster) { + const img = document.createElement('img'); + img.src = v.poster; + img.style.maxWidth = '100%'; + img.style.borderRadius = '12px'; + v.replaceWith(img); + } + }); + """ + ) + # Attempt platform-specific target capture if requested (and not PDF) + element_captured = False + if options.prefer_platform_target and format_name != "pdf": + log("Attempting platform-specific content capture...", flush=True) try: - if el is not None: - log(f"Found element with selector: {sel}", flush=True) - try: - el.scroll_into_view_if_needed(timeout=1000) - except Exception: - pass - log(f"Capturing element to {destination}...", flush=True) - el.screenshot(path=str(destination), type=("jpeg" if format_name == "jpeg" else None)) - element_captured = True - log("Element captured successfully", flush=True) - break - except Exception as exc: - warnings.append(f"element capture failed for '{sel}': {exc}") - log(f"Failed to capture element: {exc}", flush=True) - # Fallback to default capture paths - if element_captured: - pass - elif format_name == "pdf": - log("Generating PDF...", flush=True) - page.emulate_media(media="print") - page.pdf(path=str(destination), print_background=True) - log(f"PDF saved to {destination}", flush=True) - else: - log(f"Capturing full page to {destination}...", flush=True) - screenshot_kwargs: Dict[str, Any] = {"path": str(destination)} - if format_name == "jpeg": - screenshot_kwargs["type"] = "jpeg" - screenshot_kwargs["quality"] = 90 - if options.full_page: - page.screenshot(full_page=True, **screenshot_kwargs) + _platform_preprocess(options.url, page, warnings) + except Exception as e: + debug(f"[_capture] Platform preprocess failed: {e}") + pass + selectors = list(options.target_selectors or []) + if not selectors: + selectors = _selectors_for_url(options.url) + + debug(f"[_capture] Trying selectors: {selectors}") + for sel in selectors: + try: + log(f"Trying selector: {sel}", flush=True) + el = page.wait_for_selector(sel, timeout=max(0, int(options.selector_timeout_ms))) + except PlaywrightTimeoutError: + log(f"Selector not found: {sel}", flush=True) + continue + try: + if el is not None: + log(f"Found element with selector: {sel}", flush=True) + try: + el.scroll_into_view_if_needed(timeout=1000) + except Exception: + pass + log(f"Capturing element to {destination}...", flush=True) + el.screenshot(path=str(destination), type=("jpeg" if format_name == "jpeg" else None)) + element_captured = True + log("Element captured successfully", flush=True) + break + except Exception as exc: + warnings.append(f"element capture failed for '{sel}': {exc}") + log(f"Failed to capture element: {exc}", flush=True) + # Fallback to default capture paths + if element_captured: + pass + elif format_name == "pdf": + log("Generating PDF...", flush=True) + page.emulate_media(media="print") + page.pdf(path=str(destination), print_background=True) + log(f"PDF saved to {destination}", flush=True) else: - article = page.query_selector("article") - if article is not None: - article_kwargs = dict(screenshot_kwargs) - article_kwargs.pop("full_page", 
None) - article.screenshot(**article_kwargs) + log(f"Capturing full page to {destination}...", flush=True) + screenshot_kwargs: Dict[str, Any] = {"path": str(destination)} + if format_name == "jpeg": + screenshot_kwargs["type"] = "jpeg" + screenshot_kwargs["quality"] = 90 + if options.full_page: + page.screenshot(full_page=True, **screenshot_kwargs) else: - page.screenshot(**screenshot_kwargs) - log(f"Screenshot saved to {destination}", flush=True) + article = page.query_selector("article") + if article is not None: + article_kwargs = dict(screenshot_kwargs) + article_kwargs.pop("full_page", None) + article.screenshot(**article_kwargs) + else: + page.screenshot(**screenshot_kwargs) + log(f"Screenshot saved to {destination}", flush=True) except Exception as exc: debug(f"[_capture] Exception: {exc}") raise ScreenshotError(f"Failed to capture screenshot: {exc}") from exc - finally: - log("Cleaning up browser resources...", flush=True) - with contextlib.suppress(Exception): - if context is not None: - context.close() - with contextlib.suppress(Exception): - if browser is not None: - browser.close() - with contextlib.suppress(Exception): - if playwright is not None: - playwright.stop() - log("Cleanup complete", flush=True) def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult: @@ -511,8 +485,6 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: Screenshots are created using Playwright and marked as temporary so they can be cleaned up later with the cleanup cmdlet. """ - from ._shared import parse_cmdlet_args - debug(f"[_run] screen-shot invoked with args: {args}") # Help check @@ -534,6 +506,19 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: parsed = parse_cmdlet_args(args, CMDLET) format_value = parsed.get("format") + if not format_value: + # Default format can be set via config.conf tool block: + # [tool=playwright] + # format="pdf" + try: + tool_cfg = config.get("tool", {}) if isinstance(config, dict) else {} + pw_cfg = tool_cfg.get("playwright") if isinstance(tool_cfg, dict) else None + if isinstance(pw_cfg, dict): + format_value = pw_cfg.get("format") + except Exception: + pass + if not format_value: + format_value = "png" storage_value = parsed.get("storage") selector_arg = parsed.get("selector") selectors = [selector_arg] if selector_arg else [] @@ -669,6 +654,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: prefer_platform_target=False, wait_for_article=False, full_page=True, + playwright_tool=PlaywrightTool(config), ) screenshot_result = _capture_screenshot(options) diff --git a/cmdlet/search_provider.py b/cmdlet/search_provider.py index 29c52d8..2851b09 100644 --- a/cmdlet/search_provider.py +++ b/cmdlet/search_provider.py @@ -10,7 +10,13 @@ import importlib from SYS.logger import log, debug from ProviderCore.registry import get_search_provider, list_search_providers -from ._shared import Cmdlet, CmdletArg, should_show_help +from . import _shared as sh + +Cmdlet, CmdletArg, should_show_help = ( + sh.Cmdlet, + sh.CmdletArg, + sh.should_show_help, +) import pipeline as ctx # Optional dependencies diff --git a/cmdlet/search_store.py b/cmdlet/search_store.py index 4b05875..66c536e 100644 --- a/cmdlet/search_store.py +++ b/cmdlet/search_store.py @@ -10,7 +10,17 @@ import sys from SYS.logger import log, debug -from ._shared import Cmdlet, CmdletArg, SharedArgs, get_field, should_show_help, normalize_hash, first_title_tag +from . 
import _shared as sh + +Cmdlet, CmdletArg, SharedArgs, get_field, should_show_help, normalize_hash, first_title_tag = ( + sh.Cmdlet, + sh.CmdletArg, + sh.SharedArgs, + sh.get_field, + sh.should_show_help, + sh.normalize_hash, + sh.first_title_tag, +) import pipeline as ctx @@ -209,6 +219,10 @@ class Search_Store(Cmdlet): table_title += f" [{storage_backend}]" table = ResultTable(table_title) + try: + table.set_source_command("search-store", list(args_list)) + except Exception: + pass if hash_query: try: table.set_preserve_order(True) @@ -309,6 +323,11 @@ class Search_Store(Cmdlet): ext_val = Path(path_str).suffix except Exception: ext_val = None + if not ext_val and title: + try: + ext_val = Path(str(title)).suffix + except Exception: + ext_val = None size_bytes = meta_obj.get("size") if size_bytes is None: @@ -333,6 +352,20 @@ class Search_Store(Cmdlet): ctx.emit(payload) if found_any: + # Title should reflect the command, query, and only stores present in the table. + store_counts: "OrderedDict[str, int]" = OrderedDict() + for row_item in results_list: + store_val = str(row_item.get("store") or "").strip() + if not store_val: + continue + if store_val not in store_counts: + store_counts[store_val] = 0 + store_counts[store_val] += 1 + + counts_part = " ".join(f"{name}:{count}" for name, count in store_counts.items() if count > 0) + base_title = f"search-store: {query}".strip() + table.title = f"{base_title} | {counts_part}" if counts_part else base_title + ctx.set_last_result_table(table, results_list) db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2)) db.update_worker_status(worker_id, 'completed') @@ -377,28 +410,6 @@ class Search_Store(Cmdlet): log(f"Backend {backend_name} search failed: {exc}", file=sys.stderr) results = all_results[:limit] - def _format_storage_label(name: str) -> str: - clean = str(name or "").strip() - if not clean: - return "Unknown" - return clean.replace("_", " ").title() - - storage_counts: OrderedDict[str, int] = OrderedDict((name, 0) for name in searched_backends) - for item in results or []: - store = get_field(item, "store") - if not store: - continue - key = str(store).lower() - if key not in storage_counts: - storage_counts[key] = 0 - storage_counts[key] += 1 - - if storage_counts or query: - display_counts = OrderedDict((_format_storage_label(name), count) for name, count in storage_counts.items()) - summary_line = table.set_storage_summary(display_counts, query, inline=True) - if summary_line: - table.title = summary_line - if results: for item in results: def _as_dict(obj: Any) -> Dict[str, Any]: @@ -428,6 +439,20 @@ class Search_Store(Cmdlet): results_list.append(normalized) ctx.emit(normalized) + # Title should reflect the command, query, and only stores present in the table. 
+ store_counts: "OrderedDict[str, int]" = OrderedDict() + for row_item in results_list: + store_val = str(row_item.get("store") or "").strip() + if not store_val: + continue + if store_val not in store_counts: + store_counts[store_val] = 0 + store_counts[store_val] += 1 + + counts_part = " ".join(f"{name}:{count}" for name, count in store_counts.items() if count > 0) + base_title = f"search-store: {query}".strip() + table.title = f"{base_title} | {counts_part}" if counts_part else base_title + ctx.set_last_result_table(table, results_list) db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2)) else: diff --git a/cmdlet/trim_file.py b/cmdlet/trim_file.py index 4bfff58..cce7b5a 100644 --- a/cmdlet/trim_file.py +++ b/cmdlet/trim_file.py @@ -11,14 +11,14 @@ import re from SYS.logger import log, debug from SYS.utils import sha256_file -from ._shared import ( - Cmdlet, - CmdletArg, - parse_cmdlet_args, - normalize_result_input, - extract_tag_from_result, - extract_title_from_result -) +from . import _shared as sh + +Cmdlet = sh.Cmdlet +CmdletArg = sh.CmdletArg +parse_cmdlet_args = sh.parse_cmdlet_args +normalize_result_input = sh.normalize_result_input +extract_tag_from_result = sh.extract_tag_from_result +extract_title_from_result = sh.extract_title_from_result import pipeline as ctx CMDLET = Cmdlet( diff --git a/cmdlet/catalog.py b/cmdlet_catalog.py similarity index 95% rename from cmdlet/catalog.py rename to cmdlet_catalog.py index 68a8f08..a3b3117 100644 --- a/cmdlet/catalog.py +++ b/cmdlet_catalog.py @@ -26,9 +26,9 @@ def ensure_registry_loaded() -> None: def _normalize_mod_name(mod_name: str) -> str: """Normalize a command/module name for import resolution.""" normalized = (mod_name or "").strip() - if normalized.startswith('.'): - normalized = normalized.lstrip('.') - normalized = normalized.replace('-', '_') + if normalized.startswith("."): + normalized = normalized.lstrip(".") + normalized = normalized.replace("-", "_") return normalized @@ -83,7 +83,7 @@ def get_cmdlet_metadata(cmd_name: str) -> Optional[Dict[str, Any]]: if data is None: try: - reg_fn = (REGISTRY or {}).get(cmd_name.replace('_', '-').lower()) + reg_fn = (REGISTRY or {}).get(cmd_name.replace("_", "-").lower()) if reg_fn: owner_mod = getattr(reg_fn, "__module__", "") if owner_mod: @@ -186,8 +186,6 @@ def get_cmdlet_arg_flags(cmd_name: str) -> List[str]: if not meta: return [] - # Preserve the order that arguments are defined on the cmdlet (arg=[...]) so - # completions feel stable and predictable. 
flags: List[str] = [] seen: set[str] = set() diff --git a/cmdnat/help.py b/cmdnat/help.py index b6290a2..82a394c 100644 --- a/cmdnat/help.py +++ b/cmdnat/help.py @@ -135,7 +135,7 @@ def _render_detail(meta: Dict[str, Any], args: Sequence[str]) -> None: def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: try: - from cmdlet import catalog as _catalog + import cmdlet_catalog as _catalog CMDLET.arg[0].choices = _normalize_choice_list(_catalog.list_cmdlet_names()) metadata = _catalog.list_cmdlet_metadata() diff --git a/cmdnat/pipe.py b/cmdnat/pipe.py index e3c16e1..e789dbb 100644 --- a/cmdnat/pipe.py +++ b/cmdnat/pipe.py @@ -16,7 +16,7 @@ from models import PipeObject from API.folder import LocalLibrarySearchOptimizer from config import get_local_storage_path, get_hydrus_access_key, get_hydrus_url -from hydrus_health_check import get_cookies_file_path + _ALLDEBRID_UNLOCK_CACHE: Dict[str, str] = {} @@ -372,12 +372,18 @@ def _build_hydrus_header(config: Dict[str, Any]) -> Optional[str]: def _build_ytdl_options(config: Optional[Dict[str, Any]], hydrus_header: Optional[str]) -> Optional[str]: """Compose ytdl-raw-options string including cookies and optional Hydrus header.""" opts: List[str] = [] + cookies_path = None try: - cookies_path = get_cookies_file_path() + from tool.ytdlp import YtDlpTool + + cookiefile = YtDlpTool(config or {}).resolve_cookiefile() + if cookiefile is not None: + cookies_path = str(cookiefile) except Exception: cookies_path = None + if cookies_path: - opts.append(f"cookies={cookies_path.replace('\\', '/')}") + opts.append(f"cookies={cookies_path.replace('\\', '/')}" ) else: opts.append("cookies-from-browser=chrome") if hydrus_header: @@ -407,10 +413,18 @@ def _is_hydrus_path(path: str, hydrus_url: Optional[str]) -> bool: return True return False -def _ensure_ytdl_cookies() -> None: +def _ensure_ytdl_cookies(config: Optional[Dict[str, Any]] = None) -> None: """Ensure yt-dlp options are set correctly for this session.""" from pathlib import Path - cookies_path = get_cookies_file_path() + cookies_path = None + try: + from tool.ytdlp import YtDlpTool + + cookiefile = YtDlpTool(config or {}).resolve_cookiefile() + if cookiefile is not None: + cookies_path = str(cookiefile) + except Exception: + cookies_path = None if cookies_path: # Check if file exists and has content (use forward slashes for path checking) check_path = cookies_path.replace('\\', '/') @@ -635,7 +649,7 @@ def _queue_items( pass # Just verify cookies are configured, don't try to set via IPC - _ensure_ytdl_cookies() + _ensure_ytdl_cookies(config) hydrus_header = _build_hydrus_header(config or {}) ytdl_opts = _build_ytdl_options(config, hydrus_header) @@ -1426,7 +1440,15 @@ def _start_mpv(items: List[Any], config: Optional[Dict[str, Any]] = None, start_ hydrus_header = _build_hydrus_header(config or {}) ytdl_opts = _build_ytdl_options(config, hydrus_header) - cookies_path = get_cookies_file_path() + cookies_path = None + try: + from tool.ytdlp import YtDlpTool + + cookiefile = YtDlpTool(config or {}).resolve_cookiefile() + if cookiefile is not None: + cookies_path = str(cookiefile) + except Exception: + cookies_path = None if cookies_path: debug(f"Starting MPV with cookies file: {cookies_path.replace('\\', '/')}") else: diff --git a/config.py b/config.py index 1ee6a26..13cf07b 100644 --- a/config.py +++ b/config.py @@ -1,11 +1,5 @@ -"""Unified configuration helpers. - -Configuration is defined exclusively via the modular `.conf` format. 
- -- Required: `temp` -- Optional: stores, providers, and other settings -- Modular: optional fragments in `config.d/*.conf` are merged in lexicographic order +""" """ from __future__ import annotations @@ -130,6 +124,21 @@ def _apply_conf_block(config: Dict[str, Any], kind: str, subtype: str, block: Di provider[provider_name] = dict(block) return + if kind_l == "tool": + tool_name = str(subtype).strip().lower() + if not tool_name: + return + tool = config.setdefault("tool", {}) + if not isinstance(tool, dict): + config["tool"] = {} + tool = config["tool"] + existing = tool.get(tool_name) + if isinstance(existing, dict): + _merge_dict_inplace(existing, block) + else: + tool[tool_name] = dict(block) + return + def parse_conf_text(text: str, *, base: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: """Parse a lightweight .conf format into the app's config dict. @@ -227,7 +236,7 @@ def _serialize_conf(config: Dict[str, Any]) -> str: # Top-level scalars first for key in sorted(config.keys()): - if key in {"store", "provider"}: + if key in {"store", "provider", "tool"}: continue value = config.get(key) if isinstance(value, dict): @@ -263,6 +272,18 @@ def _serialize_conf(config: Dict[str, Any]) -> str: for k in sorted(block.keys()): lines.append(f"{k}={_format_conf_value(block.get(k))}") + # Tool blocks + tool = config.get("tool") + if isinstance(tool, dict): + for name in sorted(tool.keys()): + block = tool.get(name) + if not isinstance(block, dict): + continue + lines.append("") + lines.append(f"[tool={name}]") + for k in sorted(block.keys()): + lines.append(f"{k}={_format_conf_value(block.get(k))}") + return "\n".join(lines).rstrip() + "\n" @@ -510,12 +531,43 @@ def get_provider_credentials(config: Dict[str, Any], provider: str) -> Optional[ def resolve_cookies_path(config: Dict[str, Any], script_dir: Optional[Path] = None) -> Optional[Path]: - value = config.get("cookies") - if value: + # Support both legacy top-level `cookies=...` and the modular conf style: + # [tool=ytdlp] + # cookies="C:\\path\\cookies.txt" + values: list[Any] = [] + try: + values.append(config.get("cookies")) + except Exception: + pass + + try: + tool = config.get("tool") + if isinstance(tool, dict): + ytdlp = tool.get("ytdlp") + if isinstance(ytdlp, dict): + values.append(ytdlp.get("cookies")) + values.append(ytdlp.get("cookiefile")) + except Exception: + pass + + try: + ytdlp_block = config.get("ytdlp") + if isinstance(ytdlp_block, dict): + values.append(ytdlp_block.get("cookies")) + values.append(ytdlp_block.get("cookiefile")) + except Exception: + pass + + base_dir = script_dir or SCRIPT_DIR + for value in values: + if not value: + continue candidate = Path(str(value)).expanduser() + if not candidate.is_absolute(): + candidate = (base_dir / candidate).expanduser() if candidate.is_file(): return candidate - base_dir = script_dir or SCRIPT_DIR + default_path = base_dir / "cookies.txt" if default_path.is_file(): return default_path diff --git a/hydrus_health_check.py b/hydrus_health_check.py deleted file mode 100644 index 1c7854b..0000000 --- a/hydrus_health_check.py +++ /dev/null @@ -1,43 +0,0 @@ -"""Cookies availability helpers. - -This module is intentionally limited to cookie-file resolution used by yt-dlp. -Other service availability checks live in their owning store/provider objects. 
-""" - -import sys - -from pathlib import Path -from typing import Any, Dict, Optional, Tuple - -from SYS.logger import debug - -# Global state for Cookies availability -_COOKIES_FILE_PATH: Optional[str] = None - - -def initialize_cookies_check(config: Optional[Dict[str, Any]] = None, emit_debug: bool = True) -> Tuple[bool, str]: - """Resolve cookies file path from config, falling back to cookies.txt in app root. - - Returns a tuple of (found, detail_message). - """ - global _COOKIES_FILE_PATH - - try: - from config import resolve_cookies_path - cookies_path = resolve_cookies_path(config or {}, script_dir=Path(__file__).parent) - except Exception: - cookies_path = None - - if cookies_path and cookies_path.exists(): - _COOKIES_FILE_PATH = str(cookies_path) - if emit_debug: - debug(f"Cookies: ENABLED - Found cookies file", file=sys.stderr) - return True, str(cookies_path) - else: - _COOKIES_FILE_PATH = None - return False, "Not found" - - -def get_cookies_file_path() -> Optional[str]: - """Get the path to the cookies.txt file if it exists.""" - return _COOKIES_FILE_PATH diff --git a/models.py b/models.py index 478323a..9cc98c5 100644 --- a/models.py +++ b/models.py @@ -348,6 +348,8 @@ class DownloadOptions: playlist_items: Optional[str] = None # yt-dlp --playlist-items format (e.g., "1-3,5,8") no_playlist: bool = False # If True, pass --no-playlist to yt-dlp quiet: bool = False # If True, suppress all console output (progress, debug logs) + embed_chapters: bool = False # If True, pass yt-dlp --embed-chapters / embedchapters + write_sub: bool = False # If True, download subtitles (writesubtitles/writeautomaticsub) class SendFunc(Protocol): diff --git a/pyproject.toml b/pyproject.toml index d4e84d7..43bacf1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ dependencies = [ "textual>=0.30.0", # Media processing and downloading - "yt-dlp>=2023.11.0", + "yt-dlp[default]>=2023.11.0", "yt-dlp-ejs", # EJS challenge solver scripts for YouTube JavaScript challenges "requests>=2.31.0", "httpx>=0.25.0", @@ -43,7 +43,6 @@ dependencies = [ # Document and data handling "pypdf>=3.0.0", - "img2pdf>=0.6.0", "mutagen>=1.46.0", "cbor2>=4.0", @@ -53,7 +52,6 @@ dependencies = [ # Metadata extraction and processing "musicbrainzngs>=0.7.0", - "beautifulsoup4>=4.12.0", "lxml>=4.9.0", # Advanced searching and libraries diff --git a/requirements.txt b/requirements.txt index cd84ea6..db07c49 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,14 +4,13 @@ prompt-toolkit>=3.0.0 textual>=0.30.0 # Media processing and downloading -yt-dlp>=2023.11.0 +yt-dlp[default]>=2023.11.0 requests>=2.31.0 httpx>=0.25.0 ffmpeg-python>=0.2.0 # Document and data handling pypdf>=3.0.0 -img2pdf>=0.6.0 mutagen>=1.46.0 cbor2>=4.0 @@ -21,7 +20,6 @@ python-bidi>=0.4.2 # Metadata extraction and processing musicbrainzngs>=0.7.0 -beautifulsoup4>=4.12.0 lxml>=4.9.0 # Advanced searching and libraries diff --git a/test-login.py b/test-login.py deleted file mode 100644 index cb38c13..0000000 --- a/test-login.py +++ /dev/null @@ -1,336 +0,0 @@ -import requests -import random, string -from concurrent import futures -from tqdm import tqdm -import time -from datetime import datetime -import argparse -import os -import sys -import shutil -import json -import re -import base64 -import hashlib -from Crypto.Cipher import AES -from Crypto.Util import Counter - -def display_error(response, message): - print(message) - print(response) - print(response.text) - exit() - -def get_book_infos(session, url): - r = session.get(url).text - 
infos_url = "https:" + r.split('"url":"')[1].split('"')[0].replace("\\u0026", "&") - response = session.get(infos_url) - data = response.json()['data'] - title = data['brOptions']['bookTitle'].strip().replace(" ", "_") - title = ''.join( c for c in title if c not in '<>:"/\\|?*' ) # Filter forbidden chars in directory names (Windows & Linux) - title = title[:150] # Trim the title to avoid long file names - metadata = data['metadata'] - links = [] - for item in data['brOptions']['data']: - for page in item: - links.append(page['uri']) - - if len(links) > 1: - print(f"[+] Found {len(links)} pages") - return title, links, metadata - else: - print(f"[-] Error while getting image links") - exit() - -def login(email, password): - session = requests.Session() - response = session.get("https://archive.org/services/account/login/") - login_data = response.json() - if not login_data['success']: - display_error(response, "[-] Error while getting login token:") - - login_token = login_data["value"]["token"] - - headers = {"Content-Type": "application/x-www-form-urlencoded"} - data = {"username":email, "password":password, "t": login_token} - - response = session.post("https://archive.org/services/account/login/", headers=headers, data=json.dumps(data)) - try: - response_json = response.json() - except: - display_error(response, "[-] Error while login:") - - if response_json["success"] == False: - if response_json["value"] == "bad_login": - print("[-] Invalid credentials!") - exit() - display_error(response, "[-] Error while login:") - else: - print("[+] Successful login") - return session - -def loan(session, book_id, verbose=True): - data = { - "action": "grant_access", - "identifier": book_id - } - response = session.post("https://archive.org/services/loans/loan/searchInside.php", data=data) - data['action'] = "browse_book" - response = session.post("https://archive.org/services/loans/loan/", data=data) - - if response.status_code == 400 : - try: - if response.json()["error"] == "This book is not available to borrow at this time. Please try again later.": - print("This book doesn't need to be borrowed") - return session - else : - display_error(response, "Something went wrong when trying to borrow the book.") - except: # The response is not in JSON format - display_error(response, "The book cannot be borrowed") - - data['action'] = "create_token" - response = session.post("https://archive.org/services/loans/loan/", data=data) - - if "token" in response.text: - if verbose: - print("[+] Successful loan") - return session - else: - display_error(response, "Something went wrong when trying to borrow the book, maybe you can't borrow this book.") - -def return_loan(session, book_id): - data = { - "action": "return_loan", - "identifier": book_id - } - response = session.post("https://archive.org/services/loans/loan/", data=data) - if response.status_code == 200 and response.json()["success"]: - print("[+] Book returned") - else: - display_error(response, "Something went wrong when trying to return the book") - -def image_name(pages, page, directory): - return f"{directory}/{(len(str(pages)) - len(str(page))) * '0'}{page}.jpg" - -def deobfuscate_image(image_data, link, obf_header): - """ - @Author: https://github.com/justimm - Decrypts the first 1024 bytes of image_data using AES-CTR. - The obfuscation_header is expected in the form "1|" - where the base64-decoded counter is 16 bytes. 
- We derive the AES key by taking the SHA-1 digest of the image URL (with protocol/host removed) - and using the first 16 bytes. - For AES-CTR, we use a 16-byte counter block. The first 8 bytes are used as a fixed prefix, - and the remaining 8 bytes (interpreted as a big-endian integer) are used as the initial counter value. - """ - try: - version, counter_b64 = obf_header.split('|') - except Exception as e: - raise ValueError("Invalid X-Obfuscate header format") from e - - if version != '1': - raise ValueError("Unsupported obfuscation version: " + version) - - # Derive AES key: replace protocol/host in link with '/' - aesKey = re.sub(r"^https?:\/\/.*?\/", "/", link) - sha1_digest = hashlib.sha1(aesKey.encode('utf-8')).digest() - key = sha1_digest[:16] - - # Decode the counter (should be 16 bytes) - counter_bytes = base64.b64decode(counter_b64) - if len(counter_bytes) != 16: - raise ValueError(f"Expected counter to be 16 bytes, got {len(counter_bytes)}") - - prefix = counter_bytes[:8] - initial_value = int.from_bytes(counter_bytes[8:], byteorder='big') - - # Create AES-CTR cipher with a 64-bit counter length. - ctr = Counter.new(64, prefix=prefix, initial_value=initial_value, little_endian=False) - cipher = AES.new(key, AES.MODE_CTR, counter=ctr) - - decrypted_part = cipher.decrypt(image_data[:1024]) - new_data = decrypted_part + image_data[1024:] - return new_data - -def download_one_image(session, link, i, directory, book_id, pages): - headers = { - "Referer": "https://archive.org/", - "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8", - "Sec-Fetch-Site": "same-site", - "Sec-Fetch-Mode": "no-cors", - "Sec-Fetch-Dest": "image", - } - retry = True - response = None - while retry: - try: - response = session.get(link, headers=headers) - if response.status_code == 403: - session = loan(session, book_id, verbose=False) - raise Exception("Borrow again") - elif response.status_code == 200: - retry = False - except: - time.sleep(1) # Wait 1 second before retrying - - image = image_name(pages, i, directory) - - obf_header = response.headers.get("X-Obfuscate") - image_content = None - if obf_header: - try: - image_content = deobfuscate_image(response.content, link, obf_header) - except Exception as e: - print(f"[ERROR] Deobfuscation failed: {e}") - return - else: - image_content = response.content - - with open(image, "wb") as f: - f.write(image_content) - -def download(session, n_threads, directory, links, scale, book_id): - print("Downloading pages...") - links = [f"{link}&rotate=0&scale={scale}" for link in links] - pages = len(links) - - tasks = [] - with futures.ThreadPoolExecutor(max_workers=n_threads) as executor: - for link in links: - i = links.index(link) - tasks.append(executor.submit(download_one_image, session=session, link=link, i=i, directory=directory, book_id=book_id, pages=pages)) - for task in tqdm(futures.as_completed(tasks), total=len(tasks)): - pass - - images = [image_name(pages, i, directory) for i in range(len(links))] - return images - -def make_pdf(pdf, title, directory): - file = title+".pdf" - # Handle the case where multiple books with the same name are downloaded - i = 1 - while os.path.isfile(os.path.join(directory, file)): - file = f"{title}({i}).pdf" - i += 1 - - with open(os.path.join(directory, file),"wb") as f: - f.write(pdf) - print(f"[+] PDF saved as \"{file}\"") - -if __name__ == "__main__": - - my_parser = argparse.ArgumentParser() - my_parser.add_argument('-e', '--email', help='Your archive.org email', type=str, required=True) - 
my_parser.add_argument('-p', '--password', help='Your archive.org password', type=str, required=True) - my_parser.add_argument('-u', '--url', help='Link to the book (https://archive.org/details/XXXX). You can use this argument several times to download multiple books', action='append', type=str) - my_parser.add_argument('-d', '--dir', help='Output directory', type=str) - my_parser.add_argument('-f', '--file', help='File where are stored the URLs of the books to download', type=str) - my_parser.add_argument('-r', '--resolution', help='Image resolution (10 to 0, 0 is the highest), [default 3]', type=int, default=3) - my_parser.add_argument('-t', '--threads', help="Maximum number of threads, [default 50]", type=int, default=50) - my_parser.add_argument('-j', '--jpg', help="Output to individual JPG's rather than a PDF", action='store_true') - my_parser.add_argument('-m', '--meta', help="Output the metadata of the book to a json file (-j option required)", action='store_true') - - if len(sys.argv) == 1: - my_parser.print_help(sys.stderr) - sys.exit(1) - args = my_parser.parse_args() - - if args.url is None and args.file is None: - my_parser.error("At least one of --url and --file required") - - email = args.email - password = args.password - scale = args.resolution - n_threads = args.threads - d = args.dir - - if d == None: - d = os.getcwd() - elif not os.path.isdir(d): - print(f"Output directory does not exist!") - exit() - - if args.url is not None: - urls = args.url - else: - if os.path.exists(args.file): - with open(args.file) as f: - urls = f.read().strip().split("\n") - else: - print(f"{args.file} does not exist!") - exit() - - # Check the urls format - for url in urls: - if not url.startswith("https://archive.org/details/"): - print(f"{url} --> Invalid url. 
URL must starts with \"https://archive.org/details/\"") - exit() - - print(f"{len(urls)} Book(s) to download") - session = login(email, password) - - for url in urls: - book_id = list(filter(None, url.split("/")))[3] - print("="*40) - print(f"Current book: https://archive.org/details/{book_id}") - session = loan(session, book_id) - title, links, metadata = get_book_infos(session, url) - - directory = os.path.join(d, title) - # Handle the case where multiple books with the same name are downloaded - i = 1 - _directory = directory - while os.path.isdir(directory): - directory = f"{_directory}({i})" - i += 1 - os.makedirs(directory) - - if args.meta: - print("Writing metadata.json...") - with open(f"{directory}/metadata.json",'w') as f: - json.dump(metadata,f) - - images = download(session, n_threads, directory, links, scale, book_id) - - if not args.jpg: # Create pdf with images and remove the images folder - import img2pdf - - # prepare PDF metadata - # sometimes archive metadata is missing - pdfmeta = { } - # ensure metadata are str - for key in ["title", "creator", "associated-names"]: - if key in metadata: - if isinstance(metadata[key], str): - pass - elif isinstance(metadata[key], list): - metadata[key] = "; ".join(metadata[key]) - else: - raise Exception("unsupported metadata type") - # title - if 'title' in metadata: - pdfmeta['title'] = metadata['title'] - # author - if 'creator' in metadata and 'associated-names' in metadata: - pdfmeta['author'] = metadata['creator'] + "; " + metadata['associated-names'] - elif 'creator' in metadata: - pdfmeta['author'] = metadata['creator'] - elif 'associated-names' in metadata: - pdfmeta['author'] = metadata['associated-names'] - # date - if 'date' in metadata: - try: - pdfmeta['creationdate'] = datetime.strptime(metadata['date'][0:4], '%Y') - except: - pass - # keywords - pdfmeta['keywords'] = [f"https://archive.org/details/{book_id}"] - - pdf = img2pdf.convert(images, **pdfmeta) - make_pdf(pdf, title, args.dir if args.dir != None else "") - try: - shutil.rmtree(directory) - except OSError as e: - print ("Error: %s - %s." % (e.filename, e.strerror)) - - return_loan(session, book_id) \ No newline at end of file diff --git a/tool/__init__.py b/tool/__init__.py new file mode 100644 index 0000000..cd9f29d --- /dev/null +++ b/tool/__init__.py @@ -0,0 +1,11 @@ +"""Tool helpers. + +This package contains wrappers around external tools (e.g. yt-dlp) so cmdlets can share +common defaults (cookies, timeouts, format selectors) and users can override them via +`config.conf`. +""" + +from .ytdlp import YtDlpTool, YtDlpDefaults +from .playwright import PlaywrightTool, PlaywrightDefaults + +__all__ = ["YtDlpTool", "YtDlpDefaults", "PlaywrightTool", "PlaywrightDefaults"] diff --git a/tool/playwright.py b/tool/playwright.py new file mode 100644 index 0000000..3b9d751 --- /dev/null +++ b/tool/playwright.py @@ -0,0 +1,203 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Dict, Iterator, Optional + +from SYS.logger import debug + +try: + from playwright.sync_api import TimeoutError as PlaywrightTimeoutError + from playwright.sync_api import sync_playwright + + HAS_PLAYWRIGHT = True + _PLAYWRIGHT_IMPORT_ERROR: Optional[Exception] = None +except Exception as exc: # pragma: no cover + HAS_PLAYWRIGHT = False + _PLAYWRIGHT_IMPORT_ERROR = exc + PlaywrightTimeoutError = TimeoutError # type: ignore + sync_playwright = None # type: ignore + + +# Re-export for consumers (e.g. 
cmdlets catching navigation timeouts)
+__all__ = ["HAS_PLAYWRIGHT", "PlaywrightTimeoutError", "PlaywrightTool", "PlaywrightDefaults"]
+
+from contextlib import contextmanager
+
+
+def _get_nested(config: Dict[str, Any], *path: str) -> Any:
+    cur: Any = config
+    for key in path:
+        if not isinstance(cur, dict):
+            return None
+        cur = cur.get(key)
+    return cur
+
+
+@dataclass(slots=True)
+class PlaywrightDefaults:
+    browser: str = "chromium"  # chromium|firefox|webkit
+    headless: bool = True
+    user_agent: str = (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/120.0.0.0 Safari/537.36"
+    )
+    viewport_width: int = 1280
+    viewport_height: int = 1200
+    navigation_timeout_ms: int = 90_000
+    ignore_https_errors: bool = True
+
+
+class PlaywrightTool:
+    """Small wrapper to standardize Playwright defaults and lifecycle.
+
+    This is meant to keep cmdlets/providers from duplicating:
+    - sync_playwright start/stop
+    - browser launch/context creation
+    - user-agent/viewport defaults
+
+    Config overrides (top-level keys):
+    - playwright.browser="chromium"
+    - playwright.headless=true
+    - playwright.user_agent="..."
+    - playwright.viewport_width=1280
+    - playwright.viewport_height=1200
+    - playwright.navigation_timeout_ms=90000
+    - playwright.ignore_https_errors=true
+    """
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
+        self._config: Dict[str, Any] = dict(config or {})
+        self.defaults = self._load_defaults()
+
+    def _load_defaults(self) -> PlaywrightDefaults:
+        cfg = self._config
+        # With slots=True the dataclass does not keep its defaults as class
+        # attributes, so read fallback values from a baseline instance.
+        base = PlaywrightDefaults()
+        tool_block = _get_nested(cfg, "tool", "playwright")
+        if not isinstance(tool_block, dict):
+            tool_block = {}
+        pw_block = cfg.get("playwright") if isinstance(cfg.get("playwright"), dict) else {}
+        if not isinstance(pw_block, dict):
+            pw_block = {}
+
+        def _get(name: str, fallback: Any) -> Any:
+            val = tool_block.get(name)
+            if val is None:
+                val = pw_block.get(name)
+            if val is None:
+                val = cfg.get(f"playwright_{name}")
+            if val is None:
+                val = _get_nested(cfg, "playwright", name)
+            return fallback if val is None else val
+
+        browser = str(_get("browser", base.browser)).strip().lower() or "chromium"
+        if browser not in {"chromium", "firefox", "webkit"}:
+            browser = "chromium"
+
+        headless_raw = _get("headless", base.headless)
+        headless = bool(headless_raw)
+
+        ua = str(_get("user_agent", base.user_agent))
+
+        def _int(name: str, fallback: int) -> int:
+            raw = _get(name, fallback)
+            try:
+                return int(raw)
+            except Exception:
+                return fallback
+
+        vw = _int("viewport_width", base.viewport_width)
+        vh = _int("viewport_height", base.viewport_height)
+        nav_timeout = _int("navigation_timeout_ms", base.navigation_timeout_ms)
+
+        ignore_https = bool(_get("ignore_https_errors", base.ignore_https_errors))
+
+        return PlaywrightDefaults(
+            browser=browser,
+            headless=headless,
+            user_agent=ua,
+            viewport_width=vw,
+            viewport_height=vh,
+            navigation_timeout_ms=nav_timeout,
+            ignore_https_errors=ignore_https,
+        )
+
+    def require(self) -> None:
+        if HAS_PLAYWRIGHT and sync_playwright is not None:
+            return
+        detail = str(_PLAYWRIGHT_IMPORT_ERROR or "playwright is not installed")
+        raise RuntimeError(
+            "playwright is required; install with: pip install playwright; then: playwright install\n"
+            f"detail: {detail}"
+        )
+
+    @contextmanager
+    def open_page(
+        self,
+        *,
+        headless: Optional[bool] = None,
+        user_agent: Optional[str] = None,
+        viewport_width: Optional[int] = None,
+        viewport_height: Optional[int] = None,
ignore_https_errors: Optional[bool] = None, + ) -> Iterator[Any]: + """Context manager yielding a Playwright page with sane defaults.""" + self.require() + + h = self.defaults.headless if headless is None else bool(headless) + ua = self.defaults.user_agent if user_agent is None else str(user_agent) + vw = self.defaults.viewport_width if viewport_width is None else int(viewport_width) + vh = self.defaults.viewport_height if viewport_height is None else int(viewport_height) + ihe = self.defaults.ignore_https_errors if ignore_https_errors is None else bool(ignore_https_errors) + + pw = None + browser = None + context = None + try: + assert sync_playwright is not None + pw = sync_playwright().start() + + browser_type = getattr(pw, self.defaults.browser, None) + if browser_type is None: + browser_type = pw.chromium + + browser = browser_type.launch( + headless=h, + args=["--disable-blink-features=AutomationControlled"], + ) + context = browser.new_context( + user_agent=ua, + viewport={"width": vw, "height": vh}, + ignore_https_errors=ihe, + ) + page = context.new_page() + yield page + finally: + try: + if context is not None: + context.close() + except Exception: + pass + try: + if browser is not None: + browser.close() + except Exception: + pass + try: + if pw is not None: + pw.stop() + except Exception: + pass + + def goto(self, page: Any, url: str) -> None: + """Navigate with configured timeout.""" + try: + page.goto(url, timeout=int(self.defaults.navigation_timeout_ms), wait_until="domcontentloaded") + except Exception: + raise + + def debug_dump(self) -> None: + try: + debug( + f"[playwright] browser={self.defaults.browser} headless={self.defaults.headless} " + f"viewport={self.defaults.viewport_width}x{self.defaults.viewport_height} " + f"nav_timeout_ms={self.defaults.navigation_timeout_ms}" + ) + except Exception: + pass diff --git a/tool/ytdlp.py b/tool/ytdlp.py new file mode 100644 index 0000000..b774675 --- /dev/null +++ b/tool/ytdlp.py @@ -0,0 +1,195 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional, Sequence + +from SYS.logger import debug + + +def _get_nested(config: Dict[str, Any], *path: str) -> Any: + cur: Any = config + for key in path: + if not isinstance(cur, dict): + return None + cur = cur.get(key) + return cur + + +def _parse_csv_list(value: Any) -> Optional[List[str]]: + if value is None: + return None + if isinstance(value, list): + out: List[str] = [] + for item in value: + s = str(item).strip() + if s: + out.append(s) + return out or None + s = str(value).strip() + if not s: + return None + # allow either JSON-ish list strings or simple comma-separated values + if s.startswith("[") and s.endswith("]"): + s = s[1:-1] + parts = [p.strip() for p in s.split(",")] + parts = [p for p in parts if p] + return parts or None + + +@dataclass(slots=True) +class YtDlpDefaults: + """User-tunable defaults for yt-dlp behavior. + + Recommended config.conf keys (top-level dotted keys): + - ytdlp.video_format="bestvideo+bestaudio/best" + - ytdlp.audio_format="251/140/bestaudio" + - ytdlp.format_sort="res:2160,res:1440,res:1080,res:720,res" + + Cookies: + - cookies="C:\\path\\cookies.txt" (already supported by config.resolve_cookies_path) + """ + + video_format: str = "bestvideo+bestaudio/best" + audio_format: str = "251/140/bestaudio" + format_sort: Optional[List[str]] = None + + +class YtDlpTool: + """Centralizes yt-dlp defaults and translation helpers. 
+
+    This is intentionally small and dependency-light so cmdlets can use it without
+    forcing a full refactor.
+    """
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None, *, script_dir: Optional[Path] = None) -> None:
+        self._config: Dict[str, Any] = dict(config or {})
+        # `resolve_cookies_path` expects the app root so it can fall back to ./cookies.txt.
+        # This file lives under ./tool/, so default to the parent directory.
+        self._script_dir = script_dir or Path(__file__).resolve().parent.parent
+        self.defaults = self._load_defaults()
+        self._cookiefile: Optional[Path] = self._init_cookiefile()
+
+    def _init_cookiefile(self) -> Optional[Path]:
+        """Resolve cookies once at tool init (yt-dlp is the primary consumer)."""
+        try:
+            from config import resolve_cookies_path
+
+            resolved = resolve_cookies_path(self._config, script_dir=self._script_dir)
+            if resolved is not None and resolved.is_file():
+                return resolved
+        except Exception:
+            pass
+        return None
+
+    def _load_defaults(self) -> YtDlpDefaults:
+        cfg = self._config
+
+        tool_block = _get_nested(cfg, "tool", "ytdlp")
+        if not isinstance(tool_block, dict):
+            tool_block = {}
+
+        ytdlp_block = cfg.get("ytdlp") if isinstance(cfg.get("ytdlp"), dict) else {}
+        if not isinstance(ytdlp_block, dict):
+            ytdlp_block = {}
+
+        # Accept both nested and flat styles.
+        video_format = (
+            tool_block.get("video_format")
+            or tool_block.get("format")
+            or ytdlp_block.get("video_format")
+            or ytdlp_block.get("video")
+            or ytdlp_block.get("format_video")
+            or cfg.get("ytdlp_video_format")
+        )
+        audio_format = (
+            tool_block.get("audio_format")
+            or ytdlp_block.get("audio_format")
+            or ytdlp_block.get("audio")
+            or ytdlp_block.get("format_audio")
+            or cfg.get("ytdlp_audio_format")
+        )
+
+        # Also accept dotted keys written as nested dicts: ytdlp.format.video, ytdlp.format.audio
+        nested_video = _get_nested(cfg, "ytdlp", "format", "video")
+        nested_audio = _get_nested(cfg, "ytdlp", "format", "audio")
+
+        fmt_sort_val = (
+            tool_block.get("format_sort")
+            or ytdlp_block.get("format_sort")
+            or ytdlp_block.get("formatSort")
+            or cfg.get("ytdlp_format_sort")
+            or _get_nested(cfg, "ytdlp", "format", "sort")
+        )
+        fmt_sort = _parse_csv_list(fmt_sort_val)
+
+        # YtDlpDefaults uses slots=True, so its defaults are not readable as class
+        # attributes; take the fallback values from a baseline instance instead.
+        base = YtDlpDefaults()
+        defaults = YtDlpDefaults(
+            video_format=str(nested_video or video_format or base.video_format),
+            audio_format=str(nested_audio or audio_format or base.audio_format),
+            format_sort=fmt_sort,
+        )
+
+        return defaults
+
+    def resolve_cookiefile(self) -> Optional[Path]:
+        return self._cookiefile
+
+    def default_format(self, mode: str) -> str:
+        m = str(mode or "").lower().strip()
+        if m == "audio":
+            return self.defaults.audio_format
+        return self.defaults.video_format
+
+    def build_yt_dlp_cli_args(
+        self,
+        *,
+        url: str,
+        output_dir: Optional[Path] = None,
+        ytdl_format: Optional[str] = None,
+        playlist_items: Optional[str] = None,
+        no_playlist: bool = False,
+        quiet: bool = True,
+        extra_args: Optional[Sequence[str]] = None,
+    ) -> List[str]:
+        """Build a yt-dlp command line (argv list).
+
+        This is primarily for debug output or subprocess execution.
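+
+        Example (illustrative): for url="https://example.com/v" with a resolved
+        cookies file, no_playlist=True and ytdl_format="bestaudio", the result is
+        roughly:
+
+            ["yt-dlp", "--quiet", "--no-warnings", "--no-progress",
+             "--cookies", "cookies.txt", "--no-playlist",
+             "--format", "bestaudio", "https://example.com/v"]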
+ """ + argv: List[str] = ["yt-dlp"] + if quiet: + argv.extend(["--quiet", "--no-warnings"]) + argv.append("--no-progress") + + cookiefile = self.resolve_cookiefile() + if cookiefile is not None: + argv.extend(["--cookies", str(cookiefile)]) + + if no_playlist: + argv.append("--no-playlist") + if playlist_items: + argv.extend(["--playlist-items", str(playlist_items)]) + + fmt = (ytdl_format or "").strip() + if fmt: + # Use long form to avoid confusion with app-level flags. + argv.extend(["--format", fmt]) + + if self.defaults.format_sort: + for sort_key in self.defaults.format_sort: + argv.extend(["-S", sort_key]) + + if output_dir is not None: + outtmpl = str((output_dir / "%(title)s.%(ext)s").resolve()) + argv.extend(["-o", outtmpl]) + + if extra_args: + argv.extend([str(a) for a in extra_args if str(a).strip()]) + + argv.append(str(url)) + return argv + + def debug_print_cli(self, argv: Sequence[str]) -> None: + try: + debug("yt-dlp argv: " + " ".join(str(a) for a in argv)) + except Exception: + pass
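+
+
+if __name__ == "__main__":  # pragma: no cover
+    # Illustrative sketch only, not part of the cmdlet flow: shows how a
+    # [tool=ytdlp] config block feeds the CLI builder. The config values and
+    # URL below are made-up examples, not project defaults.
+    _example_config = {
+        "tool": {
+            "ytdlp": {
+                "video_format": "bestvideo+bestaudio/best",
+                "format_sort": "res:1080,res",
+            }
+        }
+    }
+    _tool = YtDlpTool(_example_config)
+    _argv = _tool.build_yt_dlp_cli_args(
+        url="https://example.com/watch?v=example",
+        ytdl_format=_tool.default_format("video"),
+        no_playlist=True,
+        quiet=True,
+    )
+    _tool.debug_print_cli(_argv)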