From 1f765cffda4036ce5bc5796bc091b579b26b4f00 Mon Sep 17 00:00:00 2001 From: Nose Date: Mon, 5 Jan 2026 07:51:19 -0800 Subject: [PATCH] df --- API/HTTP.py | 369 ++++- API/HydrusNetwork.py | 744 ++++++++- SYS/cmdlet_api.py => API/cmdlet.py | 0 CLI.py | 27 +- MPV/pipeline_helper.py | 2 +- MPV/portable_config/mpv.conf | 18 +- MPV/portable_config/script-opts/uosc.conf | 2 +- Provider/HIFI.py | 345 +++- Provider/alldebrid.py | 7 +- Provider/hello_provider.py | 192 +++ Provider/libgen.py | 3 + Provider/podcastindex.py | 2 +- Provider/torrent.py | 442 +++++ Provider/vimm.py | 185 +++ ProviderCore/base.py | 12 +- ProviderCore/registry.py | 378 +++-- SYS/download.py | 1116 ------------- SYS/metadata.py | 1819 ++------------------- SYS/tasks.py | 234 --- Store/registry.py | 60 +- cmdlet/_shared.py | 37 +- cmdlet/add_file.py | 34 +- cmdlet/convert_file.py | 289 ++++ cmdlet/download_file.py | 111 +- cmdlet/get_tag.py | 2 +- cmdlet/search_file.py | 2 +- cmdnat/matrix.py | 2 +- docs/provider_guide.md | 165 ++ scripts/requirements.txt | 2 + tmp_trim_registry.py | 10 + tmp_write_registry.py | 3 + tool/ytdlp.py | 83 +- 32 files changed, 3447 insertions(+), 3250 deletions(-) rename SYS/cmdlet_api.py => API/cmdlet.py (100%) create mode 100644 Provider/hello_provider.py create mode 100644 Provider/torrent.py create mode 100644 Provider/vimm.py delete mode 100644 SYS/download.py delete mode 100644 SYS/tasks.py create mode 100644 cmdlet/convert_file.py create mode 100644 docs/provider_guide.md create mode 100644 tmp_trim_registry.py create mode 100644 tmp_write_registry.py diff --git a/API/HTTP.py b/API/HTTP.py index a6a8f1c..dcf1faf 100644 --- a/API/HTTP.py +++ b/API/HTTP.py @@ -10,10 +10,24 @@ Provides synchronous and asynchronous HTTP operations with: import httpx import asyncio -from typing import Optional, Dict, Any, Callable, BinaryIO +import sys +import time +import traceback +import re +from typing import Optional, Dict, Any, Callable, BinaryIO, List, Iterable, Set from pathlib import Path +from urllib.parse import unquote, urlparse, parse_qs import logging +from SYS.logger import debug, log +from SYS.models import DebugLogger, DownloadError, DownloadMediaResult, ProgressBar +from SYS.utils import ensure_directory, sha256_file + +try: # Optional; used for metadata extraction when available + from SYS.metadata import extract_ytdlp_tags +except Exception: # pragma: no cover - optional dependency + extract_ytdlp_tags = None # type: ignore[assignment] + logger = logging.getLogger(__name__) # Default configuration @@ -366,6 +380,359 @@ class HTTPClient: return self._client.stream(method, url, **kwargs) +def download_direct_file( + url: str, + output_dir: Path, + debug_logger: Optional[DebugLogger] = None, + quiet: bool = False, + suggested_filename: Optional[str] = None, + pipeline_progress: Optional[Any] = None, +) -> DownloadMediaResult: + """Download a direct file (PDF, image, document, etc.) with guardrails and metadata hooks.""" + + ensure_directory(output_dir) + + def _sanitize_filename(name: str) -> str: + # Windows-safe filename sanitization. 
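+        # Strip any path components, replace control characters and the
+        # reserved set <>:"/\|?* with spaces, collapse runs of whitespace,
+        # and trim the trailing dots/spaces that Windows rejects in names.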
+ text = str(name or "").strip() + if not text: + return "" + text = text.replace("/", "\\") + text = text.split("\\")[-1] + + invalid = set('<>:"/\\|?*') + cleaned_chars: List[str] = [] + for ch in text: + o = ord(ch) + if o < 32 or ch in invalid: + cleaned_chars.append(" ") + continue + cleaned_chars.append(ch) + cleaned = " ".join("".join(cleaned_chars).split()).strip() + cleaned = cleaned.rstrip(" .") + return cleaned + + def _unique_path(path: Path) -> Path: + if not path.exists(): + return path + stem = path.stem + suffix = path.suffix + parent = path.parent + for i in range(1, 10_000): + candidate = parent / f"{stem} ({i}){suffix}" + if not candidate.exists(): + return candidate + return parent / f"{stem} ({int(time.time())}){suffix}" + + parsed_url = urlparse(url) + url_path = parsed_url.path + + filename: Optional[str] = None + if parsed_url.query: + query_params = parse_qs(parsed_url.query) + for param_name in ("filename", "download", "file", "name"): + if param_name in query_params and query_params[param_name]: + filename = query_params[param_name][0] + filename = unquote(filename) + break + + if not filename or not filename.strip(): + filename = url_path.split("/")[-1] if url_path else "" + filename = unquote(filename) + + if "?" in filename: + filename = filename.split("?")[0] + + content_type = "" + try: + with HTTPClient(timeout=10.0) as client: + response = client._request("HEAD", url, follow_redirects=True) + content_disposition = response.headers.get("content-disposition", "") + try: + content_type = str(response.headers.get("content-type", "") or "").strip().lower() + except Exception: + content_type = "" + + if content_disposition: + match = re.search(r'filename\*?=(?:"([^"]*)"|([^;\s]*))', content_disposition) + if match: + extracted_name = match.group(1) or match.group(2) + if extracted_name: + filename = unquote(extracted_name) + if not quiet: + debug(f"Filename from Content-Disposition: {filename}") + except Exception as exc: + if not quiet: + log(f"Could not get filename from headers: {exc}", file=sys.stderr) + + try: + page_like_exts = {".php", ".asp", ".aspx", ".jsp", ".cgi"} + ext = "" + try: + ext = Path(str(filename or "")).suffix.lower() + except Exception: + ext = "" + + ct0 = (content_type or "").split(";", 1)[0].strip().lower() + must_probe = bool(ct0.startswith("text/html") or ext in page_like_exts) + + if must_probe: + with HTTPClient(timeout=10.0) as client: + with client._request_stream("GET", url, follow_redirects=True) as resp: + resp.raise_for_status() + ct = ( + str(resp.headers.get("content-type", "") or "") + .split(";", 1)[0] + .strip() + .lower() + ) + if ct.startswith("text/html"): + raise DownloadError("URL appears to be an HTML page, not a direct file") + except DownloadError: + raise + except Exception: + pass + + suggested = _sanitize_filename(suggested_filename) if suggested_filename else "" + if suggested: + suggested_path = Path(suggested) + if suggested_path.suffix: + filename = suggested + else: + detected_ext = "" + try: + detected_ext = Path(str(filename)).suffix + except Exception: + detected_ext = "" + filename = suggested + detected_ext if detected_ext else suggested + + try: + has_ext = bool(filename and Path(str(filename)).suffix) + except Exception: + has_ext = False + + if filename and (not has_ext): + ct = (content_type or "").split(";", 1)[0].strip().lower() + ext_by_ct = { + "application/pdf": ".pdf", + "application/epub+zip": ".epub", + "application/x-mobipocket-ebook": ".mobi", + "image/jpeg": ".jpg", + "image/png": 
".png", + "image/webp": ".webp", + "image/gif": ".gif", + "text/plain": ".txt", + "application/zip": ".zip", + } + + if ct in ext_by_ct: + filename = f"{filename}{ext_by_ct[ct]}" + elif ct.startswith("text/html"): + raise DownloadError("URL appears to be an HTML page, not a direct file") + + if not filename or not str(filename).strip(): + raise DownloadError( + "Could not determine filename for URL (no Content-Disposition and no path filename)" + ) + + file_path = _unique_path(output_dir / str(filename)) + + use_pipeline_transfer = False + try: + if pipeline_progress is not None and hasattr(pipeline_progress, "update_transfer"): + ui = None + if hasattr(pipeline_progress, "ui_and_pipe_index"): + ui, _ = pipeline_progress.ui_and_pipe_index() # type: ignore[attr-defined] + use_pipeline_transfer = ui is not None + except Exception: + use_pipeline_transfer = False + + progress_bar: Optional[ProgressBar] = None + if (not quiet) and (not use_pipeline_transfer): + progress_bar = ProgressBar() + + transfer_started = [False] + + if not quiet: + debug(f"Direct download: {filename}") + + try: + start_time = time.time() + downloaded_bytes = [0] + transfer_started[0] = False + + def _maybe_begin_transfer(content_length: int) -> None: + if pipeline_progress is None or transfer_started[0]: + return + try: + total_val: Optional[int] = ( + int(content_length) + if isinstance(content_length, int) and content_length > 0 + else None + ) + except Exception: + total_val = None + try: + if hasattr(pipeline_progress, "begin_transfer"): + pipeline_progress.begin_transfer( + label=str(filename or "download"), + total=total_val, + ) + transfer_started[0] = True + except Exception: + return + + def progress_callback(bytes_downloaded: int, content_length: int) -> None: + downloaded_bytes[0] = int(bytes_downloaded or 0) + + try: + if pipeline_progress is not None and hasattr(pipeline_progress, "update_transfer"): + _maybe_begin_transfer(content_length) + total_val: Optional[int] = ( + int(content_length) + if isinstance(content_length, int) and content_length > 0 + else None + ) + pipeline_progress.update_transfer( + label=str(filename or "download"), + completed=int(bytes_downloaded or 0), + total=total_val, + ) + except Exception: + pass + + if progress_bar is not None: + progress_bar.update( + downloaded=int(bytes_downloaded or 0), + total=int(content_length) if content_length and content_length > 0 else None, + label=str(filename or "download"), + file=sys.stderr, + ) + + with HTTPClient(timeout=30.0) as client: + client.download(url, str(file_path), progress_callback=progress_callback) + + elapsed = time.time() - start_time + + try: + if progress_bar is not None: + progress_bar.finish() + except Exception: + pass + + try: + if pipeline_progress is not None and transfer_started[0] and hasattr( + pipeline_progress, "finish_transfer" + ): + pipeline_progress.finish_transfer(label=str(filename or "download")) + except Exception: + pass + + if not quiet: + debug(f"✓ Downloaded in {elapsed:.1f}s") + + ext_out = "" + try: + ext_out = Path(str(filename)).suffix.lstrip(".") + except Exception: + ext_out = "" + + info: Dict[str, Any] = { + "id": str(filename).rsplit(".", 1)[0] if "." 
in str(filename) else str(filename), + "ext": ext_out, + "webpage_url": url, + } + + hash_value = None + try: + hash_value = sha256_file(file_path) + except Exception: + pass + + tags: List[str] = [] + if extract_ytdlp_tags: + try: + tags = extract_ytdlp_tags(info) + except Exception as exc: + log(f"Error extracting tags: {exc}", file=sys.stderr) + + if not any(str(t).startswith("title:") for t in tags): + info["title"] = str(filename) + tags = [] + if extract_ytdlp_tags: + try: + tags = extract_ytdlp_tags(info) + except Exception as exc: + log(f"Error extracting tags with filename: {exc}", file=sys.stderr) + + if debug_logger is not None: + debug_logger.write_record( + "direct-file-downloaded", + {"url": url, "path": str(file_path), "hash": hash_value}, + ) + + return DownloadMediaResult( + path=file_path, + info=info, + tag=tags, + source_url=url, + hash_value=hash_value, + ) + + except (httpx.HTTPError, httpx.RequestError) as exc: + try: + if progress_bar is not None: + progress_bar.finish() + except Exception: + pass + try: + if pipeline_progress is not None and transfer_started[0] and hasattr( + pipeline_progress, "finish_transfer" + ): + pipeline_progress.finish_transfer(label=str(filename or "download")) + except Exception: + pass + + log(f"Download error: {exc}", file=sys.stderr) + if debug_logger is not None: + debug_logger.write_record( + "exception", + {"phase": "direct-file", "url": url, "error": str(exc)}, + ) + raise DownloadError(f"Failed to download {url}: {exc}") from exc + + except Exception as exc: + try: + if progress_bar is not None: + progress_bar.finish() + except Exception: + pass + try: + if pipeline_progress is not None and transfer_started[0] and hasattr( + pipeline_progress, "finish_transfer" + ): + pipeline_progress.finish_transfer(label=str(filename or "download")) + except Exception: + pass + + log(f"Error downloading file: {exc}", file=sys.stderr) + if debug_logger is not None: + debug_logger.write_record( + "exception", + { + "phase": "direct-file", + "url": url, + "error": str(exc), + "traceback": traceback.format_exc(), + }, + ) + raise DownloadError(f"Error downloading file: {exc}") from exc + + +# Back-compat alias +_download_direct_file = download_direct_file + + class AsyncHTTPClient: """Unified async HTTP client with asyncio support.""" diff --git a/API/HydrusNetwork.py b/API/HydrusNetwork.py index f8d0a44..7165c94 100644 --- a/API/HydrusNetwork.py +++ b/API/HydrusNetwork.py @@ -11,6 +11,7 @@ import shutil import subprocess import sys import time +from collections import deque from SYS.logger import log from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS as GLOBAL_SUPPORTED_EXTENSIONS @@ -18,8 +19,8 @@ import tempfile import logging from dataclasses import dataclass, field from pathlib import Path -from typing import Any, Iterable, Optional, Sequence, Type, TypeVar, Union, cast -from urllib.parse import urlsplit, urlencode, quote +from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Type, TypeVar, Union, cast +from urllib.parse import urlsplit, urlencode, quote, urlunsplit, unquote import httpx logger = logging.getLogger(__name__) @@ -1828,3 +1829,742 @@ def download_hydrus_file( print_final_progress(filename, file_size, elapsed) return downloaded + + +# ============================================================================ +# Hydrus metadata helpers (moved from SYS.metadata) +# ============================================================================ + + +def _normalize_hash(value: Any) -> str: + candidate = 
str(value or "").strip().lower() + if not candidate: + raise ValueError("Hydrus hash is required") + if len(candidate) != 64 or any(ch not in "0123456789abcdef" for ch in candidate): + raise ValueError("Hydrus hash must be a 64-character hex string") + return candidate + + +def _normalize_tag(tag: Any) -> Optional[str]: + if tag is None: + return None + if isinstance(tag, str): + candidate = tag.strip() + else: + candidate = str(tag).strip() + return candidate or None + + +def _dedup_tags_by_namespace(tags: List[str], keep_first: bool = True) -> List[str]: + if not tags: + return [] + + namespace_to_tags: Dict[Optional[str], List[Tuple[int, str]]] = {} + first_appearance: Dict[Optional[str], int] = {} + + for idx, tag in enumerate(tags): + namespace: Optional[str] = tag.split(":", 1)[0] if ":" in tag else None + if namespace not in first_appearance: + first_appearance[namespace] = idx + if namespace not in namespace_to_tags: + namespace_to_tags[namespace] = [] + namespace_to_tags[namespace].append((idx, tag)) + + result: List[Tuple[int, str]] = [] + for namespace, tag_list in namespace_to_tags.items(): + chosen_tag = tag_list[0][1] if keep_first else tag_list[-1][1] + result.append((first_appearance[namespace], chosen_tag)) + + result.sort(key=lambda x: x[0]) + return [tag for _, tag in result] + + +def _extract_tag_services(entry: Dict[str, Any]) -> List[Dict[str, Any]]: + tags_section = entry.get("tags") + services: List[Dict[str, Any]] = [] + if not isinstance(tags_section, dict): + return services + names_map = tags_section.get("service_keys_to_names") + if not isinstance(names_map, dict): + names_map = {} + + def get_record(service_key: Optional[str], service_name: Optional[str]) -> Dict[str, Any]: + key_lower = service_key.lower() if isinstance(service_key, str) else None + name_lower = service_name.lower() if isinstance(service_name, str) else None + for record in services: + existing_key = record.get("service_key") + if key_lower and isinstance(existing_key, str) and existing_key.lower() == key_lower: + if service_name and not record.get("service_name"): + record["service_name"] = service_name + return record + existing_name = record.get("service_name") + if name_lower and isinstance(existing_name, str) and existing_name.lower() == name_lower: + if service_key and not record.get("service_key"): + record["service_key"] = service_key + return record + record = { + "service_key": service_key, + "service_name": service_name, + "tags": [], + } + services.append(record) + return record + + def _iter_current_status_lists(container: Any) -> Iterable[List[Any]]: + if isinstance(container, dict): + for status_key, tags_list in container.items(): + if str(status_key) != "0": + continue + if isinstance(tags_list, list): + yield tags_list + elif isinstance(container, list): + yield container + + statuses_map = tags_section.get("service_keys_to_statuses_to_tags") + if isinstance(statuses_map, dict): + for service_key, status_map in statuses_map.items(): + record = get_record(service_key if isinstance(service_key, str) else None, names_map.get(service_key)) + for tags_list in _iter_current_status_lists(status_map): + for tag in tags_list: + normalized = _normalize_tag(tag) + if normalized: + record["tags"].append(normalized) + + ignored_keys = { + "service_keys_to_statuses_to_tags", + "service_keys_to_statuses_to_display_tags", + "service_keys_to_display_friendly_tags", + "service_keys_to_names", + "tag_display_types_to_namespaces", + "namespace_display_string_lookup", + 
"tag_display_decoration_colour_lookup", + } + + for key, service in tags_section.items(): + if key in ignored_keys: + continue + if isinstance(service, dict): + service_key = service.get("service_key") or (key if isinstance(key, str) else None) + service_name = service.get("service_name") or service.get("name") or names_map.get(service_key) + record = get_record(service_key if isinstance(service_key, str) else None, service_name) + storage = service.get("storage_tags") or service.get("statuses_to_tags") or service.get("tags") + if isinstance(storage, dict): + for tags_list in _iter_current_status_lists(storage): + for tag in tags_list: + normalized = _normalize_tag(tag) + if normalized: + record["tags"].append(normalized) + elif isinstance(storage, list): + for tag in storage: + normalized = _normalize_tag(tag) + if normalized: + record["tags"].append(normalized) + + for record in services: + record["tags"] = _dedup_tags_by_namespace(record["tags"], keep_first=True) + return services + + +def _select_primary_tags( + services: List[Dict[str, Any]], + aggregated: List[str], + prefer_service: Optional[str] +) -> Tuple[Optional[str], List[str]]: + prefer_lower = prefer_service.lower() if isinstance(prefer_service, str) else None + if prefer_lower: + for record in services: + name = record.get("service_name") + if isinstance(name, str) and name.lower() == prefer_lower and record["tags"]: + return record.get("service_key"), record["tags"] + for record in services: + if record["tags"]: + return record.get("service_key"), record["tags"] + return None, aggregated + + +def _derive_title( + tags_primary: List[str], + tags_aggregated: List[str], + entry: Dict[str, Any] +) -> Optional[str]: + for source in (tags_primary, tags_aggregated): + for tag in source: + namespace, sep, value = tag.partition(":") + if sep and namespace and namespace.lower() == "title": + cleaned = value.strip() + if cleaned: + return cleaned + for key in ( + "title", + "display_name", + "pretty_name", + "original_display_filename", + "original_filename", + ): + value = entry.get(key) + if isinstance(value, str): + cleaned = value.strip() + if cleaned: + return cleaned + return None + + +def _derive_clip_time( + tags_primary: List[str], + tags_aggregated: List[str], + entry: Dict[str, Any] +) -> Optional[str]: + namespaces = {"clip", "clip_time", "cliptime"} + for source in (tags_primary, tags_aggregated): + for tag in source: + namespace, sep, value = tag.partition(":") + if sep and namespace and namespace.lower() in namespaces: + cleaned = value.strip() + if cleaned: + return cleaned + clip_value = entry.get("clip_time") + if isinstance(clip_value, str): + cleaned_clip = clip_value.strip() + if cleaned_clip: + return cleaned_clip + return None + + +def _summarize_hydrus_entry( + entry: Dict[str, Any], + prefer_service: Optional[str] +) -> Tuple[Dict[str, Any], List[str], Optional[str], Optional[str], Optional[str]]: + services = _extract_tag_services(entry) + aggregated: List[str] = [] + seen: Set[str] = set() + for record in services: + for tag in record["tags"]: + if tag not in seen: + seen.add(tag) + aggregated.append(tag) + service_key, primary_tags = _select_primary_tags(services, aggregated, prefer_service) + title = _derive_title(primary_tags, aggregated, entry) + clip_time = _derive_clip_time(primary_tags, aggregated, entry) + summary = dict(entry) + if title and not summary.get("title"): + summary["title"] = title + if clip_time and not summary.get("clip_time"): + summary["clip_time"] = clip_time + 
summary["tag_service_key"] = service_key + summary["has_current_file_service"] = _has_current_file_service(entry) + if "is_local" not in summary: + summary["is_local"] = bool(entry.get("is_local")) + return summary, primary_tags, service_key, title, clip_time + + +def _looks_like_hash(value: Any) -> bool: + if not isinstance(value, str): + return False + candidate = value.strip().lower() + return len(candidate) == 64 and all(ch in "0123456789abcdef" for ch in candidate) + + +def _collect_relationship_hashes(payload: Any, accumulator: Set[str]) -> None: + if isinstance(payload, dict): + for value in payload.values(): + _collect_relationship_hashes(value, accumulator) + elif isinstance(payload, (list, tuple, set)): + for value in payload: + _collect_relationship_hashes(value, accumulator) + elif isinstance(payload, str) and _looks_like_hash(payload): + accumulator.add(payload) + + +def _generate_hydrus_url_variants(url: str) -> List[str]: + seen: Set[str] = set() + variants: List[str] = [] + + def push(candidate: Optional[str]) -> None: + if not candidate: + return + text = candidate.strip() + if not text or text in seen: + return + seen.add(text) + variants.append(text) + + push(url) + try: + parsed = urlsplit(url) + except Exception: + return variants + + if parsed.scheme in {"http", "https"}: + alternate_scheme = "https" if parsed.scheme == "http" else "http" + push(urlunsplit((alternate_scheme, parsed.netloc, parsed.path, parsed.query, parsed.fragment))) + + normalised_netloc = parsed.netloc.lower() + if normalised_netloc and normalised_netloc != parsed.netloc: + push(urlunsplit((parsed.scheme, normalised_netloc, parsed.path, parsed.query, parsed.fragment))) + + if parsed.path: + trimmed_path = parsed.path.rstrip("/") + if trimmed_path != parsed.path: + push(urlunsplit((parsed.scheme, parsed.netloc, trimmed_path, parsed.query, parsed.fragment))) + else: + push(urlunsplit((parsed.scheme, parsed.netloc, parsed.path + "/", parsed.query, parsed.fragment))) + unquoted_path = unquote(parsed.path) + if unquoted_path != parsed.path: + push(urlunsplit((parsed.scheme, parsed.netloc, unquoted_path, parsed.query, parsed.fragment))) + + if parsed.query or parsed.fragment: + push(urlunsplit((parsed.scheme, parsed.netloc, parsed.path, "", ""))) + if parsed.path: + unquoted_path = unquote(parsed.path) + push(urlunsplit((parsed.scheme, parsed.netloc, unquoted_path, "", ""))) + + return variants + + +def _build_hydrus_query( + hashes: Optional[Sequence[str]], + file_ids: Optional[Sequence[int]], + include_relationships: bool, + minimal: bool, +) -> Dict[str, str]: + query: Dict[str, str] = {} + if hashes: + query["hashes"] = json.dumps([_normalize_hash(h) for h in hashes]) + if file_ids: + query["file_ids"] = json.dumps([int(fid) for fid in file_ids]) + if not query: + raise ValueError("hashes or file_ids must be provided") + query["include_service_keys_to_tags"] = json.dumps(True) + query["include_tag_services"] = json.dumps(True) + query["include_file_services"] = json.dumps(True) + if include_relationships: + query["include_file_relationships"] = json.dumps(True) + if not minimal: + extras = ( + "include_url", + "include_size", + "include_width", + "include_height", + "include_duration", + "include_mime", + "include_has_audio", + "include_is_trashed", + ) + for key in extras: + query[key] = json.dumps(True) + return query + + +def _fetch_hydrus_entries( + client: "HydrusNetwork", + hashes: Optional[Sequence[str]], + file_ids: Optional[Sequence[int]], + include_relationships: bool, + minimal: bool, +) 
-> List[Dict[str, Any]]: + if not hashes and not file_ids: + return [] + spec = HydrusRequestSpec( + method="GET", + endpoint="/get_files/file_metadata", + query=_build_hydrus_query(hashes, file_ids, include_relationships, minimal), + ) + response = client._perform_request(spec) + metadata = response.get("metadata") if isinstance(response, dict) else None + if isinstance(metadata, list): + return [entry for entry in metadata if isinstance(entry, dict)] + return [] + + +def _has_current_file_service(entry: Dict[str, Any]) -> bool: + services = entry.get("file_services") + if not isinstance(services, dict): + return False + current = services.get("current") + if isinstance(current, dict): + for value in current.values(): + if value: + return True + return False + if isinstance(current, list): + return len(current) > 0 + return False + + +def _compute_file_flags(entry: Dict[str, Any]) -> Tuple[bool, bool, bool]: + mime = entry.get("mime") + mime_lower = mime.lower() if isinstance(mime, str) else "" + is_video = mime_lower.startswith("video/") + is_audio = mime_lower.startswith("audio/") + is_deleted = bool(entry.get("is_trashed")) + file_services = entry.get("file_services") + if not is_deleted and isinstance(file_services, dict): + deleted = file_services.get("deleted") + if isinstance(deleted, dict) and deleted: + is_deleted = True + return is_video, is_audio, is_deleted + + +def fetch_hydrus_metadata(payload: Dict[str, Any]) -> Dict[str, Any]: + hash_hex = None + raw_hash_value = payload.get("hash") + if raw_hash_value is not None: + hash_hex = _normalize_hash(raw_hash_value) + file_ids: List[int] = [] + raw_file_ids = payload.get("file_ids") + if isinstance(raw_file_ids, (list, tuple, set)): + for value in raw_file_ids: + try: + file_ids.append(int(value)) + except (TypeError, ValueError): + continue + elif raw_file_ids is not None: + try: + file_ids.append(int(raw_file_ids)) + except (TypeError, ValueError): + file_ids = [] + raw_file_id = payload.get("file_id") + if raw_file_id is not None: + try: + coerced = int(raw_file_id) + except (TypeError, ValueError): + coerced = None + if coerced is not None and coerced not in file_ids: + file_ids.append(coerced) + base_url = str(payload.get("api_url") or "").strip() + if not base_url: + raise ValueError("Hydrus api_url is required") + access_key = str(payload.get("access_key") or "").strip() + options_raw = payload.get("options") + options = options_raw if isinstance(options_raw, dict) else {} + prefer_service = options.get("prefer_service_name") + if isinstance(prefer_service, str): + prefer_service = prefer_service.strip() + else: + prefer_service = None + include_relationships = bool(options.get("include_relationships")) + minimal = bool(options.get("minimal")) + timeout = float(options.get("timeout") or 60.0) + client = HydrusNetwork(base_url, access_key, timeout) + hashes: Optional[List[str]] = None + if hash_hex: + hashes = [hash_hex] + if not hashes and not file_ids: + raise ValueError("Hydrus hash or file id is required") + try: + entries = _fetch_hydrus_entries( + client, + hashes, + file_ids or None, + include_relationships, + minimal + ) + except HydrusRequestError as exc: + raise RuntimeError(str(exc)) + if not entries: + response: Dict[str, Any] = { + "hash": hash_hex, + "metadata": {}, + "tags": [], + "warnings": [f"No Hydrus metadata for {hash_hex or file_ids}"], + "error": "not_found", + } + if file_ids: + response["file_id"] = file_ids[0] + return response + entry = entries[0] + if not hash_hex: + entry_hash = 
entry.get("hash") + if isinstance(entry_hash, str) and entry_hash: + hash_hex = entry_hash + hashes = [hash_hex] + summary, primary_tags, service_key, title, clip_time = _summarize_hydrus_entry(entry, prefer_service) + is_video, is_audio, is_deleted = _compute_file_flags(entry) + has_current_file_service = _has_current_file_service(entry) + is_local = bool(entry.get("is_local")) + size_bytes = entry.get("size") or entry.get("file_size") + filesize_mb = None + if isinstance(size_bytes, (int, float)) and size_bytes > 0: + filesize_mb = float(size_bytes) / (1024.0 * 1024.0) + duration = entry.get("duration") + if duration is None and isinstance(entry.get("duration_ms"), (int, float)): + duration = float(entry["duration_ms"]) / 1000.0 + warnings_list: List[str] = [] + if not primary_tags: + warnings_list.append("No tags returned for preferred service") + relationships = None + relationship_metadata: Dict[str, Dict[str, Any]] = {} + if include_relationships and hash_hex: + try: + rel_spec = HydrusRequestSpec( + method="GET", + endpoint="/manage_file_relationships/get_file_relationships", + query={"hash": hash_hex}, + ) + relationships = client._perform_request(rel_spec) + except HydrusRequestError as exc: + warnings_list.append(f"Relationship lookup failed: {exc}") + relationships = None + if isinstance(relationships, dict): + related_hashes: Set[str] = set() + _collect_relationship_hashes(relationships, related_hashes) + related_hashes.discard(hash_hex) + if related_hashes: + try: + related_entries = _fetch_hydrus_entries( + client, + sorted(related_hashes), + None, + False, + True + ) + except HydrusRequestError as exc: + warnings_list.append(f"Relationship metadata fetch failed: {exc}") + else: + for rel_entry in related_entries: + rel_hash = rel_entry.get("hash") + if not isinstance(rel_hash, str): + continue + rel_summary, rel_tags, _, rel_title, rel_clip = _summarize_hydrus_entry(rel_entry, prefer_service) + rel_summary["tags"] = rel_tags + if rel_title: + rel_summary["title"] = rel_title + if rel_clip: + rel_summary["clip_time"] = rel_clip + relationship_metadata[rel_hash] = rel_summary + result: Dict[str, Any] = { + "hash": entry.get("hash") or hash_hex, + "metadata": summary, + "tags": primary_tags, + "tag_service_key": service_key, + "title": title, + "clip_time": clip_time, + "duration": duration, + "filesize_mb": filesize_mb, + "is_video": is_video, + "is_audio": is_audio, + "is_deleted": is_deleted, + "is_local": is_local, + "has_current_file_service": has_current_file_service, + "matched_hash": entry.get("hash") or hash_hex, + "swap_recommended": False, + } + file_id_value = entry.get("file_id") + if isinstance(file_id_value, (int, float)): + result["file_id"] = int(file_id_value) + if relationships is not None: + result["relationships"] = relationships + if relationship_metadata: + result["relationship_metadata"] = relationship_metadata + if warnings_list: + result["warnings"] = warnings_list + return result + + +def fetch_hydrus_metadata_by_url(payload: Dict[str, Any]) -> Dict[str, Any]: + raw_url = payload.get("url") or payload.get("source_url") + url = str(raw_url or "").strip() + if not url: + raise ValueError("URL is required to fetch Hydrus metadata by URL") + base_url = str(payload.get("api_url") or "").strip() + if not base_url: + raise ValueError("Hydrus api_url is required") + access_key = str(payload.get("access_key") or "").strip() + options_raw = payload.get("options") + options = options_raw if isinstance(options_raw, dict) else {} + timeout = 
float(options.get("timeout") or 60.0) + client = HydrusNetwork(base_url, access_key, timeout) + hashes: Optional[List[str]] = None + file_ids: Optional[List[int]] = None + matched_url = None + normalised_reported = None + seen: Set[str] = set() + queue = deque() + for variant in _generate_hydrus_url_variants(url): + queue.append(variant) + if not queue: + queue.append(url) + tried_variants: List[str] = [] + while queue: + candidate = queue.popleft() + candidate = str(candidate or "").strip() + if not candidate or candidate in seen: + continue + seen.add(candidate) + tried_variants.append(candidate) + spec = HydrusRequestSpec( + method="GET", + endpoint="/add_urls/get_url_files", + query={"url": candidate}, + ) + try: + response = client._perform_request(spec) + except HydrusRequestError as exc: + raise RuntimeError(str(exc)) + response_hashes_list: List[str] = [] + response_file_ids_list: List[int] = [] + if isinstance(response, dict): + normalised_value = response.get("normalised_url") + if isinstance(normalised_value, str): + trimmed = normalised_value.strip() + if trimmed: + normalised_reported = normalised_reported or trimmed + if trimmed not in seen: + queue.append(trimmed) + for redirect_key in ("redirect_url", "url"): + redirect_value = response.get(redirect_key) + if isinstance(redirect_value, str): + redirect_trimmed = redirect_value.strip() + if redirect_trimmed and redirect_trimmed not in seen: + queue.append(redirect_trimmed) + raw_hashes = response.get("hashes") or response.get("file_hashes") + if isinstance(raw_hashes, list): + for item in raw_hashes: + try: + normalized = _normalize_hash(item) + except ValueError: + continue + if normalized: + response_hashes_list.append(normalized) + raw_ids = response.get("file_ids") or response.get("file_id") + if isinstance(raw_ids, list): + for item in raw_ids: + try: + response_file_ids_list.append(int(item)) + except (TypeError, ValueError): + continue + elif raw_ids is not None: + try: + response_file_ids_list.append(int(raw_ids)) + except (TypeError, ValueError): + pass + statuses = response.get("url_file_statuses") + if isinstance(statuses, list): + for entry in statuses: + if not isinstance(entry, dict): + continue + status_hash = entry.get("hash") or entry.get("file_hash") + if status_hash: + try: + normalized = _normalize_hash(status_hash) + except ValueError: + normalized = None + if normalized: + response_hashes_list.append(normalized) + status_id = entry.get("file_id") or entry.get("fileid") + if status_id is not None: + try: + response_file_ids_list.append(int(status_id)) + except (TypeError, ValueError): + pass + if not hashes and response_hashes_list: + hashes = response_hashes_list + if not file_ids and response_file_ids_list: + file_ids = response_file_ids_list + if hashes or file_ids: + matched_url = candidate + break + if not hashes and not file_ids: + raise RuntimeError( + "No Hydrus matches for URL variants: " + + ", ".join(tried_variants) + ) + followup_payload = { + "api_url": base_url, + "access_key": access_key, + "hash": hashes[0] if hashes else None, + "file_ids": file_ids, + "options": {"timeout": timeout, "minimal": True}, + } + result = fetch_hydrus_metadata(followup_payload) + result["matched_url"] = matched_url or url + result["normalised_url"] = normalised_reported or matched_url or url + result["tried_urls"] = tried_variants + return result + + +def _build_hydrus_context(payload: Dict[str, Any]) -> Tuple["HydrusNetwork", str, str, float, Optional[str]]: + base_url = str(payload.get("api_url") or 
"").strip() + if not base_url: + raise ValueError("Hydrus api_url is required") + access_key = str(payload.get("access_key") or "").strip() + options_raw = payload.get("options") + options = options_raw if isinstance(options_raw, dict) else {} + timeout = float(options.get("timeout") or payload.get("timeout") or 60.0) + prefer_service = payload.get("prefer_service_name") or options.get("prefer_service_name") + if isinstance(prefer_service, str): + prefer_service = prefer_service.strip() or None + else: + prefer_service = None + client = HydrusNetwork(base_url, access_key, timeout) + return client, base_url, access_key, timeout, prefer_service + + +def _refetch_hydrus_summary( + base_url: str, + access_key: str, + hash_hex: str, + timeout: float, + prefer_service: Optional[str] +) -> Dict[str, Any]: + payload: Dict[str, Any] = { + "hash": hash_hex, + "api_url": base_url, + "access_key": access_key, + "options": { + "minimal": True, + "include_relationships": False, + "timeout": timeout, + }, + } + if prefer_service: + payload["options"]["prefer_service_name"] = prefer_service + return fetch_hydrus_metadata(payload) + + +def apply_hydrus_tag_mutation( + payload: Dict[str, Any], + add: Iterable[Any], + remove: Iterable[Any] +) -> Dict[str, Any]: + client, base_url, access_key, timeout, prefer_service = _build_hydrus_context(payload) + hash_hex = _normalize_hash(payload.get("hash")) + add_list = [_normalize_tag(tag) for tag in add if _normalize_tag(tag)] + remove_list = [_normalize_tag(tag) for tag in remove if _normalize_tag(tag)] + if not add_list and not remove_list: + raise ValueError("No tag changes supplied") + service_key = payload.get("service_key") or payload.get("tag_service_key") + summary = None + if not service_key: + summary = _refetch_hydrus_summary(base_url, access_key, hash_hex, timeout, prefer_service) + service_key = summary.get("tag_service_key") + if not isinstance(service_key, str) or not service_key: + raise RuntimeError("Unable to determine Hydrus tag service key") + actions: Dict[str, List[str]] = {} + if add_list: + actions["0"] = [tag for tag in add_list if tag] + if remove_list: + actions["1"] = [tag for tag in remove_list if tag] + if not actions: + raise ValueError("Tag mutation produced no actionable changes") + request_payload = { + "hashes": [hash_hex], + "service_keys_to_actions_to_tags": { + service_key: actions, + }, + } + try: + tag_spec = HydrusRequestSpec( + method="POST", + endpoint="/add_tags/add_tags", + data=request_payload, + ) + client._perform_request(tag_spec) + except HydrusRequestError as exc: + raise RuntimeError(str(exc)) + summary_after = _refetch_hydrus_summary(base_url, access_key, hash_hex, timeout, prefer_service) + result = dict(summary_after) + result["added_tags"] = actions.get("0", []) + result["removed_tags"] = actions.get("1", []) + result["tag_service_key"] = summary_after.get("tag_service_key") + return result diff --git a/SYS/cmdlet_api.py b/API/cmdlet.py similarity index 100% rename from SYS/cmdlet_api.py rename to API/cmdlet.py diff --git a/CLI.py b/CLI.py index 4768bf8..46a9d31 100644 --- a/CLI.py +++ b/CLI.py @@ -845,21 +845,8 @@ class CmdletIntrospection: providers.keys() ) - try: - from Provider.metadata_provider import list_metadata_providers - - meta_providers = list_metadata_providers(config) or {} - meta_available = [n for n, ready in meta_providers.items() if ready] - meta_choices = ( - sorted(meta_available) - if meta_available else sorted(meta_providers.keys()) - ) - except Exception: - meta_choices = [] - - merged = 
sorted(set(provider_choices + meta_choices)) - if merged: - return merged + if provider_choices: + return provider_choices if normalized_arg == "scrape": try: @@ -990,7 +977,15 @@ class CmdletCompleter(Completer): config=config ) if choices: - for choice in choices: + choice_list = choices + normalized_prev = prev_token.lstrip("-").strip().lower() + if normalized_prev == "provider" and current_token: + current_lower = current_token.lower() + filtered = [c for c in choices if current_lower in c.lower()] + if filtered: + choice_list = filtered + + for choice in choice_list: yield Completion(choice, start_position=-len(current_token)) # Example: if the user has typed `download-file -url ...`, then `url` # is considered used and should not be suggested again (even as `--url`). diff --git a/MPV/pipeline_helper.py b/MPV/pipeline_helper.py index 0f24da2..e92f4ab 100644 --- a/MPV/pipeline_helper.py +++ b/MPV/pipeline_helper.py @@ -343,7 +343,7 @@ def _run_op(op: str, data: Any) -> Dict[str, Any]: # Fast gate: only for streaming URLs yt-dlp knows about. try: - from SYS.download import is_url_supported_by_ytdlp # noqa: WPS433 + from tool.ytdlp import is_url_supported_by_ytdlp # noqa: WPS433 if not is_url_supported_by_ytdlp(url): return { diff --git a/MPV/portable_config/mpv.conf b/MPV/portable_config/mpv.conf index e849927..5c6e6e8 100644 --- a/MPV/portable_config/mpv.conf +++ b/MPV/portable_config/mpv.conf @@ -6,20 +6,22 @@ osd-bar=no border=no # Keep the window size stable when loading files (don't resize to match aspect). -keepaspect-window=no - # Ensure uosc texture/icon fonts are discoverable by libass. osd-fonts-dir=~~/scripts/uosc/fonts sub-fonts-dir=~~/scripts/uosc/ -auto-window-resize=no ontop=yes +autofit=100% save-position-on-quit=yes # Avoid showing embedded cover art for audio-only files. audio-display=no +# Stretch the video to fill the window (ignore aspect ratio, may distort) +keepaspect=no +video-unscaled=no +cursor-autohide=1000 # gpu-next can be fragile on some Windows/D3D11 setups; prefer the stable VO. vo=gpu @@ -34,7 +36,7 @@ background=none background-color=0/0 # Without transparency, these options may be useful: -background-color=.2 # don't use pure black +# background-color=.2 # don't use pure black (disabled to keep video background transparent) force-window-position # recenter the window when changing playlist position on X11 and macOS auto-window-resize=no # preserve the window size when changing playlist entry @@ -79,11 +81,3 @@ reset-on-next-file-remove=video-zoom # preserve the zoom when changing file reset-on-next-file-remove=panscan reset-on-next-file-remove=video-unscaled linear-downscaling=no # don't make black and white manga brighter - - -git config --global user.name "Nose" -git config --global user.email "goyimnose@nothing.blah" - -ssh-keygen -t ed25519 -C "goyimnose@nothing.blah" -f $env:USERPROFILE\.ssh\id_ed25519 - -git remote set-url origin goyimnose@nothing.blah:OWNER/REPO.git \ No newline at end of file diff --git a/MPV/portable_config/script-opts/uosc.conf b/MPV/portable_config/script-opts/uosc.conf index 3422f7c..e55ccfb 100644 --- a/MPV/portable_config/script-opts/uosc.conf +++ b/MPV/portable_config/script-opts/uosc.conf @@ -182,7 +182,7 @@ time_precision=0 # Display stream's buffered time in timeline if it's lower than this amount of seconds, 0 to disable buffered_time_threshold=60 # Hide UI when mpv autohides the cursor. Timing is controlled by `cursor-autohide` in `mpv.conf` (in milliseconds). 
-autohide=no +autohide=yes # Can be: flash, static, manual (controlled by flash-pause-indicator and decide-pause-indicator commands) pause_indicator=flash # Sizes to list in stream quality menu diff --git a/Provider/HIFI.py b/Provider/HIFI.py index 67ba6ca..3bd1c95 100644 --- a/Provider/HIFI.py +++ b/Provider/HIFI.py @@ -1,12 +1,15 @@ from __future__ import annotations +import os +import random import re import shutil +import string +import subprocess +import time import sys from pathlib import Path -import subprocess -from typing import Any, Dict, List, Optional, Tuple - +from typing import Any, Dict, Iterable, List, Optional, Tuple from API.hifi import HifiApiClient from ProviderCore.base import Provider, SearchResult from SYS.logger import debug, log @@ -733,6 +736,10 @@ class HIFI(Provider): input_ref: str, output_path: Path, lossless_fallback: bool = True, + progress: Optional[Any] = None, + transfer_label: Optional[str] = None, + duration_seconds: Optional[int] = None, + audio_quality: Optional[str] = None, ) -> Optional[Path]: ffmpeg_path = self._find_ffmpeg() if not ffmpeg_path: @@ -749,20 +756,115 @@ class HIFI(Provider): protocol_whitelist = "file,https,http,tcp,tls,crypto,data" - def _run(cmd: List[str]) -> bool: + label = str(transfer_label or output_path.name or "hifi") + + def _estimate_total_bytes() -> Optional[int]: try: - proc = subprocess.run( - cmd, - capture_output=True, - text=True, - check=False, + dur = int(duration_seconds) if duration_seconds is not None else None + except Exception: + dur = None + if not dur or dur <= 0: + return None + + qual = str(audio_quality or "").strip().lower() + # Rough per-quality bitrate guess (bytes/sec). + if qual in {"hi_res", + "hi_res_lossless", + "hires", + "hi-res", + "master", + "mqa"}: + bps = 4_608_000 # ~24-bit/96k stereo + elif qual in {"lossless", + "flac"}: + bps = 1_411_200 # 16-bit/44.1k stereo + else: + bps = 320_000 # kbps for compressed + + try: + return int((bps / 8.0) * dur) + except Exception: + return None + + est_total_bytes = _estimate_total_bytes() + + def _update_transfer(total_bytes_val: Optional[int]) -> None: + if progress is None: + return + try: + progress.update_transfer( + label=label, + completed=int(total_bytes_val) if total_bytes_val is not None else None, + total=est_total_bytes, + ) + except Exception: + pass + + def _run(cmd: List[str], *, target_path: Optional[Path] = None) -> bool: + cmd_progress = list(cmd) + # Enable ffmpeg progress output for live byte updates. + cmd_progress.insert(1, "-progress") + cmd_progress.insert(2, "pipe:1") + cmd_progress.insert(3, "-nostats") + + try: + proc = subprocess.Popen( + cmd_progress, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, ) - if proc.returncode == 0 and self._has_nonempty_file(output_path): - return True - if proc.stderr: - debug(f"[hifi] ffmpeg failed: {proc.stderr.strip()}") except Exception as exc: debug(f"[hifi] ffmpeg invocation failed: {exc}") + return False + + last_bytes = None + try: + while True: + line = proc.stdout.readline() if proc.stdout else "" + if not line: + if proc.poll() is not None: + break + time.sleep(0.05) + continue + + if "=" not in line: + continue + key, val = line.strip().split("=", 1) + if key == "total_size": + try: + last_bytes = int(val) + _update_transfer(last_bytes) + except Exception: + pass + elif key == "out_time_ms": + # Map out_time_ms to byte estimate when total_size missing. 
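+                        # Scale the bitrate-based size estimate by the fraction of the
+                        # track duration ffmpeg has already written, so the transfer bar
+                        # still advances when total_size is absent. (Note: many ffmpeg
+                        # builds emit out_time_ms in microseconds despite the name, so
+                        # the fraction can saturate early; it is clamped to 1.0 anyway.)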
+ try: + if est_total_bytes and val.isdigit(): + ms = int(val) + dur_ms = (duration_seconds or 0) * 1000 + if dur_ms > 0: + pct = min(1.0, max(0.0, ms / dur_ms)) + approx = int(est_total_bytes * pct) + _update_transfer(approx) + except Exception: + pass + + proc.wait() + finally: + if last_bytes is not None: + _update_transfer(last_bytes) + + check_path = target_path or output_path + if proc.returncode == 0 and self._has_nonempty_file(check_path): + return True + + try: + stderr_text = proc.stderr.read() if proc.stderr else "" + if stderr_text: + debug(f"[hifi] ffmpeg failed: {stderr_text.strip()}") + except Exception: + pass return False # Prefer remux (fast, no transcode). @@ -816,25 +918,14 @@ class HIFI(Provider): "flac", str(tmp_flac_path), ] - try: - proc = subprocess.run( - cmd_flac, - capture_output=True, - text=True, - check=False, - ) - if proc.returncode == 0 and self._has_nonempty_file(tmp_flac_path): - if tmp_flac_path != flac_path: - try: - tmp_flac_path.replace(flac_path) - except Exception: - # If rename fails, still return the temp file. - return tmp_flac_path - return flac_path - if proc.stderr: - debug(f"[hifi] ffmpeg flac fallback failed: {proc.stderr.strip()}") - except Exception as exc: - debug(f"[hifi] ffmpeg flac fallback invocation failed: {exc}") + if _run(cmd_flac, target_path=tmp_flac_path) and self._has_nonempty_file(tmp_flac_path): + if tmp_flac_path != flac_path: + try: + tmp_flac_path.replace(flac_path) + except Exception: + # If rename fails, still return the temp file. + return tmp_flac_path + return flac_path return None def download(self, result: SearchResult, output_dir: Path) -> Optional[Path]: @@ -921,7 +1012,14 @@ class HIFI(Provider): # If resolve_tidal_manifest_path returned a URL, prefer feeding it directly to ffmpeg. if resolved_text.lower().startswith("http"): out_file = output_dir / f"{stem}{suffix}" - materialized = self._ffmpeg_demux_to_audio(input_ref=resolved_text, output_path=out_file) + materialized = self._ffmpeg_demux_to_audio( + input_ref=resolved_text, + output_path=out_file, + progress=self.config.get("_pipeline_progress") if isinstance(self.config, dict) else None, + transfer_label=title_part or getattr(result, "title", None), + duration_seconds=self._coerce_duration_seconds(md), + audio_quality=md.get("audioQuality") if isinstance(md, dict) else None, + ) if materialized is not None: return materialized @@ -947,7 +1045,14 @@ class HIFI(Provider): if source_path.is_file() and source_path.suffix.lower() == ".mpd": # Materialize audio from the local MPD. out_file = output_dir / f"{stem}{suffix}" - materialized = self._ffmpeg_demux_to_audio(input_ref=str(source_path), output_path=out_file) + materialized = self._ffmpeg_demux_to_audio( + input_ref=str(source_path), + output_path=out_file, + progress=self.config.get("_pipeline_progress") if isinstance(self.config, dict) else None, + transfer_label=title_part or getattr(result, "title", None), + duration_seconds=self._coerce_duration_seconds(md), + audio_quality=md.get("audioQuality") if isinstance(md, dict) else None, + ) if materialized is not None: return materialized return None @@ -965,7 +1070,14 @@ class HIFI(Provider): # As a last resort, attempt to treat the local path as an ffmpeg input. 
out_file = output_dir / f"{stem}{suffix}" - materialized = self._ffmpeg_demux_to_audio(input_ref=resolved_text, output_path=out_file) + materialized = self._ffmpeg_demux_to_audio( + input_ref=resolved_text, + output_path=out_file, + progress=self.config.get("_pipeline_progress") if isinstance(self.config, dict) else None, + transfer_label=title_part or getattr(result, "title", None), + duration_seconds=self._coerce_duration_seconds(md), + audio_quality=md.get("audioQuality") if isinstance(md, dict) else None, + ) return materialized def _get_api_client_for_base(self, base_url: str) -> Optional[HifiApiClient]: @@ -1228,6 +1340,38 @@ class HIFI(Provider): minutes, secs = divmod(total, 60) return f"{minutes}:{secs:02d}" + @staticmethod + def _coerce_duration_seconds(value: Any) -> Optional[int]: + candidates = [] + candidates.append(value) + try: + if isinstance(value, dict): + for key in ("duration", + "durationSeconds", + "duration_sec", + "duration_ms", + "durationMillis"): + if key in value: + candidates.append(value.get(key)) + except Exception: + pass + + for cand in candidates: + try: + if cand is None: + continue + if isinstance(cand, str) and cand.strip().endswith("ms"): + cand = cand.strip()[:-2] + v = float(cand) + if v <= 0: + continue + if v > 10_000: # treat as milliseconds + v = v / 1000.0 + return int(round(v)) + except Exception: + continue + return None + @staticmethod def _stringify(value: Any) -> str: text = str(value or "").strip() @@ -1305,23 +1449,18 @@ class HIFI(Provider): if audio_quality: columns.append(("Quality", audio_quality)) - tags = {"tidal"} - if audio_quality: - tags.add(f"quality:{audio_quality.lower()}") - metadata = item.get("mediaMetadata") - if isinstance(metadata, dict): - tag_values = metadata.get("tags") or [] - for tag in tag_values: - if isinstance(tag, str) and tag.strip(): - tags.add(tag.strip().lower()) - # IMPORTANT: do not retain a shared reference to the raw API dict. # Downstream playback (MPV) mutates metadata to cache the decoded Tidal # manifest path/URL. If multiple results share the same dict reference, # they can incorrectly collapse to a single playable target. 
full_md: Dict[str, Any] = dict(item) + url_value = self._stringify(full_md.get("url")) + if url_value: + full_md["url"] = url_value - return SearchResult( + tags = self._build_track_tags(full_md) + + result = SearchResult( table="hifi", title=title, path=path, @@ -1332,6 +1471,12 @@ class HIFI(Provider): columns=columns, full_metadata=full_md, ) + if url_value: + try: + result.url = url_value + except Exception: + pass + return result def _extract_track_selection_context( self, selected_items: List[Any] @@ -1401,6 +1546,9 @@ class HIFI(Provider): def _fetch_track_details(self, track_id: int) -> Optional[Dict[str, Any]]: if track_id <= 0: return None + + info_data = self._fetch_track_info(track_id) + for base in self.api_urls: endpoint = f"{base.rstrip('/')}/track/" try: @@ -1408,12 +1556,32 @@ class HIFI(Provider): payload = client.track(track_id) if client else None data = payload.get("data") if isinstance(payload, dict) else None if isinstance(data, dict): - return data + merged: Dict[str, Any] = {} + if isinstance(info_data, dict): + merged.update(info_data) + merged.update(data) + return merged except Exception as exc: log(f"[hifi] Track lookup failed for {endpoint}: {exc}", file=sys.stderr) continue return None + def _fetch_track_info(self, track_id: int) -> Optional[Dict[str, Any]]: + if track_id <= 0: + return None + for base in self.api_urls: + endpoint = f"{base.rstrip('/')}/info/" + try: + client = self._get_api_client_for_base(base) + payload = client.info(track_id) if client else None + data = payload.get("data") if isinstance(payload, dict) else None + if isinstance(data, dict): + return data + except Exception as exc: + debug(f"[hifi] Info lookup failed for {endpoint}: {exc}") + continue + return None + def _fetch_track_lyrics(self, track_id: int) -> Optional[Dict[str, Any]]: if track_id <= 0: return None @@ -1450,6 +1618,54 @@ class HIFI(Provider): ] return [(name, value) for name, value in values if value] + def _build_track_tags(self, metadata: Dict[str, Any]) -> set[str]: + tags: set[str] = {"tidal"} + + audio_quality = self._stringify(metadata.get("audioQuality")) + if audio_quality: + tags.add(f"quality:{audio_quality.lower()}") + + media_md = metadata.get("mediaMetadata") + if isinstance(media_md, dict): + tag_values = media_md.get("tags") or [] + for tag in tag_values: + if isinstance(tag, str): + candidate = tag.strip() + if candidate: + tags.add(candidate.lower()) + + title_text = self._stringify(metadata.get("title")) + if title_text: + tags.add(f"title:{title_text}") + + artists = self._extract_artists(metadata) + for artist in artists: + artist_clean = self._stringify(artist) + if artist_clean: + tags.add(f"artist:{artist_clean}") + + album_title = "" + album_obj = metadata.get("album") + if isinstance(album_obj, dict): + album_title = self._stringify(album_obj.get("title")) + else: + album_title = self._stringify(metadata.get("album")) + if album_title: + tags.add(f"album:{album_title}") + + track_no_val = metadata.get("trackNumber") or metadata.get("track_number") + if track_no_val is not None: + try: + track_int = int(track_no_val) + if track_int > 0: + tags.add(f"track:{track_int}") + except Exception: + track_text = self._stringify(track_no_val) + if track_text: + tags.add(f"track:{track_text}") + + return tags + def selector( self, selected_items: List[Any], @@ -1476,16 +1692,32 @@ class HIFI(Provider): else None ) + try: + debug( + f"[hifi.selector] table_type={table_type} stage_is_last={stage_is_last} selected_count={len(selected_items) if selected_items 
else 0}" + ) + except Exception: + pass + # Artist selection: selecting @N should open an albums list. if isinstance(table_type, str) and table_type.strip().lower() == "hifi.artist": contexts = self._extract_artist_selection_context(selected_items) + try: + debug(f"[hifi.selector] artist contexts={len(contexts)}") + except Exception: + pass if not contexts: return False artist_id, artist_name = contexts[0] album_results = self._albums_for_artist(artist_id=artist_id, artist_name=artist_name, limit=200) if not album_results: - return False + try: + from SYS.rich_display import stdout_console + stdout_console().print(f"[bold yellow][hifi] No albums found for {artist_name}[/]") + except Exception: + log(f"[hifi] No albums found for {artist_name}") + return True try: from SYS.rich_display import stdout_console @@ -1531,6 +1763,10 @@ class HIFI(Provider): # Album selection: selecting @N should open the track list for that album. if isinstance(table_type, str) and table_type.strip().lower() == "hifi.album": contexts = self._extract_album_selection_context(selected_items) + try: + debug(f"[hifi.selector] album contexts={len(contexts)}") + except Exception: + pass if not contexts: return False @@ -1605,6 +1841,10 @@ class HIFI(Provider): return False contexts = self._extract_track_selection_context(selected_items) + try: + debug(f"[hifi.selector] track contexts={len(contexts)}") + except Exception: + pass if not contexts: return False @@ -1657,6 +1897,9 @@ class HIFI(Provider): insert_pos = 2 if artist_display else 1 columns.insert(insert_pos, ("Album", album_title)) + tags = self._build_track_tags(detail) + url_value = self._stringify(detail.get("url")) + result = SearchResult( table="hifi", title=title, @@ -1666,7 +1909,13 @@ class HIFI(Provider): media_kind="audio", columns=columns, full_metadata=detail, + tag=tags, ) + if url_value: + try: + result.url = url_value + except Exception: + pass table.add_result(result) try: results_payload.append(result.to_dict()) diff --git a/Provider/alldebrid.py b/Provider/alldebrid.py index dd3d28b..c4f76f0 100644 --- a/Provider/alldebrid.py +++ b/Provider/alldebrid.py @@ -8,12 +8,11 @@ from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Callable, Tuple from urllib.parse import urlparse -from API.HTTP import HTTPClient +from API.HTTP import HTTPClient, _download_direct_file from API.alldebrid import AllDebridClient, parse_magnet_or_hash, is_torrent_file from ProviderCore.base import Provider, SearchResult from ProviderCore.download import sanitize_filename -from SYS.download import _download_direct_file -from SYS.logger import log +from SYS.logger import log, debug from SYS.models import DownloadError _HOSTS_CACHE_TTL_SECONDS = 24 * 60 * 60 @@ -302,7 +301,7 @@ def _dispatch_alldebrid_magnet_search( ) except Exception: pass - log(f"[alldebrid] Sent magnet {magnet_id} to AllDebrid for download", file=sys.stderr) + debug(f"[alldebrid] Sent magnet {magnet_id} to AllDebrid for download") def prepare_magnet( diff --git a/Provider/hello_provider.py b/Provider/hello_provider.py new file mode 100644 index 0000000..05ac4fa --- /dev/null +++ b/Provider/hello_provider.py @@ -0,0 +1,192 @@ +"""Example provider template for use as a starter kit. + +This minimal provider demonstrates the typical hooks a provider may implement: +- `validate()` to assert it's usable +- `search()` to return `SearchResult` items +- `download()` to persist a sample file (useful for local tests) + +See `docs/provider_guide.md` for authoring guidance. 
+""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any, Dict, List, Optional + +from ProviderCore.base import Provider, SearchResult + + +class HelloProvider(Provider): + """Very small example provider suitable as a template. + + - Table name: `hello` + - Usage: `search-file -provider hello "query"` + - Selecting a row and piping into `download-file` will call `download()`. + """ + + URL = ("hello:",) + URL_DOMAINS = () + + def validate(self) -> bool: + # No configuration required; always available for testing/demo purposes. + return True + + def search( + self, + query: str, + limit: int = 50, + filters: Optional[Dict[str, Any]] = None, + **kwargs: Any, + ) -> List[SearchResult]: + q = (query or "").strip() + results: List[SearchResult] = [] + if not q or q in {"*", "all", "list"}: + q = "example" + + # Emit up to `limit` tiny example results. + n = min(max(1, int(limit)), 3) + for i in range(1, n + 1): + title = f"{q} sample {i}" + path = f"https://example.org/{q}/{i}" + sr = SearchResult( + table="hello", + title=title, + path=path, + detail="Example provider result", + media_kind="file", + columns=[("Example", "yes")], + full_metadata={"example_index": i}, + ) + results.append(sr) + + return results[: max(0, int(limit))] + + def download(self, result: SearchResult, output_dir: Path) -> Optional[Path]: + """Create a small text file to simulate a download. + + This keeps the example self-contained (no network access required) and + makes it straightforward to test provider behavior with `pytest`. + """ + try: + Path(output_dir).mkdir(parents=True, exist_ok=True) + except Exception: + pass + + title = str(getattr(result, "title", "hello") or "hello").strip() + safe = "".join(c if c.isalnum() or c in ("-", "_", ".") else "_" for c in title) + fname = f"{safe}.txt" if safe else "hello.txt" + dest = Path(output_dir) / fname + try: + dest.write_text(f"Hello from HelloProvider\nsource: {result.path}\n", encoding="utf-8") + return dest + except Exception: + return None + + def selector( + self, + selected_items: List[Any], + *, + ctx: Any, + stage_is_last: bool = True, + **_kwargs: Any, + ) -> bool: + """Present a simple details table when a HelloProvider row is selected. + + This demonstrates how providers can implement custom `@N` selection + behavior by constructing a `ResultTable`, populating it with + provider-specific rows, and instructing the CLI to show the table. 
+ """ + if not stage_is_last: + return False + + def _as_payload(item: Any) -> Dict[str, Any]: + if isinstance(item, dict): + return dict(item) + try: + if hasattr(item, "to_dict"): + maybe = item.to_dict() + if isinstance(maybe, dict): + return maybe + except Exception: + pass + payload: Dict[str, Any] = {} + try: + payload = { + "title": getattr(item, "title", None), + "path": getattr(item, "path", None), + "table": getattr(item, "table", None), + "annotations": getattr(item, "annotations", None), + "media_kind": getattr(item, "media_kind", None), + "full_metadata": getattr(item, "full_metadata", None), + } + except Exception: + payload = {} + return payload + + chosen: List[Dict[str, Any]] = [] + for item in selected_items or []: + payload = _as_payload(item) + meta = payload.get("full_metadata") or {} + if not isinstance(meta, dict): + meta = {} + idx = meta.get("example_index") + if idx is None: + continue + title = str(payload.get("title") or payload.get("path") or "").strip() or f"hello-{idx}" + chosen.append({"index": idx, "title": title, "path": payload.get("path")}) + + if not chosen: + return False + + target = chosen[0] + idx = target.get("index") + title = target.get("title") or f"hello-{idx}" + + try: + from SYS.result_table import ResultTable + from SYS.rich_display import stdout_console + except Exception: + # If ResultTable isn't available, consider selection handled + return True + + table = ResultTable(f"Hello Details: {title}").set_preserve_order(True) + table.set_table("hello") + try: + table.set_table_metadata({"provider": "hello", "view": "details", "example_index": idx}) + except Exception: + pass + + table.set_source_command("download-file", []) + + results_payload: List[Dict[str, Any]] = [] + for part in ("a", "b"): + file_title = f"{title} - part {part}" + file_path = f"{target.get('path')}/{part}" + sr = SearchResult( + table="hello", + title=file_title, + path=file_path, + detail=f"Part {part}", + media_kind="file", + columns=[("Part", part)], + full_metadata={"part": part, "example_index": idx}, + ) + table.add_result(sr) + try: + results_payload.append(sr.to_dict()) + except Exception: + results_payload.append({"table": sr.table, "title": sr.title, "path": sr.path}) + + try: + ctx.set_last_result_table(table, results_payload) + ctx.set_current_stage_table(table) + except Exception: + pass + + try: + stdout_console().print() + stdout_console().print(table) + except Exception: + pass + + return True diff --git a/Provider/libgen.py b/Provider/libgen.py index 5b96475..6db8a01 100644 --- a/Provider/libgen.py +++ b/Provider/libgen.py @@ -1224,6 +1224,9 @@ class LibgenSearch: if results: _call(log_info, f"[libgen] Using mirror: {mirror}") return results + else: + _call(log_info, f"[libgen] Mirror returned 0 results; stopping mirror fallback") + break except requests.exceptions.Timeout: _call(log_info, f"[libgen] Mirror timed out: {mirror}") continue diff --git a/Provider/podcastindex.py b/Provider/podcastindex.py index 665d2d0..6e71c50 100644 --- a/Provider/podcastindex.py +++ b/Provider/podcastindex.py @@ -304,7 +304,7 @@ class PodcastIndex(Provider): pass try: - from SYS.download import _download_direct_file + from API.HTTP import _download_direct_file except Exception: return True diff --git a/Provider/torrent.py b/Provider/torrent.py new file mode 100644 index 0000000..85eac73 --- /dev/null +++ b/Provider/torrent.py @@ -0,0 +1,442 @@ +from __future__ import annotations + +import logging +import re +from dataclasses import dataclass +from typing import Any, Dict, 
List, Optional + +import requests +from ProviderCore.base import Provider, SearchResult +from SYS.logger import debug, log +try: # Preferred HTML parser + from lxml import html as lxml_html +except Exception: # pragma: no cover - optional + lxml_html = None # type: ignore + +logger = logging.getLogger(__name__) + + +@dataclass +class TorrentInfo: + name: str + url: str + seeders: int + leechers: int + size: str + source: str + category: Optional[str] = None + uploader: Optional[str] = None + magnet: Optional[str] = None + + +@dataclass +class SearchParams: + name: str + category: Optional[str] = None + order_column: Optional[str] = None + order_ascending: bool = False + + +_MAGNET_RE = re.compile(r"^magnet", re.IGNORECASE) + + +class Scraper: + def __init__(self, name: str, base_url: str, timeout: float = 10.0) -> None: + self.name = name + self.base = base_url.rstrip("/") + self.timeout = timeout + self.headers = { + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0 Safari/537.36" + ) + } + self.params: Optional[SearchParams] = None + + def find(self, params: SearchParams, pages: int = 1) -> List[TorrentInfo]: + self.params = params + results: List[TorrentInfo] = [] + for page in range(1, max(1, pages) + 1): + try: + results.extend(self._get_page(page)) + except Exception as exc: + debug(f"[{self.name}] page fetch failed: {exc}") + return results + + def _get_page(self, page: int) -> List[TorrentInfo]: + url, payload = self._request_data(page) + try: + resp = requests.get( + url, + params=payload, + headers=self.headers, + timeout=self.timeout, + ) + resp.raise_for_status() + return self._parse_search(resp) + except Exception as exc: + debug(f"[{self.name}] request failed: {exc}") + return [] + + def _request_data(self, page: int) -> tuple[str, Dict[str, Any]]: + return self.base, {} + + def _parse_search(self, response: requests.Response) -> List[TorrentInfo]: # pragma: no cover - interface + raise NotImplementedError + + def _parse_detail(self, url: str) -> Optional[str]: # optional override + try: + resp = requests.get(url, headers=self.headers, timeout=self.timeout) + resp.raise_for_status() + return self._parse_detail_response(resp) + except Exception: + return None + + def _parse_detail_response(self, response: requests.Response) -> Optional[str]: # pragma: no cover - interface + return None + + @staticmethod + def _int_from_text(value: Any) -> int: + try: + return int(str(value).strip().replace(",", "")) + except Exception: + return 0 + + +class NyaaScraper(Scraper): + def __init__(self) -> None: + super().__init__("nyaa.si", "https://nyaa.si") + + def _request_data(self, page: int) -> tuple[str, Dict[str, Any]]: + params = self.params or SearchParams(name="") + payload = { + "p": page, + "q": params.name, + "c": params.category or "0_0", + "f": "0", + } + if params.order_column: + payload["s"] = params.order_column + payload["o"] = "asc" if params.order_ascending else "desc" + return f"{self.base}/", payload + + def _parse_search(self, response: requests.Response) -> List[TorrentInfo]: + if lxml_html is None: + return [] + doc = lxml_html.fromstring(response.text) + rows = doc.xpath("//table//tbody/tr") + results: List[TorrentInfo] = [] + for row in rows: + cells = row.xpath("./td") + if len(cells) < 7: + continue + category_cell, name_cell, links_cell, size_cell, _, seed_cell, leech_cell, *_ = cells + + name_links = name_cell.xpath("./a") + name_tag = name_links[1] if len(name_links) > 1 else (name_links[0] if 
name_links else None) + if name_tag is None: + continue + + name = name_tag.get("title") or (name_tag.text_content() or "").strip() + url = name_tag.get("href") or "" + + magnet_link = None + magnet_candidates = links_cell.xpath('.//a[starts-with(@href,"magnet:")]/@href') + if magnet_candidates: + magnet_link = magnet_candidates[0] + + category_title = None + cat_titles = category_cell.xpath(".//a/@title") + if cat_titles: + category_title = cat_titles[0] + + results.append( + TorrentInfo( + name=name, + url=f"{self.base}{url}", + seeders=self._int_from_text(seed_cell.text_content()), + leechers=self._int_from_text(leech_cell.text_content()), + size=(size_cell.text_content() or "").strip(), + source=self.name, + category=category_title, + magnet=magnet_link, + ) + ) + return results + + +class X1337Scraper(Scraper): + def __init__(self) -> None: + super().__init__("1337x.to", "https://1337x.to") + + def _request_data(self, page: int) -> tuple[str, Dict[str, Any]]: + params = self.params or SearchParams(name="") + order = None + if params.order_column: + direction = "asc" if params.order_ascending else "desc" + order = f"{params.order_column}/{direction}" + + category = params.category + name = requests.utils.quote(params.name) + + if order and category: + path = f"/sort-category-search/{name}/{category}/{order}" + elif category: + path = f"/category-search/{name}/{category}" + elif order: + path = f"/sort-search/{name}/{order}" + else: + path = f"/search/{name}" + + url = f"{self.base}{path}/{page}/" + return url, {} + + def _parse_search(self, response: requests.Response) -> List[TorrentInfo]: + if lxml_html is None: + return [] + doc = lxml_html.fromstring(response.text) + rows = doc.xpath("//table//tbody/tr") + results: List[TorrentInfo] = [] + for row in rows: + cells = row.xpath("./td") + if len(cells) < 6: + continue + name_cell, seeds_cell, leech_cell, _, size_cell, uploader_cell = cells + + links = name_cell.xpath(".//a") + if len(links) < 2: + continue + + torrent_path = links[1].get("href") + torrent_url = f"{self.base}{torrent_path}" if torrent_path else "" + + info = TorrentInfo( + name=(links[1].text_content() or "").strip(), + url=torrent_url, + seeders=self._int_from_text(seeds_cell.text_content()), + leechers=self._int_from_text(leech_cell.text_content()), + size=(size_cell.text_content() or "").strip().replace(",", ""), + source=self.name, + uploader=(uploader_cell.text_content() or "").strip() if uploader_cell is not None else None, + ) + + if not info.magnet: + info.magnet = self._parse_detail(info.url) + results.append(info) + return results + + def _parse_detail_response(self, response: requests.Response) -> Optional[str]: + if lxml_html is None: + return None + doc = lxml_html.fromstring(response.text) + links = doc.xpath("//main//a[starts-with(@href,'magnet:')]/@href") + return links[0] if links else None + + +class YTSScraper(Scraper): + TRACKERS = "&tr=".join( + [ + "udp://open.demonii.com:1337/announce", + "udp://tracker.opentrackr.org:1337/announce", + "udp://tracker.leechers-paradise.org:6969", + ] + ) + + def __init__(self) -> None: + super().__init__("yts.mx", "https://yts.mx/api/v2") + self.headers = {} + + def _request_data(self, page: int) -> tuple[str, Dict[str, Any]]: + params = self.params or SearchParams(name="") + payload = { + "limit": 50, + "page": page, + "query_term": params.name, + "sort_by": "seeds", + "order_by": "desc" if not params.order_ascending else "asc", + } + return f"{self.base}/list_movies.json", payload + + def _parse_search(self, 
response: requests.Response) -> List[TorrentInfo]: + results: List[TorrentInfo] = [] + data = response.json() + if data.get("status") != "ok": + return results + movies = (data.get("data") or {}).get("movies") or [] + for movie in movies: + torrents = movie.get("torrents") or [] + if not torrents: + continue + tor = max(torrents, key=lambda t: t.get("seeds", 0)) + name = movie.get("title") or "unknown" + info = TorrentInfo( + name=name, + url=str(movie.get("id") or ""), + seeders=int(tor.get("seeds", 0) or 0), + leechers=int(tor.get("peers", 0) or 0), + size=str(tor.get("size") or ""), + source=self.name, + category=(movie.get("genres") or [None])[0], + magnet=self._build_magnet(tor, name), + ) + results.append(info) + return results + + def _build_magnet(self, torrent: Dict[str, Any], name: str) -> str: + return ( + f"magnet:?xt=urn:btih:{torrent.get('hash')}" + f"&dn={requests.utils.quote(name)}&tr={self.TRACKERS}" + ) + + +class ApiBayScraper(Scraper): + """Scraper for apibay.org (The Pirate Bay API clone).""" + + def __init__(self) -> None: + super().__init__("apibay.org", "https://apibay.org") + + def _request_data(self, page: int) -> tuple[str, Dict[str, Any]]: + _ = page # single-page API + params = self.params or SearchParams(name="") + return f"{self.base}/q.php", {"q": params.name} + + def _parse_search(self, response: requests.Response) -> List[TorrentInfo]: + results: List[TorrentInfo] = [] + try: + data = response.json() + except Exception: + return results + if not isinstance(data, list): + return results + + for item in data: + if not isinstance(item, dict): + continue + name = str(item.get("name") or "").strip() + info_hash = str(item.get("info_hash") or "").strip() + if not name or not info_hash: + continue + + magnet = self._build_magnet(info_hash, name) + seeders = self._int_from_text(item.get("seeders")) + leechers = self._int_from_text(item.get("leechers")) + size_raw = str(item.get("size") or "").strip() + size_fmt = self._format_size(size_raw) + + results.append( + TorrentInfo( + name=name, + url=f"{self.base}/description.php?id={item.get('id')}", + seeders=seeders, + leechers=leechers, + size=size_fmt, + source=self.name, + category=str(item.get("category") or ""), + uploader=str(item.get("username") or ""), + magnet=magnet, + ) + ) + return results + + @staticmethod + def _build_magnet(info_hash: str, name: str) -> str: + return f"magnet:?xt=urn:btih:{info_hash}&dn={requests.utils.quote(name)}" + + @staticmethod + def _format_size(size_raw: str) -> str: + try: + size_int = int(size_raw) + if size_int <= 0: + return size_raw + gb = size_int / (1024 ** 3) + if gb >= 1: + return f"{gb:.1f} GB" + mb = size_int / (1024 ** 2) + return f"{mb:.1f} MB" + except Exception: + return size_raw + + +class Torrent(Provider): + TABLE_AUTO_STAGES = {"torrent": ["download-file"]} + + def __init__(self, config: Optional[Dict[str, Any]] = None) -> None: + super().__init__(config) + self.scrapers: List[Scraper] = [] + # JSON APIs (no lxml dependency) + self.scrapers.append(ApiBayScraper()) + self.scrapers.append(YTSScraper()) + # HTML scrapers require lxml + if lxml_html is not None: + self.scrapers.append(NyaaScraper()) + self.scrapers.append(X1337Scraper()) + else: + log("[torrent] lxml not installed; skipping Nyaa/1337x scrapers", file=None) + + def validate(self) -> bool: + return bool(self.scrapers) + + def search( + self, + query: str, + limit: int = 50, + filters: Optional[Dict[str, Any]] = None, + **_kwargs: Any, + ) -> List[SearchResult]: + q = str(query or "").strip() + if 
not q: + return [] + + params = SearchParams(name=q, order_column="seeders", order_ascending=False) + results: List[TorrentInfo] = [] + + for scraper in self.scrapers: + try: + scraped = scraper.find(params, pages=1) + results.extend(scraped) + except Exception as exc: + debug(f"[torrent] scraper {scraper.name} failed: {exc}") + continue + + results = sorted(results, key=lambda r: r.seeders, reverse=True) + if limit and limit > 0: + results = results[:limit] + + out: List[SearchResult] = [] + for item in results: + path = item.magnet or item.url + columns = [ + ("TITLE", item.name), + ("Seeds", str(item.seeders)), + ("Leechers", str(item.leechers)), + ("Size", item.size or ""), + ("Source", item.source), + ] + if item.uploader: + columns.append(("Uploader", item.uploader)) + + md = { + "magnet": item.magnet, + "url": item.url, + "source": item.source, + "seeders": item.seeders, + "leechers": item.leechers, + "size": item.size, + } + if item.uploader: + md["uploader"] = item.uploader + + out.append( + SearchResult( + table="torrent", + title=item.name, + path=path, + detail=f"Seeds:{item.seeders} | Size:{item.size}", + annotations=[item.source], + media_kind="other", + columns=columns, + full_metadata=md, + tag={"torrent"}, + ) + ) + return out diff --git a/Provider/vimm.py b/Provider/vimm.py new file mode 100644 index 0000000..d408682 --- /dev/null +++ b/Provider/vimm.py @@ -0,0 +1,185 @@ +"""Vimm provider skeleton (lxml + HTTPClient). + +This is a lightweight, resilient provider implementation intended as a +starting point for implementing a full Vimm (vimm.net) provider. + +It prefers server-rendered HTML parsing via lxml and uses the repo's +`HTTPClient` helper for robust HTTP calls (timeouts/retries). + +Selectors in `search()` are intentionally permissive heuristics; update the +XPaths to match the real site HTML when you have an actual fixture. +""" + +from __future__ import annotations + +import re +import sys +from typing import Any, Dict, List, Optional +from urllib.parse import urljoin, quote_plus +from lxml import html as lxml_html + +from API.HTTP import HTTPClient +from ProviderCore.base import Provider, SearchResult +from SYS.logger import log, debug + + +class Vimm(Provider): + """Provider for vimm.net vault listings (skeleton). + + - Uses lxml for parsing + - No authentication required + """ + + URL = ("https://vimm.net/vault/",) + URL_DOMAINS = ("vimm.net",) + + def validate(self) -> bool: + # This provider has no required config; consider more checks if needed. 
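+        # lxml is imported unconditionally at module scope, so reaching this
+        # point already implies the parser dependency is importable.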
+ return True + + def _parse_size_bytes(self, size_str: str) -> Optional[int]: + if not size_str: + return None + try: + s = str(size_str or "").strip().replace(",", "") + m = re.search(r"(?P[\d\.]+)\s*(?P[KMGT]?B)?", s, flags=re.I) + if not m: + return None + val = float(m.group("val")) + unit = (m.group("unit") or "B").upper() + mul = { + "B": 1, + "KB": 1024, + "MB": 1024 ** 2, + "GB": 1024 ** 3, + "TB": 1024 ** 4, + }.get(unit, 1) + return int(val * mul) + except Exception: + return None + + def search( + self, + query: str, + limit: int = 50, + filters: Optional[Dict[str, Any]] = None, + **kwargs: Any, + ) -> List[SearchResult]: + q = (query or "").strip() + if not q: + return [] + + # Build search/list URL + base = "https://vimm.net/vault/" + url = f"{base}?p=list&q={quote_plus(q)}" + + try: + with HTTPClient(timeout=20.0) as client: + resp = client.get(url) + content = resp.content + except Exception as exc: + log(f"[vimm] HTTP fetch failed: {exc}", file=sys.stderr) + return [] + + try: + doc = lxml_html.fromstring(content) + except Exception as exc: + log(f"[vimm] HTML parse failed: {exc}", file=sys.stderr) + return [] + + results: List[SearchResult] = [] + + # Candidate XPaths for list items (tweak to match real DOM) + container_xpaths = [ + '//div[contains(@class,"list-item")]', + '//div[contains(@class,"result")]', + '//li[contains(@class,"item")]', + '//tr[contains(@class,"result")]', + '//article', + ] + + nodes = [] + for xp in container_xpaths: + try: + found = doc.xpath(xp) + if found: + nodes = found + debug(f"[vimm] using xpath {xp} -> {len(found)} nodes") + break + except Exception: + continue + + # Fallback: try generic anchors under a list area + if not nodes: + try: + nodes = doc.xpath('//div[contains(@id,"list")]/div') or doc.xpath('//div[contains(@class,"results")]/div') + except Exception: + nodes = [] + + for n in (nodes or [])[: max(1, int(limit))]: + try: + # Prefer explicit title anchors + title = None + href = None + try: + # a few heuristic searches for a meaningful anchor + a = (n.xpath('.//a[contains(@class,"title")]') or + n.xpath('.//h2/a') or + n.xpath('.//a[contains(@href,"/vault/")]') or + n.xpath('.//a')) + if a: + a0 = a[0] + title = a0.text_content().strip() + href = a0.get('href') + except Exception: + title = None + href = None + + if not title: + title = (n.text_content() or "").strip() + + path = urljoin(base, href) if href else "" + + # Extract size & platform heuristics + size_text = "" + try: + s = n.xpath('.//*[contains(@class,"size")]/text()') or n.xpath('.//span[contains(text(),"MB") or contains(text(),"GB")]/text()') + if s: + size_text = str(s[0]).strip() + except Exception: + size_text = "" + + size_bytes = self._parse_size_bytes(size_text) + + platform = "" + try: + p = n.xpath('.//*[contains(@class,"platform")]/text()') + if p: + platform = str(p[0]).strip() + except Exception: + platform = "" + + columns = [] + if platform: + columns.append(("Platform", platform)) + if size_text: + columns.append(("Size", size_text)) + + results.append( + SearchResult( + table="vimm", + title=str(title or "").strip(), + path=str(path or ""), + detail="", + annotations=[], + media_kind="file", + size_bytes=size_bytes, + tag={"vimm"}, + columns=columns, + full_metadata={"raw": lxml_html.tostring(n, encoding="unicode")}, + ) + ) + except Exception: + continue + + return results[: max(0, int(limit))] diff --git a/ProviderCore/base.py b/ProviderCore/base.py index 2f6a8c4..874c40f 100644 --- a/ProviderCore/base.py +++ b/ProviderCore/base.py @@ -26,8 +26,7 
@@ class SearchResult: def to_dict(self) -> Dict[str, Any]: """Convert to dictionary for pipeline processing.""" - - return { + out = { "table": self.table, "title": self.title, "path": self.path, @@ -40,6 +39,15 @@ class SearchResult: "full_metadata": self.full_metadata, } + try: + url_value = getattr(self, "url", None) + if url_value is not None: + out["url"] = url_value + except Exception: + pass + + return out + class Provider(ABC): """Unified provider base class. diff --git a/ProviderCore/registry.py b/ProviderCore/registry.py index ffbb9ca..1a9093a 100644 --- a/ProviderCore/registry.py +++ b/ProviderCore/registry.py @@ -1,75 +1,238 @@ """Provider registry. -Concrete provider implementations live in the `Provider/` package. -This module is the single source of truth for provider discovery. +Concrete provider implementations live in the ``Provider`` package. This module +is the single source of truth for discovery, metadata, and lifecycle helpers +for those plugins. """ from __future__ import annotations -from typing import Any, Dict, Optional, Sequence, Type +import importlib +import pkgutil import sys +from dataclasses import dataclass, field +from types import ModuleType +from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Type from urllib.parse import urlparse from SYS.logger import log -from ProviderCore.base import Provider, SearchProvider, FileProvider, SearchResult -from Provider.alldebrid import AllDebrid -from Provider.bandcamp import Bandcamp -from Provider.libgen import Libgen -from Provider.matrix import Matrix -from Provider.openlibrary import OpenLibrary -from Provider.soulseek import Soulseek, download_soulseek_file -from Provider.telegram import Telegram -from Provider.youtube import YouTube -from Provider.fileio import FileIO -from Provider.zeroxzero import ZeroXZero -from Provider.loc import LOC -from Provider.internetarchive import InternetArchive -from Provider.podcastindex import PodcastIndex -from Provider.HIFI import HIFI +from ProviderCore.base import FileProvider, Provider, SearchProvider, SearchResult +from Provider.soulseek import download_soulseek_file -_PROVIDERS: Dict[str, - Type[Provider]] = { - # Search-capable providers - "alldebrid": AllDebrid, - "libgen": Libgen, - "openlibrary": OpenLibrary, - "internetarchive": InternetArchive, - "hifi": HIFI, - "soulseek": Soulseek, - "bandcamp": Bandcamp, - "youtube": YouTube, - "telegram": Telegram, - "loc": LOC, - "podcastindex": PodcastIndex, - # Upload-capable providers - "0x0": ZeroXZero, - "file.io": FileIO, - "matrix": Matrix, - } + +@dataclass(frozen=True) +class ProviderInfo: + """Metadata about a single provider entry.""" + + canonical_name: str + provider_class: Type[Provider] + module: str + alias_names: Tuple[str, ...] 
= field(default_factory=tuple) + + @property + def supports_search(self) -> bool: + return self.provider_class.search is not Provider.search + + @property + def supports_upload(self) -> bool: + return self.provider_class.upload is not Provider.upload + + +class ProviderRegistry: + """Handles discovery, registration, and lookup of provider classes.""" + + def __init__(self, package_name: str) -> None: + self.package_name = (package_name or "").strip() + self._infos: Dict[str, ProviderInfo] = {} + self._lookup: Dict[str, ProviderInfo] = {} + self._modules: set[str] = set() + self._discovered = False + + def _normalize(self, value: Any) -> str: + return str(value or "").strip().lower() + + def _candidate_names(self, + provider_class: Type[Provider], + override_name: Optional[str]) -> List[str]: + names: List[str] = [] + seen: set[str] = set() + + def _add(value: Any) -> None: + text = str(value or "").strip() + normalized = text.lower() + if not text or normalized in seen: + return + seen.add(normalized) + names.append(text) + + if override_name: + _add(override_name) + else: + _add(getattr(provider_class, "PROVIDER_NAME", None)) + _add(getattr(provider_class, "NAME", None)) + + _add(getattr(provider_class, "__name__", None)) + + for alias in getattr(provider_class, "PROVIDER_ALIASES", ()) or (): + _add(alias) + + return names + + def register( + self, + provider_class: Type[Provider], + *, + override_name: Optional[str] = None, + extra_aliases: Optional[Sequence[str]] = None, + module_name: Optional[str] = None, + replace: bool = False, + ) -> ProviderInfo: + """Register a provider class with canonical and alias names.""" + + candidates = self._candidate_names(provider_class, override_name) + if not candidates: + raise ValueError("provider name candidates are required") + + canonical = self._normalize(candidates[0]) + if not canonical: + raise ValueError("provider name must not be empty") + + alias_names: List[str] = [] + alias_seen: set[str] = set() + + for candidate in candidates[1:]: + normalized = self._normalize(candidate) + if not normalized or normalized == canonical or normalized in alias_seen: + continue + alias_seen.add(normalized) + alias_names.append(normalized) + + for alias in extra_aliases or (): + normalized = self._normalize(alias) + if not normalized or normalized == canonical or normalized in alias_seen: + continue + alias_seen.add(normalized) + alias_names.append(normalized) + + info = ProviderInfo( + canonical_name=canonical, + provider_class=provider_class, + module=module_name or getattr(provider_class, "__module__", "") or "", + alias_names=tuple(alias_names), + ) + + existing = self._infos.get(canonical) + if existing is not None and not replace: + return existing + + self._infos[canonical] = info + for lookup in (canonical,) + tuple(alias_names): + self._lookup[lookup] = info + return info + + def _register_module(self, module: ModuleType) -> None: + module_name = getattr(module, "__name__", "") + if not module_name or module_name in self._modules: + return + self._modules.add(module_name) + + for attr in dir(module): + candidate = getattr(module, attr) + if not isinstance(candidate, type): + continue + if not issubclass(candidate, Provider): + continue + if candidate in {Provider, SearchProvider, FileProvider}: + continue + if getattr(candidate, "__module__", "") != module_name: + continue + try: + self.register(candidate, module_name=module_name) + except Exception as exc: + log(f"[provider] Failed to register {module_name}.{candidate.__name__}: {exc}", 
file=sys.stderr) + + def discover(self) -> None: + """Import and register providers from the package.""" + + if self._discovered or not self.package_name: + return + self._discovered = True + + try: + package = importlib.import_module(self.package_name) + except Exception as exc: + log(f"[provider] Failed to import package {self.package_name}: {exc}", file=sys.stderr) + return + + self._register_module(package) + package_path = getattr(package, "__path__", None) + if not package_path: + return + + for finder, module_name, _ in pkgutil.iter_modules(package_path): + if module_name.startswith("_"): + continue + module_path = f"{self.package_name}.{module_name}" + try: + module = importlib.import_module(module_path) + except Exception as exc: + log(f"[provider] Failed to load {module_path}: {exc}", file=sys.stderr) + continue + self._register_module(module) + + def get(self, name: str) -> Optional[ProviderInfo]: + self.discover() + if not name: + return None + return self._lookup.get(self._normalize(name)) + + def iter_providers(self) -> Iterable[ProviderInfo]: + self.discover() + return tuple(self._infos.values()) + + def has_name(self, name: str) -> bool: + return self.get(name) is not None + + +REGISTRY = ProviderRegistry("Provider") +REGISTRY.discover() + + +def register_provider( + provider_class: Type[Provider], + *, + name: Optional[str] = None, + aliases: Optional[Sequence[str]] = None, + module_name: Optional[str] = None, + replace: bool = False, +) -> ProviderInfo: + """Register a provider class from tests or third-party packages.""" + + return REGISTRY.register( + provider_class, + override_name=name, + extra_aliases=aliases, + module_name=module_name, + replace=replace, + ) def get_provider_class(name: str) -> Optional[Type[Provider]]: - """Return the provider class for a registered provider name, if any.""" - key = str(name or "").strip().lower() - return _PROVIDERS.get(key) + info = REGISTRY.get(name) + if info is None: + return None + return info.provider_class def selection_auto_stage_for_table( table_type: str, stage_args: Optional[Sequence[str]] = None, ) -> Optional[list[str]]: - """Return the provider-suggested stage to auto-run for a selected table. - - This is used by the CLI to avoid hardcoding table names and behaviors. - """ t = str(table_type or "").strip().lower() if not t: return None - # Provider tables are usually either: - # - "youtube" (no dot) - # - "hifi.tracks" (prefix = provider name) provider_key = t.split(".", 1)[0] if "." in t else t provider_class = get_provider_class(provider_key) or get_provider_class(t) if provider_class is None: @@ -82,14 +245,7 @@ def selection_auto_stage_for_table( def is_known_provider_name(name: str) -> bool: - """Return True if `name` matches a registered provider key. - - This is intentionally cheap (no imports/instantiation) so callers can - probe UI strings (table names, store names, etc.) without triggering - noisy 'Unknown provider' logs. 
- """ - - return (name or "").strip().lower() in _PROVIDERS + return REGISTRY.has_name(name) def _supports_search(provider: Provider) -> bool: @@ -107,18 +263,14 @@ def _provider_url_patterns(provider_class: Type[Provider]) -> Sequence[str]: return [] -def get_provider(name: str, - config: Optional[Dict[str, - Any]] = None) -> Optional[Provider]: - """Get a provider by name (unified registry).""" - - provider_class = _PROVIDERS.get((name or "").lower()) - if provider_class is None: +def get_provider(name: str, config: Optional[Dict[str, Any]] = None) -> Optional[Provider]: + info = REGISTRY.get(name) + if info is None: log(f"[provider] Unknown provider: {name}", file=sys.stderr) return None try: - provider = provider_class(config) + provider = info.provider_class(config) if not provider.validate(): log(f"[provider] Provider '{name}' is not available", file=sys.stderr) return None @@ -129,24 +281,18 @@ def get_provider(name: str, def list_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str, bool]: - """List all providers and their availability.""" - - availability: Dict[str, - bool] = {} - for name, provider_class in _PROVIDERS.items(): + availability: Dict[str, bool] = {} + for info in REGISTRY.iter_providers(): try: - provider = provider_class(config) - availability[name] = provider.validate() + provider = info.provider_class(config) + availability[info.canonical_name] = provider.validate() except Exception: - availability[name] = False + availability[info.canonical_name] = False return availability def get_search_provider(name: str, - config: Optional[Dict[str, - Any]] = None) -> Optional[SearchProvider]: - """Get a search-capable provider by name (compat API).""" - + config: Optional[Dict[str, Any]] = None) -> Optional[SearchProvider]: provider = get_provider(name, config) if provider is None: return None @@ -157,26 +303,20 @@ def get_search_provider(name: str, def list_search_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str, bool]: - """List all search providers and their availability.""" - - availability: Dict[str, - bool] = {} - for name, provider_class in _PROVIDERS.items(): + availability: Dict[str, bool] = {} + for info in REGISTRY.iter_providers(): try: - provider = provider_class(config) - availability[name] = bool( - provider.validate() and _supports_search(provider) + provider = info.provider_class(config) + availability[info.canonical_name] = bool( + provider.validate() and info.supports_search ) except Exception: - availability[name] = False + availability[info.canonical_name] = False return availability def get_file_provider(name: str, - config: Optional[Dict[str, - Any]] = None) -> Optional[FileProvider]: - """Get an upload-capable provider by name (compat API).""" - + config: Optional[Dict[str, Any]] = None) -> Optional[FileProvider]: provider = get_provider(name, config) if provider is None: return None @@ -187,28 +327,19 @@ def get_file_provider(name: str, def list_file_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str, bool]: - """List all file providers and their availability.""" - - availability: Dict[str, - bool] = {} - for name, provider_class in _PROVIDERS.items(): + availability: Dict[str, bool] = {} + for info in REGISTRY.iter_providers(): try: - provider = provider_class(config) - availability[name] = bool( - provider.validate() and _supports_upload(provider) + provider = info.provider_class(config) + availability[info.canonical_name] = bool( + provider.validate() and info.supports_upload ) except Exception: - 
availability[name] = False + availability[info.canonical_name] = False return availability def match_provider_name_for_url(url: str) -> Optional[str]: - """Return a registered provider name that claims the URL's domain. - - Providers can declare domains via class attribute `URL` (preferred) or `URL_DOMAINS`. - This matcher is intentionally cheap (no provider instantiation, no network). - """ - raw_url = str(url or "").strip() raw_url_lower = raw_url.lower() try: @@ -219,11 +350,6 @@ def match_provider_name_for_url(url: str) -> Optional[str]: host = "" path = "" - # Prefer Internet Archive for archive.org links unless the URL clearly refers - # to a borrow/loan flow (handled by OpenLibrary provider). - # - # This keeps direct downloads and item pages routed to `internetarchive`, while - # preserving OpenLibrary's scripted borrow pipeline for loan/reader URLs. def _norm_host(h: str) -> str: h_norm = str(h or "").strip().lower() if h_norm.startswith("www."): @@ -234,47 +360,45 @@ def match_provider_name_for_url(url: str) -> Optional[str]: if host_norm: if host_norm == "openlibrary.org" or host_norm.endswith(".openlibrary.org"): - return "openlibrary" if "openlibrary" in _PROVIDERS else None + return "openlibrary" if REGISTRY.has_name("openlibrary") else None if host_norm == "archive.org" or host_norm.endswith(".archive.org"): low_path = str(path or "").lower() is_borrowish = ( - low_path.startswith("/borrow/") or low_path.startswith("/stream/") - or low_path.startswith("/services/loans/") or "/services/loans/" in low_path + low_path.startswith("/borrow/") + or low_path.startswith("/stream/") + or low_path.startswith("/services/loans/") + or "/services/loans/" in low_path ) if is_borrowish: - return "openlibrary" if "openlibrary" in _PROVIDERS else None - return "internetarchive" if "internetarchive" in _PROVIDERS else None + return "openlibrary" if REGISTRY.has_name("openlibrary") else None + return "internetarchive" if REGISTRY.has_name("internetarchive") else None - for name, provider_class in _PROVIDERS.items(): - domains = _provider_url_patterns(provider_class) + for info in REGISTRY.iter_providers(): + domains = _provider_url_patterns(info.provider_class) if not domains: continue - for d in domains: - dom_raw = str(d or "").strip() + for domain in domains: + dom_raw = str(domain or "").strip() dom = dom_raw.lower() if not dom: continue - # Scheme-like patterns (magnet:, http://example) still use prefix match. if dom.startswith("magnet:") or dom.startswith("http://") or dom.startswith("https://"): if raw_url_lower.startswith(dom): - return name + return info.canonical_name continue dom_norm = _norm_host(dom) if not dom_norm or not host_norm: continue if host_norm == dom_norm or host_norm.endswith("." 
+ dom_norm): - return name + return info.canonical_name return None def get_provider_for_url(url: str, - config: Optional[Dict[str, - Any]] = None) -> Optional[Provider]: - """Instantiate and return the matching provider for a URL, if any.""" - + config: Optional[Dict[str, Any]] = None) -> Optional[Provider]: name = match_provider_name_for_url(url) if not name: return None @@ -282,10 +406,12 @@ def get_provider_for_url(url: str, __all__ = [ - "SearchResult", + "ProviderInfo", "Provider", "SearchProvider", "FileProvider", + "SearchResult", + "register_provider", "get_provider", "list_providers", "get_search_provider", @@ -294,7 +420,7 @@ __all__ = [ "list_file_providers", "match_provider_name_for_url", "get_provider_for_url", - "download_soulseek_file", "get_provider_class", "selection_auto_stage_for_table", + "download_soulseek_file", ] diff --git a/SYS/download.py b/SYS/download.py deleted file mode 100644 index b66e2c4..0000000 --- a/SYS/download.py +++ /dev/null @@ -1,1116 +0,0 @@ -"""Download media files using yt-dlp with support for direct file downloads. - -Lean, focused downloader without event infrastructure overhead. -- yt-dlp integration for streaming sites -- Direct file download fallback for PDFs, images, documents -- Tag extraction via metadata.extract_ytdlp_tags() -- Logging via helper.logger.log() -""" - -from __future__ import annotations - -import glob # noqa: F401 -import hashlib -import json # noqa: F401 -import random -import re -import string -import subprocess -import sys -import time -import traceback -from pathlib import Path -from typing import Any, Dict, Iterator, List, Optional -from urllib.parse import urljoin, urlparse - -import httpx - -from SYS.logger import log, debug -from SYS.utils import ensure_directory, sha256_file -from API.HTTP import HTTPClient -from SYS.models import DownloadError, DownloadOptions, DownloadMediaResult, DebugLogger, ProgressBar - -try: - import yt_dlp # type: ignore - from yt_dlp.extractor import gen_extractors # type: ignore -except Exception as exc: - yt_dlp = None # type: ignore - YTDLP_IMPORT_ERROR = exc -else: - YTDLP_IMPORT_ERROR = None - -try: - from SYS.metadata import extract_ytdlp_tags -except ImportError: - extract_ytdlp_tags = None - -_EXTRACTOR_CACHE: List[Any] | None = None -_YTDLP_PROGRESS = ProgressBar() - - -def _ensure_yt_dlp_ready() -> None: - """Verify yt-dlp is available, raise if not.""" - if yt_dlp is not None: - return - detail = str(YTDLP_IMPORT_ERROR or "yt-dlp is not installed") - raise DownloadError(f"yt-dlp module not available: {detail}") - - -def _progress_callback(status: Dict[str, Any]) -> None: - """Simple progress callback using logger.""" - event = status.get("status") - if event == "downloading": - downloaded = status.get("downloaded_bytes") - total = status.get("total_bytes") or status.get("total_bytes_estimate") - _YTDLP_PROGRESS.update( - downloaded=int(downloaded or 0), - total=int(total) if total else None, - label="download", - file=sys.stderr, - ) - elif event == "finished": - _YTDLP_PROGRESS.finish() - debug(f"✓ Download finished: {status.get('filename')}") - elif event in ("postprocessing", "processing"): - debug(f"Post-processing: {status.get('postprocessor')}") - - -def is_url_supported_by_ytdlp(url: str) -> bool: - """Check if URL is supported by yt-dlp.""" - if yt_dlp is None: - return False - global _EXTRACTOR_CACHE - if _EXTRACTOR_CACHE is None: - try: - _EXTRACTOR_CACHE = [ie for ie in gen_extractors()] # type: ignore[arg-type] - except Exception: - _EXTRACTOR_CACHE = [] - for 
extractor in _EXTRACTOR_CACHE: - try: - if not extractor.suitable(url): - continue - except Exception: - continue - name = getattr(extractor, "IE_NAME", "") - if name.lower() == "generic": - continue - return True - return False - - -def list_formats( - url: str, - no_playlist: bool = False, - playlist_items: Optional[str] = None -) -> Optional[List[Dict[str, - Any]]]: - """Get list of available formats for a URL using yt-dlp.""" - _ensure_yt_dlp_ready() - - try: - ydl_opts = { - "quiet": True, - "no_warnings": True, - "socket_timeout": 30, - } - - if no_playlist: - ydl_opts["noplaylist"] = True - - if playlist_items: - ydl_opts["playlist_items"] = playlist_items - - with yt_dlp.YoutubeDL(ydl_opts) as ydl: - debug(f"Fetching format list for: {url}") - info = ydl.extract_info(url, download=False) - - formats = info.get("formats", []) - if not formats: - log("No formats available", file=sys.stderr) - return None - - result_formats = [] - for fmt in formats: - result_formats.append( - { - "format_id": fmt.get("format_id", - ""), - "format": fmt.get("format", - ""), - "ext": fmt.get("ext", - ""), - "resolution": fmt.get("resolution", - ""), - "width": fmt.get("width"), - "height": fmt.get("height"), - "fps": fmt.get("fps"), - "vcodec": fmt.get("vcodec", - "none"), - "acodec": fmt.get("acodec", - "none"), - "filesize": fmt.get("filesize"), - "tbr": fmt.get("tbr"), - } - ) - - debug(f"Found {len(result_formats)} available formats") - return result_formats - - except Exception as e: - log(f"✗ Error fetching formats: {e}", file=sys.stderr) - return None - - -def _download_with_sections_via_cli( - url: str, - ytdl_options: Dict[str, - Any], - sections: List[str], - quiet: bool = False -) -> tuple[Optional[str], - Dict[str, - Any]]: - """Download each section separately so merge-file can combine them. - - yt-dlp with multiple --download-sections args merges them into one file. - We need separate files for merge-file, so download each section individually. - - Uses hash-based filenames for sections (not title-based) to prevent yt-dlp from - thinking sections are already downloaded. The title is extracted and stored in tags. 
- - Returns: - (session_id, first_section_info_dict) - session_id for finding files, info dict for metadata extraction - """ - - sections_list = ytdl_options.get("download_sections", []) - if not sections_list: - return "", {} - - # Generate a unique hash-based ID for this download session - # This ensures different videos/downloads don't have filename collisions - session_id = hashlib.md5( - (url + str(time.time()) + "".join(random.choices(string.ascii_letters, - k=10))).encode() - ).hexdigest()[:12] - - first_section_info = None - title_from_first = None - - # Download each section separately with unique output template using session ID - for section_idx, section in enumerate(sections_list, 1): - # Build unique output template for this section using session-based filename - # e.g., "{session_id}_{section_idx}.ext" - simple and unique per section - base_outtmpl = ytdl_options.get("outtmpl", "%(title)s.%(ext)s") - output_dir_path = Path(base_outtmpl).parent - - # Use session_id + section index for temp filename - # e.g., "/path/{session_id}_1.%(ext)s" - filename_tmpl = f"{session_id}_{section_idx}" - if base_outtmpl.endswith(".%(ext)s"): - filename_tmpl += ".%(ext)s" - - # Use Path to handle separators correctly for the OS - section_outtmpl = str(output_dir_path / filename_tmpl) - - # For the first section, extract metadata first (separate call) - if section_idx == 1: - metadata_cmd = ["yt-dlp", "--dump-json", "--skip-download"] - if ytdl_options.get("cookiefile"): - cookies_path = ytdl_options["cookiefile"].replace("\\", "/") - metadata_cmd.extend(["--cookies", cookies_path]) - if ytdl_options.get("noplaylist"): - metadata_cmd.append("--no-playlist") - metadata_cmd.append(url) - - try: - meta_result = subprocess.run( - metadata_cmd, - capture_output=True, - text=True - ) - if meta_result.returncode == 0 and meta_result.stdout: - try: - info_dict = json.loads(meta_result.stdout.strip()) - first_section_info = info_dict - title_from_first = info_dict.get("title") - if not quiet: - debug(f"Extracted title from metadata: {title_from_first}") - except json.JSONDecodeError: - if not quiet: - debug("Could not parse JSON metadata") - except Exception as e: - if not quiet: - debug(f"Error extracting metadata: {e}") - - # Build yt-dlp command for downloading this section - cmd = ["yt-dlp"] - - # Add format - if ytdl_options.get("format"): - cmd.extend(["-f", ytdl_options["format"]]) - - # Add ONLY this section (not all sections) - cmd.extend(["--download-sections", section]) - - # Add force-keyframes-at-cuts if specified - if ytdl_options.get("force_keyframes_at_cuts"): - cmd.append("--force-keyframes-at-cuts") - - # Add output template for this section - cmd.extend(["-o", section_outtmpl]) - - # Add cookies file if present - if ytdl_options.get("cookiefile"): - # Convert backslashes to forward slashes for better compatibility - cookies_path = ytdl_options["cookiefile"].replace("\\", "/") - cmd.extend(["--cookies", cookies_path]) - - # Add no-playlist if specified - if ytdl_options.get("noplaylist"): - cmd.append("--no-playlist") - - # Add the URL - cmd.append(url) - - if not quiet: - debug( - f"Running yt-dlp for section {section_idx}/{len(sections_list)}: {section}" - ) - debug(f"Command: {' '.join(cmd)}") - - # Run the subprocess - don't capture output so progress is shown - try: - result = subprocess.run(cmd) - - if result.returncode != 0: - raise DownloadError( - f"yt-dlp subprocess failed for section {section_idx} with code {result.returncode}" - ) - except Exception as exc: - raise 
DownloadError( - f"yt-dlp subprocess error for section {section_idx}: {exc}" - ) from exc - - return session_id, first_section_info or {} - - -def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]: - """Build yt-dlp download options.""" - ensure_directory(opts.output_dir) - - # Build output template - # When downloading sections, each section will have .section_N_of_M added by _download_with_sections_via_cli - outtmpl = str((opts.output_dir / "%(title)s.%(ext)s").resolve()) - - base_options: Dict[str, - Any] = { - "outtmpl": outtmpl, - "quiet": True, - "no_warnings": True, - "noprogress": True, - "socket_timeout": 30, - "retries": 10, - "fragment_retries": 10, - "http_chunk_size": 10_485_760, - "restrictfilenames": True, - "progress_hooks": [] if opts.quiet else [_progress_callback], - } - - if opts.cookies_path and opts.cookies_path.is_file(): - base_options["cookiefile"] = str(opts.cookies_path) - else: - # Fallback to browser cookies - base_options["cookiesfrombrowser"] = ("chrome", - ) - - # Add no-playlist option if specified (for single video from playlist url) - if opts.no_playlist: - base_options["noplaylist"] = True - - # Configure based on mode - if opts.mode == "audio": - base_options["format"] = opts.ytdl_format or "251/140/bestaudio" - base_options["postprocessors"] = [{ - "key": "FFmpegExtractAudio" - }] - else: # video - base_options["format"] = opts.ytdl_format or "bestvideo+bestaudio/best" - base_options["format_sort"] = [ - "res:4320", - "res:2880", - "res:2160", - "res:1440", - "res:1080", - "res:720", - "res", - ] - - # Add clip sections if provided (yt-dlp will download only these sections) - if opts.clip_sections: - # Parse section ranges like "48-65,120-152,196-205" (seconds) - # and convert to yt-dlp format: "*HH:MM:SS-HH:MM:SS,*HH:MM:SS-HH:MM:SS" - sections = [] - for section_range in opts.clip_sections.split(","): - try: - start_str, end_str = section_range.strip().split("-") - start_sec = float(start_str) - end_sec = float(end_str) - - # Convert seconds to HH:MM:SS format - def sec_to_hhmmss(seconds): - hours = int(seconds // 3600) - minutes = int((seconds % 3600) // 60) - secs = int(seconds % 60) - return f"{hours:02d}:{minutes:02d}:{secs:02d}" - - start_time = sec_to_hhmmss(start_sec) - end_time = sec_to_hhmmss(end_sec) - sections.append(f"*{start_time}-{end_time}") - except (ValueError, AttributeError): - pass - - if sections: - # Pass each section as a separate element in the list (yt-dlp expects multiple --download-sections args) - base_options["download_sections"] = sections - debug(f"Download sections configured: {', '.join(sections)}") - # Note: Not using --force-keyframes-at-cuts to avoid re-encoding - # This may result in less precise cuts but faster downloads - - # Add playlist items selection if provided - if opts.playlist_items: - base_options["playlist_items"] = opts.playlist_items - - if not opts.quiet: - debug(f"yt-dlp: mode={opts.mode}, format={base_options.get('format')}") - return base_options - - -def _iter_download_entries(info: Dict[str, Any]) -> Iterator[Dict[str, Any]]: - """Iterate through download entries, handling playlists.""" - queue: List[Dict[str, Any]] = [info] - seen: set[int] = set() - while queue: - current = queue.pop(0) - obj_id = id(current) - if obj_id in seen: - continue - seen.add(obj_id) - entries = current.get("entries") - if isinstance(entries, list): - for entry in entries: - if isinstance(entry, dict): - queue.append(entry) - if current.get("requested_downloads") or not entries: - yield current - - -def 
_candidate_paths(entry: Dict[str, Any], output_dir: Path) -> Iterator[Path]: - """Get candidate file paths for downloaded media.""" - requested = entry.get("requested_downloads") - if isinstance(requested, list): - for item in requested: - if isinstance(item, dict): - for key in ("filepath", "_filename", "filename"): - value = item.get(key) - if value: - yield Path(value) - for key in ("filepath", "_filename", "filename"): - value = entry.get(key) - if value: - yield Path(value) - if entry.get("filename"): - yield output_dir / entry["filename"] - - -def _resolve_entry_and_path(info: Dict[str, - Any], - output_dir: Path) -> tuple[Dict[str, - Any], - Path]: - """Find downloaded file in yt-dlp metadata.""" - for entry in _iter_download_entries(info): - for candidate in _candidate_paths(entry, output_dir): - if candidate.is_file(): - return entry, candidate - if not candidate.is_absolute(): - resolved = output_dir / candidate - if resolved.is_file(): - return entry, resolved - raise FileNotFoundError("yt-dlp did not report a downloaded media file") - - -def _extract_sha256(info: Dict[str, Any]) -> Optional[str]: - """Extract SHA256 hash from yt-dlp metadata.""" - for payload in [info] + info.get("entries", []): - if not isinstance(payload, dict): - continue - hashes = payload.get("hashes") - if isinstance(hashes, dict): - for key in ("sha256", "sha-256", "sha_256"): - value = hashes.get(key) - if isinstance(value, str) and value.strip(): - return value.strip().lower() - for key in ("sha256", "sha-256", "sha_256"): - value = payload.get(key) - if isinstance(value, str) and value.strip(): - return value.strip().lower() - return None - - -def _get_libgen_download_url(libgen_url: str) -> Optional[str]: - """Extract the actual download link from LibGen redirect URL. - - LibGen url like https://libgen.gl/file.php?id=123456 redirect to - actual mirror url. This follows the redirect chain to get the real file. 
- - Args: - libgen_url: LibGen file.php URL - - Returns: - Actual download URL or None if extraction fails - """ - try: - import requests - from urllib.parse import urlparse - - # Check if this is a LibGen URL - parsed = urlparse(libgen_url) - if "libgen" not in parsed.netloc.lower(): - return None - - if "/file.php" not in parsed.path.lower(): - return None - - # LibGen redirects to actual mirrors, follow redirects to get final URL - session = requests.Session() - session.headers.update( - { - "User-Agent": - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" - } - ) - - debug(f"Following LibGen redirect chain for: {libgen_url}") - - # First, get the page and look for direct download link - try: - response = session.get(libgen_url, timeout=10, allow_redirects=True) - final_url = response.url - - # Try to find actual download link in the page - try: - try: - from lxml import html as lxml_html - except ImportError: - lxml_html = None - - if lxml_html is not None: - doc = lxml_html.fromstring(response.content) - for a in doc.xpath("//a[@href]"): - href = str(a.get("href") or "").strip() - if not href: - continue - - href_lower = href.lower() - if "get.php" in href_lower or href_lower.endswith((".pdf", - ".epub", - ".djvu", - ".mobi")): - download_url = ( - href if href.startswith("http") else - urljoin(final_url, - href) - ) - debug(f"Found download link: {download_url}") - return download_url - else: - # Regex fallback - for m in re.finditer( - r"href=[\"\']([^\"\']+)[\"\']", - response.text or "", - flags=re.IGNORECASE, - ): - href = str(m.group(1) or "").strip() - if not href or href.lower().startswith("javascript:"): - continue - href_lower = href.lower() - if "get.php" in href_lower or href_lower.endswith((".pdf", - ".epub", - ".djvu", - ".mobi")): - download_url = ( - href if href.startswith("http") else - urljoin(final_url, - href) - ) - debug(f"Found download link: {download_url}") - return download_url - except Exception: - pass - - # If we followed redirects successfully, return the final URL - # This handles cases where libgen redirects to a direct download mirror - if final_url != libgen_url: - debug(f"LibGen resolved to mirror: {final_url}") - return final_url - - except requests.RequestException as e: - log(f"Error following LibGen redirects: {e}", file=sys.stderr) - # Try head request as fallback - try: - response = session.head(libgen_url, allow_redirects=True, timeout=10) - if response.url != libgen_url: - debug(f"LibGen HEAD resolved to: {response.url}") - return response.url - except: - pass - - return None - - except Exception as e: - log(f"Error resolving LibGen URL: {e}", file=sys.stderr) - return None - - -def _download_direct_file( - url: str, - output_dir: Path, - debug_logger: Optional[DebugLogger] = None, - quiet: bool = False, - suggested_filename: Optional[str] = None, - pipeline_progress: Optional[Any] = None, -) -> DownloadMediaResult: - """Download a direct file (PDF, image, document, etc.) without yt-dlp.""" - ensure_directory(output_dir) - - from urllib.parse import unquote, urlparse, parse_qs - import re - - def _sanitize_filename(name: str) -> str: - # Windows-safe filename sanitization. - # Keep it simple: strip path parts, drop invalid chars, collapse whitespace. 
- text = str(name or "").strip() - if not text: - return "" - # Remove any path components - text = text.replace("/", "\\") - text = text.split("\\")[-1] - - invalid = set('<>:"/\\|?*') - cleaned_chars: List[str] = [] - for ch in text: - o = ord(ch) - if o < 32: - cleaned_chars.append(" ") - continue - if ch in invalid: - cleaned_chars.append(" ") - continue - cleaned_chars.append(ch) - cleaned = " ".join("".join(cleaned_chars).split()).strip() - # Avoid trailing dots/spaces on Windows - cleaned = cleaned.rstrip(" .") - return cleaned - - def _unique_path(path: Path) -> Path: - if not path.exists(): - return path - stem = path.stem - suffix = path.suffix - parent = path.parent - for i in range(1, 10_000): - candidate = parent / f"{stem} ({i}){suffix}" - if not candidate.exists(): - return candidate - return parent / f"{stem} ({int(time.time())}){suffix}" - - # Extract filename from URL - parsed_url = urlparse(url) - url_path = parsed_url.path - - # Try to get filename from query parameters first (for LibGen and similar services) - # e.g., ?filename=Book+Title.pdf or &download=filename.pdf - filename = None - if parsed_url.query: - query_params = parse_qs(parsed_url.query) - for param_name in ("filename", "download", "file", "name"): - if param_name in query_params and query_params[param_name]: - filename = query_params[param_name][0] - filename = unquote(filename) - break - - # If not found in query params, extract from URL path - if not filename or not filename.strip(): - filename = url_path.split("/")[-1] if url_path else "" - filename = unquote(filename) - - # Remove query strings from filename if any - if "?" in filename: - filename = filename.split("?")[0] - - # Try to get real filename from Content-Disposition header (HEAD request) - content_type = "" - try: - with HTTPClient(timeout=10.0) as client: - response = client._request("HEAD", url, follow_redirects=True) - content_disposition = response.headers.get("content-disposition", "") - try: - content_type = str(response.headers.get("content-type", - "") or "").strip().lower() - except Exception: - content_type = "" - if content_disposition: - # Extract filename from Content-Disposition header - # Format: attachment; filename="filename.pdf" or filename=filename.pdf - match = re.search( - r'filename\*?=(?:"([^"]*)"|([^;\s]*))', - content_disposition - ) - if match: - extracted_name = match.group(1) or match.group(2) - if extracted_name: - filename = unquote(extracted_name) - if not quiet: - debug(f"Filename from Content-Disposition: {filename}") - except Exception as e: - if not quiet: - log(f"Could not get filename from headers: {e}", file=sys.stderr) - - # Guardrail: never treat HTML landing pages as downloadable files. - # We explicitly probe with GET for page-like endpoints (e.g. *.php) since some - # servers block/lie on HEAD, and a URL path like `edition.php` would otherwise - # be saved as a bogus file. 
- try: - page_like_exts = {".php", - ".asp", - ".aspx", - ".jsp", - ".cgi"} - ext = "" - try: - ext = Path(str(filename or "")).suffix.lower() - except Exception: - ext = "" - - ct0 = (content_type or "").split(";", 1)[0].strip().lower() - must_probe = bool(ct0.startswith("text/html") or ext in page_like_exts) - - if must_probe: - with HTTPClient(timeout=10.0) as client: - with client._request_stream("GET", url, follow_redirects=True) as resp: - resp.raise_for_status() - ct = ( - str(resp.headers.get("content-type", - "") or "").split(";", - 1)[0].strip().lower() - ) - if ct.startswith("text/html"): - raise DownloadError( - "URL appears to be an HTML page, not a direct file" - ) - except DownloadError: - raise - except Exception: - # If we can't probe, keep going; later logic may still infer a safe extension. - pass - - # Apply suggested filename (from provider title) if given. - suggested = _sanitize_filename(suggested_filename) if suggested_filename else "" - if suggested: - # Preserve extension from suggested name if present; otherwise borrow from detected filename. - suggested_path = Path(suggested) - if suggested_path.suffix: - filename = suggested - else: - detected_ext = "" - try: - detected_ext = Path(str(filename)).suffix - except Exception: - detected_ext = "" - if detected_ext: - filename = suggested + detected_ext - else: - filename = suggested - - # If we still don't have an extension, try to infer one from Content-Type. - # Never fall back to a generic `.bin` extension. - try: - has_ext = bool(filename and Path(str(filename)).suffix) - except Exception: - has_ext = False - - if filename and (not has_ext): - ct = (content_type or "").split(";")[0].strip().lower() - ext_by_ct = { - "application/pdf": ".pdf", - "application/epub+zip": ".epub", - "application/x-mobipocket-ebook": ".mobi", - "image/jpeg": ".jpg", - "image/png": ".png", - "image/webp": ".webp", - "image/gif": ".gif", - "text/plain": ".txt", - "application/zip": ".zip", - } - - if ct in ext_by_ct: - filename = f"{filename}{ext_by_ct[ct]}" - elif ct.startswith("text/html"): - # Guardrail: HTML landing pages should not be downloaded as opaque files. - raise DownloadError("URL appears to be an HTML page, not a direct file") - - # Final guardrail: if filename is empty, refuse rather than inventing `download.bin`. - if not filename or not str(filename).strip(): - raise DownloadError( - "Could not determine filename for URL (no Content-Disposition and no path filename)" - ) - - file_path = _unique_path(output_dir / filename) - - # Prefer pipeline transfer bars when a Live UI is active. 
- use_pipeline_transfer = False - try: - if pipeline_progress is not None and hasattr(pipeline_progress, - "update_transfer"): - ui = None - if hasattr(pipeline_progress, "ui_and_pipe_index"): - ui, _ = pipeline_progress.ui_and_pipe_index() # type: ignore[attr-defined] - use_pipeline_transfer = ui is not None - except Exception: - use_pipeline_transfer = False - - progress_bar: Optional[ProgressBar] = None - if (not quiet) and (not use_pipeline_transfer): - progress_bar = ProgressBar() - - if not quiet: - debug(f"Direct download: {filename}") - - try: - start_time = time.time() - downloaded_bytes = [0] - total_bytes = [0] - last_progress_time = [start_time] - rendered_once = [False] - transfer_started = [False] - - def _maybe_begin_transfer(content_length: int) -> None: - if pipeline_progress is None: - return - if transfer_started[0]: - return - try: - total_val: Optional[int] = ( - int(content_length) - if isinstance(content_length, - int) and content_length > 0 else None - ) - except Exception: - total_val = None - try: - if hasattr(pipeline_progress, "begin_transfer"): - pipeline_progress.begin_transfer( - label=str(filename or "download"), - total=total_val - ) - transfer_started[0] = True - except Exception: - return - - def progress_callback(bytes_downloaded: int, content_length: int) -> None: - downloaded_bytes[0] = bytes_downloaded - total_bytes[0] = content_length - - # Update pipeline transfer bar when present. - try: - if pipeline_progress is not None and hasattr(pipeline_progress, - "update_transfer"): - _maybe_begin_transfer(content_length) - total_val: Optional[int] = ( - int(content_length) - if isinstance(content_length, - int) and content_length > 0 else None - ) - pipeline_progress.update_transfer( - label=str(filename or "download"), - completed=int(bytes_downloaded) - if bytes_downloaded is not None else None, - total=total_val, - ) - except Exception: - pass - - now = time.time() - is_final = bool(content_length > 0 and bytes_downloaded >= content_length) - if (not rendered_once[0]) or is_final: - pass - elif now - last_progress_time[0] < 0.5: - return - - elapsed = now - start_time - percent = ( - bytes_downloaded / content_length - ) * 100 if content_length > 0 else 0 - speed = bytes_downloaded / elapsed if elapsed > 0 else 0 - eta_str: Optional[str] = None - if content_length > 0 and speed > 0: - try: - eta_seconds = max( - 0.0, - float(content_length - bytes_downloaded) / float(speed) - ) - minutes, seconds = divmod(int(eta_seconds), 60) - hours, minutes = divmod(minutes, 60) - eta_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}" - except Exception: - eta_str = None - - if progress_bar is not None: - progress_bar.update( - downloaded=bytes_downloaded, - total=content_length if content_length > 0 else None, - label=str(filename or "download"), - file=sys.stderr, - ) - - rendered_once[0] = True - - last_progress_time[0] = now - - with HTTPClient(timeout=30.0) as client: - client.download(url, str(file_path), progress_callback=progress_callback) - - elapsed = time.time() - start_time - - try: - if progress_bar is not None: - progress_bar.finish() - except Exception: - pass - - try: - if (pipeline_progress is not None and transfer_started[0] - and hasattr(pipeline_progress, - "finish_transfer")): - pipeline_progress.finish_transfer(label=str(filename or "download")) - except Exception: - pass - - try: - if progress_bar is not None: - avg_speed_str = ( - progress_bar. 
- format_bytes(downloaded_bytes[0] / elapsed if elapsed > 0 else 0) + - "/s" - ) - else: - avg_speed_str = f"{(downloaded_bytes[0] / elapsed if elapsed > 0 else 0):.1f} B/s" - except Exception: - avg_speed_str = "" - if not quiet: - debug(f"✓ Downloaded in {elapsed:.1f}s at {avg_speed_str}") - - # For direct file downloads, create minimal info dict without filename as title - # This prevents creating duplicate title: tags when filename gets auto-generated - # We'll add title back later only if we couldn't extract meaningful tags - ext = "" - try: - ext = Path(str(filename)).suffix.lstrip(".") - except Exception: - ext = "" - - info = { - "id": str(filename).rsplit(".", - 1)[0] if "." in str(filename) else str(filename), - "ext": ext, - "webpage_url": url, - } - - hash_value = None - try: - hash_value = sha256_file(file_path) - except Exception: - pass - - tags = [] - if extract_ytdlp_tags: - try: - tags = extract_ytdlp_tags(info) - except Exception as e: - log(f"Error extracting tags: {e}", file=sys.stderr) - - # Only use filename as a title tag if we couldn't extract any meaningful tags - # This prevents duplicate title: tags when the filename could be mistaken for metadata - if not any(t.startswith("title:") for t in tags): - # Re-extract tags with filename as title only if needed - info["title"] = filename - tags = [] - if extract_ytdlp_tags: - try: - tags = extract_ytdlp_tags(info) - except Exception as e: - log(f"Error extracting tags with filename: {e}", file=sys.stderr) - - if debug_logger is not None: - debug_logger.write_record( - "direct-file-downloaded", - { - "url": url, - "path": str(file_path), - "hash": hash_value - }, - ) - - return DownloadMediaResult( - path=file_path, - info=info, - tag=tags, - source_url=url, - hash_value=hash_value, - ) - - except (httpx.HTTPError, httpx.RequestError) as exc: - try: - if progress_bar is not None: - progress_bar.finish() - except Exception: - pass - try: - if (pipeline_progress is not None and transfer_started[0] - and hasattr(pipeline_progress, - "finish_transfer")): - pipeline_progress.finish_transfer(label=str(filename or "download")) - except Exception: - pass - log(f"Download error: {exc}", file=sys.stderr) - if debug_logger is not None: - debug_logger.write_record( - "exception", - { - "phase": "direct-file", - "url": url, - "error": str(exc) - }, - ) - raise DownloadError(f"Failed to download {url}: {exc}") from exc - except Exception as exc: - try: - if progress_bar is not None: - progress_bar.finish() - except Exception: - pass - try: - if (pipeline_progress is not None and transfer_started[0] - and hasattr(pipeline_progress, - "finish_transfer")): - pipeline_progress.finish_transfer(label=str(filename or "download")) - except Exception: - pass - log(f"Error downloading file: {exc}", file=sys.stderr) - if debug_logger is not None: - debug_logger.write_record( - "exception", - { - "phase": "direct-file", - "url": url, - "error": str(exc), - "traceback": traceback.format_exc(), - }, - ) - raise DownloadError(f"Error downloading file: {exc}") from exc - - -def probe_url(url: str, - no_playlist: bool = False, - timeout_seconds: int = 15) -> Optional[Dict[str, - Any]]: - """Probe URL to extract metadata WITHOUT downloading. - - Args: - url: URL to probe - no_playlist: If True, ignore playlists and probe only the single video - timeout_seconds: Max seconds to wait for probe (default 15s) - - Returns: - Dict with keys: extractor, title, entries (if playlist), duration, etc. - Returns None if not supported by yt-dlp or on timeout. 
- """ - if not is_url_supported_by_ytdlp(url): - return None - - # Wrap probe in timeout to prevent hanging on large playlists - import threading - from typing import cast - - result_container: List[Optional[Any]] = [None, None] # [result, error] - - def _do_probe() -> None: - try: - _ensure_yt_dlp_ready() - - assert yt_dlp is not None - # Extract info without downloading - # Use extract_flat='in_playlist' to get full metadata for playlist items - ydl_opts = { - "quiet": True, # Suppress all output - "no_warnings": True, - "socket_timeout": 10, - "retries": 2, # Reduce retries for faster timeout - "skip_download": True, # Don't actually download - "extract_flat": "in_playlist", # Get playlist with metadata for each entry - "noprogress": True, # No progress bars - } - - # Cookies are optional for probing; callers should pass cookiefile via DownloadOptions when needed. - - # Add no_playlist option if specified - if no_playlist: - ydl_opts["noplaylist"] = True - - with yt_dlp.YoutubeDL(ydl_opts) as ydl: # type: ignore[arg-type] - info = ydl.extract_info(url, download=False) - - if not isinstance(info, dict): - result_container[0] = None - return - - # Extract relevant fields - result_container[0] = { - "extractor": info.get("extractor", ""), - "title": info.get("title", ""), - "entries": info.get("entries", []), # Will be populated if playlist - "duration": info.get("duration"), - "uploader": info.get("uploader"), - "description": info.get("description"), - "url": url, - } - except Exception as exc: - log(f"Probe error for {url}: {exc}") - result_container[1] = exc - - thread = threading.Thread(target=_do_probe, daemon=False) - thread.start() - thread.join(timeout=timeout_seconds) - - if thread.is_alive(): - # Probe timed out - return None to fall back to direct download - debug( - f"Probe timeout for {url} (>={timeout_seconds}s), proceeding with download" - ) - return None - - if result_container[1] is not None: - # Probe error - return None to proceed anyway - return None - - return cast(Optional[Dict[str, Any]], result_container[0]) - - -__all__ = [ - "is_url_supported_by_ytdlp", - "list_formats", - "probe_url", - "DownloadError", - "DownloadOptions", - "DownloadMediaResult", -] diff --git a/SYS/metadata.py b/SYS/metadata.py index c9f1623..a932837 100644 --- a/SYS/metadata.py +++ b/SYS/metadata.py @@ -8,895 +8,26 @@ from urllib.parse import urlsplit, urlunsplit, unquote from collections import deque from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple + +from API.HydrusNetwork import apply_hydrus_tag_mutation, fetch_hydrus_metadata, fetch_hydrus_metadata_by_url from SYS.models import FileRelationshipTracker -try: - import musicbrainzngs # type: ignore -except ImportError: # pragma: no cover - musicbrainzngs = None - -from imdbinfo.services import get_movie # type: ignore - -try: - import yt_dlp # type: ignore -except ImportError: # pragma: no cover +try: # Optional; used when available for richer metadata fetches + import yt_dlp +except Exception: # pragma: no cover - optional dependency yt_dlp = None -try: - from SYS.config import load_config, resolve_output_dir # type: ignore -except ImportError: # pragma: no cover - load_config = None # type: ignore[assignment] - resolve_output_dir = None # type: ignore[assignment] +try: # Optional; used for IMDb lookup without API key + from imdbinfo.services import search_title # type: ignore +except Exception: # pragma: no cover - optional dependency + search_title = None # type: ignore[assignment] 
-try: - from SYS.utils import sha256_file -except ImportError: # pragma: no cover - sha256_file = None # type: ignore[assignment] -try: # Optional metadata helper for audio files - import mutagen # type: ignore -except ImportError: # pragma: no cover - best effort - mutagen = None # type: ignore +def value_normalize(value: Any) -> str: + text = str(value).strip() + return text.lower() if text else "" -from SYS.utils import sanitize_metadata_value, unique_preserve_order -try: - from helpers.hydrus import HydrusClient, HydrusRequestError, HydrusRequestSpec # type: ignore -except ImportError: # pragma: no cover - HydrusClient = None # type: ignore[assignment] - HydrusRequestError = RuntimeError # type: ignore[assignment] - HydrusRequestSpec = None # type: ignore[assignment] -if musicbrainzngs: # pragma: no branch - musicbrainzngs.set_useragent("DownlowScript", "0.1", "admin@example.com") - MusicBrainzRequestError = getattr( - musicbrainzngs, - "MusicBrainzRequestError", - Exception - ) -else: # pragma: no cover - MusicBrainzRequestError = Exception - -# Global relationship tracker for the current session -_CURRENT_RELATIONSHIP_TRACKER = FileRelationshipTracker() - - -def prepare_ffmpeg_metadata(payload: Optional[Dict[str, Any]]) -> Dict[str, str]: - """Build ffmpeg/mutagen metadata map from payload.""" - if not isinstance(payload, dict): - return {} - - metadata: Dict[str, - str] = {} - - def set_field(key: str, raw: Any, limit: int = 2000) -> None: - sanitized = sanitize_metadata_value(raw) - if not sanitized: - return - if len(sanitized) > limit: - sanitized = sanitized[:limit] - metadata[key] = sanitized - - set_field("title", payload.get("title")) - set_field("artist", payload.get("artist"), 512) - set_field("album", payload.get("album"), 512) - set_field("date", payload.get("year"), 20) - - comment = payload.get("comment") - tags_value = payload.get("tag") - - tag_strings: List[str] = [] - artists_from_tags: List[str] = [] - albums_from_tags: List[str] = [] - genres_from_tags: List[str] = [] - - if isinstance(tags_value, list): - for raw_tag in tags_value: - if raw_tag is None: - continue - if not isinstance(raw_tag, str): - raw_tag = str(raw_tag) - tag = raw_tag.strip() - if not tag: - continue - - tag_strings.append(tag) - namespace, sep, value = tag.partition(":") - if sep and value: - ns = namespace.strip().lower() - value = value.strip() - if ns in {"artist", - "creator", - "author", - "performer"}: - artists_from_tags.append(value) - elif ns in {"album", - "series", - "collection", - "group"}: - albums_from_tags.append(value) - elif ns in {"genre", - "rating"}: - genres_from_tags.append(value) - elif ns in {"comment", - "description"} and not comment: - comment = value - elif ns in {"year", - "date"} and not payload.get("year"): - set_field("date", value, 20) - else: - genres_from_tags.append(tag) - - if "artist" not in metadata and artists_from_tags: - set_field( - "artist", - ", ".join(unique_preserve_order(artists_from_tags)[:3]), - 512 - ) - if "album" not in metadata and albums_from_tags: - set_field("album", unique_preserve_order(albums_from_tags)[0], 512) - if genres_from_tags: - set_field("genre", ", ".join(unique_preserve_order(genres_from_tags)[:5]), 256) - - if tag_strings: - joined_tags = ", ".join(tag_strings[:50]) - set_field("keywords", joined_tags, 2000) - if not comment: - comment = joined_tags - - if comment: - set_field("comment", comment, 2000) - set_field("description", comment, 2000) - - return metadata - - -def apply_mutagen_metadata(path: Path, metadata: 
Dict[str, str], fmt: str) -> None: - """Best-effort metadata writing for audio containers.""" - if fmt != "audio": - return - if not metadata: - return - if mutagen is None: - return - - try: - audio = mutagen.File(path, easy=True) # type: ignore[attr-defined] - except Exception as exc: # pragma: no cover - best effort only - log(f"mutagen load failed: {exc}", file=sys.stderr) - return - - if audio is None: - return - - field_map = { - "title": "title", - "artist": "artist", - "album": "album", - "genre": "genre", - "comment": "comment", - "description": "comment", - "date": "date", - } - - changed = False - for source_key, target_key in field_map.items(): - value = metadata.get(source_key) - if not value: - continue - try: - audio[target_key] = [value] - changed = True - except Exception: # pragma: no cover - continue - - if not changed: - return - - try: - audio.save() - except Exception as exc: # pragma: no cover - log(f"mutagen save failed: {exc}", file=sys.stderr) - - -def build_ffmpeg_command( - ffmpeg_path: str, - input_path: Path, - output_path: Path, - fmt: str, - max_width: int, - metadata: Optional[Dict[str, - str]] = None, -) -> List[str]: - """Build an ffmpeg command line for common export formats.""" - cmd: List[str] = [ffmpeg_path, "-y", "-i", str(input_path)] - - if fmt in {"mp4", - "webm"} and max_width and max_width > 0: - cmd.extend(["-vf", f"scale='min({max_width},iw)':-2"]) - - if metadata: - for key, value in metadata.items(): - cmd.extend(["-metadata", f"{key}={value}"]) - - # Video formats - if fmt == "mp4": - cmd.extend( - [ - "-c:v", - "libx265", - "-preset", - "medium", - "-crf", - "26", - "-tag:v", - "hvc1", - "-pix_fmt", - "yuv420p", - "-c:a", - "aac", - "-b:a", - "192k", - "-movflags", - "+faststart", - ] - ) - elif fmt == "webm": - cmd.extend( - [ - "-c:v", - "libvpx-vp9", - "-b:v", - "0", - "-crf", - "32", - "-c:a", - "libopus", - "-b:a", - "160k", - ] - ) - cmd.extend(["-f", "webm"]) - - # Audio formats - elif fmt == "mp3": - cmd.extend(["-vn", "-c:a", "libmp3lame", "-b:a", "192k"]) - cmd.extend(["-f", "mp3"]) - elif fmt == "flac": - cmd.extend(["-vn", "-c:a", "flac"]) - cmd.extend(["-f", "flac"]) - elif fmt == "wav": - cmd.extend(["-vn", "-c:a", "pcm_s16le"]) - cmd.extend(["-f", "wav"]) - elif fmt == "aac": - cmd.extend(["-vn", "-c:a", "aac", "-b:a", "192k"]) - cmd.extend(["-f", "adts"]) - elif fmt == "m4a": - cmd.extend(["-vn", "-c:a", "aac", "-b:a", "192k"]) - cmd.extend(["-f", "ipod"]) - elif fmt == "ogg": - cmd.extend(["-vn", "-c:a", "libvorbis", "-b:a", "192k"]) - cmd.extend(["-f", "ogg"]) - elif fmt == "opus": - cmd.extend(["-vn", "-c:a", "libopus", "-b:a", "192k"]) - cmd.extend(["-f", "opus"]) - elif fmt == "audio": - # Legacy format name for mp3 - cmd.extend(["-vn", "-c:a", "libmp3lame", "-b:a", "192k"]) - cmd.extend(["-f", "mp3"]) - elif fmt != "copy": - raise ValueError(f"Unsupported format: {fmt}") - - cmd.append(str(output_path)) - return cmd - - -def field(obj: Any, name: str, value: Any = None) -> Any: - """Get or set a field on dict or object.""" - if value is None: - if isinstance(obj, dict): - return obj.get(name) - return getattr(obj, name, None) - - if isinstance(obj, dict): - obj[name] = value - else: - setattr(obj, name, value) - return value - - -def _generate_hydrus_url_variants(url: str) -> List[str]: - seen: Set[str] = set() - variants: List[str] = [] - - def push(candidate: Optional[str]) -> None: - if not candidate: - return - text = candidate.strip() - if not text or text in seen: - return - seen.add(text) - 
variants.append(text) - - push(url) - try: - parsed = urlsplit(url) - except Exception: - return variants - - if parsed.scheme in {"http", - "https"}: - alternate_scheme = "https" if parsed.scheme == "http" else "http" - push( - urlunsplit( - ( - alternate_scheme, - parsed.netloc, - parsed.path, - parsed.query, - parsed.fragment - ) - ) - ) - - normalised_netloc = parsed.netloc.lower() - if normalised_netloc and normalised_netloc != parsed.netloc: - push( - urlunsplit( - ( - parsed.scheme, - normalised_netloc, - parsed.path, - parsed.query, - parsed.fragment - ) - ) - ) - - if parsed.path: - trimmed_path = parsed.path.rstrip("/") - if trimmed_path != parsed.path: - push( - urlunsplit( - ( - parsed.scheme, - parsed.netloc, - trimmed_path, - parsed.query, - parsed.fragment - ) - ) - ) - else: - push( - urlunsplit( - ( - parsed.scheme, - parsed.netloc, - parsed.path + "/", - parsed.query, - parsed.fragment - ) - ) - ) - unquoted_path = unquote(parsed.path) - if unquoted_path != parsed.path: - push( - urlunsplit( - ( - parsed.scheme, - parsed.netloc, - unquoted_path, - parsed.query, - parsed.fragment - ) - ) - ) - - if parsed.query or parsed.fragment: - push(urlunsplit((parsed.scheme, parsed.netloc, parsed.path, "", ""))) - if parsed.path: - unquoted_path = unquote(parsed.path) - push(urlunsplit((parsed.scheme, parsed.netloc, unquoted_path, "", ""))) - - return variants - - -def normalize_urls(value: Any) -> List[str]: - """Normalize a URL field into a stable, deduplicated list. - - Accepts: - - None - - a single URL string (optionally containing multiple URLs) - - a list/tuple/set of URL strings - - This helper is used by cmdlets/stores/pipeline to keep `url` consistent. - """ - - def _iter_raw_urls(raw: Any) -> Iterable[str]: - if raw is None: - return - - if isinstance(raw, str): - text = raw.strip() - if not text: - return - # Support legacy prefixes like "url:https://...". - if text.lower().startswith("url:"): - text = text.split(":", 1)[1].strip() - - # Prefer extracting obvious URLs to avoid splitting inside query strings. - matches = re.findall(r"https?://[^\s,]+", text, flags=re.IGNORECASE) - if matches: - for m in matches: - yield m - return - - # Fallback: split on commas/whitespace. - for token in text.replace("\n", - " ").replace("\r", - " ").replace(",", - " ").split(): - if token: - yield token - return - - if isinstance(raw, (list, tuple, set)): - for item in raw: - if item is None: - continue - if isinstance(item, str): - if item.strip(): - yield item - else: - text = str(item).strip() - if text: - yield text - return - - # Last resort: string-coerce. - text = str(raw).strip() - if text: - yield text - - def _canonicalize(url_text: str) -> Optional[str]: - u = str(url_text or "").strip() - if not u: - return None - - # Trim common wrappers and trailing punctuation. - u = u.strip("<>\"' ") - u = u.rstrip(')].,;"') - if not u: - return None - - # IMPORTANT: URLs can be case-sensitive in the path/query on some hosts - # (e.g., https://0x0.st/PzGY.webp). Do not lowercase or otherwise rewrite - # the URL here; preserve exact casing and percent-encoding. 
- return u - - seen: Set[str] = set() - out: List[str] = [] - for raw_url in _iter_raw_urls(value): - canonical = _canonicalize(raw_url) - if not canonical: - continue - if canonical in seen: - continue - seen.add(canonical) - out.append(canonical) - - return out - - -def value_normalize(value: str) -> str: - """Normalize whitespace: collapse internal spaces, strip, remove newlines.""" - value = value.replace("\n", " ").replace("\r", " ") - value = re.sub(r"\s+", " ", value).strip() - return value - - -def import_pending_sidecars(db_root: Path, db: Any) -> None: - """Import pending sidecars (.tag/.metadata/.notes) into the database.""" - try: - sidecar_patterns = ["**/*.tag", "**/*.metadata", "**/*.notes"] - - for pattern in sidecar_patterns: - for sidecar_path in db_root.glob(pattern): - if ".downlow" in sidecar_path.parts: - continue - - try: - base_path = sidecar_path.with_suffix("") - except Exception: - continue - - if not base_path.exists(): - continue - - # Ensure file entry exists (folder store schema is keyed by hash). - file_hash_value: Optional[str] = None - if sha256_file and base_path.exists(): - try: - file_hash_value = sha256_file(base_path) - except Exception: - file_hash_value = None - - if not file_hash_value: - continue - - try: - db_file_path = ( - db._to_db_file_path(base_path) # type: ignore[attr-defined] - if hasattr(db, "_to_db_file_path") - else str(base_path) - ) - except Exception: - db_file_path = str(base_path) - - try: - file_modified = float(base_path.stat().st_mtime) - except Exception: - file_modified = None - - try: - cursor = db.connection.cursor() if db.connection else None - if cursor: - cursor.execute( - "SELECT hash FROM file WHERE file_path = ?", - (str(db_file_path),), - ) - result = cursor.fetchone() - if not result: - cursor.execute( - "INSERT INTO file (hash, file_path, file_modified) VALUES (?, ?, ?)", - (file_hash_value, str(db_file_path), file_modified), - ) - db.connection.commit() - except Exception: - continue - - if sidecar_path.suffix == ".tag": - try: - content = sidecar_path.read_text(encoding="utf-8") - except Exception: - continue - - tags = [ - line.strip() for line in content.splitlines() if line.strip() - ] - if tags: - try: - cursor = db.connection.cursor() if db.connection else None - if cursor: - for tag in tags: - cursor.execute( - "INSERT OR IGNORE INTO tag (hash, tag) VALUES (?, ?)", - (file_hash_value, - tag), - ) - db.connection.commit() - except Exception: - pass - - elif sidecar_path.suffix == ".metadata": - url: List[str] = [] - relationships: List[str] = [] - hash_value: Optional[str] = None - - try: - content = sidecar_path.read_text(encoding="utf-8") - except Exception: - content = "" - - for raw_line in content.splitlines(): - line = raw_line.strip() - if not line or line.startswith("#"): - continue - lower = line.lower() - if lower.startswith("hash:"): - hash_value = line.split(":", 1)[1].strip() or None - elif lower.startswith("url:") or lower.startswith("url:"): - url_part = line.split(":", 1)[1].strip() - if url_part: - for url_segment in url_part.replace(",", " ").split(): - clean = url_segment.strip() - if clean and clean not in url: - url.append(clean) - elif lower.startswith("relationship:"): - rel_value = line.split(":", 1)[1].strip() - if rel_value: - relationships.append(rel_value) - - if sha256_file and base_path.exists(): - try: - hash_value = sha256_file(base_path) - except Exception: - pass - - if not hash_value: - hash_value = file_hash_value - - try: - cursor = db.connection.cursor() if db.connection 
else None - if cursor: - cursor.execute( - 'INSERT OR REPLACE INTO metadata (hash, url, relationships, time_imported, time_modified) VALUES (?, ?, ?, datetime("now"), datetime("now"))', - ( - hash_value, - json.dumps(url), - json.dumps(relationships), - ), - ) - db.connection.commit() - except Exception: - pass - - elif sidecar_path.suffix == ".notes": - try: - content = sidecar_path.read_text(encoding="utf-8").strip() - except Exception: - content = "" - if content: - try: - cursor = db.connection.cursor() if db.connection else None - if cursor: - cursor.execute( - 'INSERT INTO note (hash, name, note, created_at, updated_at) VALUES (?, ?, ?, datetime("now"), datetime("now")) ON CONFLICT(hash, name) DO UPDATE SET note = excluded.note, updated_at = datetime("now")', - (file_hash_value, "default", content), - ) - db.connection.commit() - except Exception: - pass - except Exception: - pass - - -def _extract_from_sequence(values: Sequence) -> Iterable[str]: - """Extract string values from a sequence of mixed types (dicts, strings, etc.).""" - seen = set() - for item in values: - candidate = None - if isinstance(item, dict): - candidate = ( - item.get("name") or item.get("title") or item.get("value") - or item.get("text") or item.get("id") or item.get("imdb_id") - ) - else: - candidate = str(item) - if candidate: - normalized = value_normalize(str(candidate)) - if normalized and normalized not in seen: - seen.add(normalized) - yield normalized - - -def _add_tag(tags: List[str], namespace: str, value: Optional[str]) -> None: - """Add a single namespaced tag (e.g., 'artist:Beatles').""" - if not value: - return - value = value_normalize(str(value)) - if not value: - return - tags.append(f"{namespace}:{value}") - - -def _extend_tags(tags: List[str], namespace: str, values) -> None: - """Extend tags from a single value or sequence, with optional namespace.""" - if not values: - return - if isinstance(values, set): - values = list(values) - if isinstance(values, (list, tuple)): - for candidate in _extract_from_sequence(values): - _add_tag(tags, namespace, candidate) - else: - _add_tag(tags, namespace, values) - - -def imdb_tag(imdb_id: str) -> Dict[str, object]: - movie = get_movie(imdb_id) - if movie is None: - raise ValueError(f"IMDb title not found: {imdb_id}") - if hasattr(movie, "model_dump"): - info = movie.model_dump() - elif hasattr(movie, "dict"): - info = movie.dict() - else: - info = {} - tags: List[str] = [] - canonical_id = getattr(movie, "imdb_id", None) or info.get("imdb_id") or imdb_id - if canonical_id: - canonical_id = str(canonical_id).strip().lower() - if not canonical_id.startswith("tt"): - canonical_id = f"tt{canonical_id}" - else: - canonical_id = imdb_id.lower() - if not canonical_id.startswith("tt"): - canonical_id = f"tt{canonical_id}" - _add_tag(tags, "imdb", canonical_id) - _add_tag(tags, "title", info.get("title") or getattr(movie, "title", None)) - _add_tag( - tags, - "year", - info.get("year") or info.get("start_year") or getattr(movie, - "year", - None) - ) - _add_tag(tags, "rating", info.get("rating")) - runtime_value = None - if isinstance(info.get("runtime"), (str, int)): - runtime_value = info["runtime"] - elif isinstance(info.get("runtimes"), (list, tuple)) and info["runtimes"]: - runtime_value = info["runtimes"][0] - elif info.get("duration"): - runtime_value = info["duration"] - _add_tag(tags, "runtime", runtime_value) - kind = None - if hasattr(movie, "is_series") and movie.is_series(): - kind = "series" - elif hasattr(movie, "is_episode") and 
movie.is_episode(): - kind = "episode" - else: - kind = info.get("kind") or "movie" - _add_tag(tags, "kind", kind) - _extend_tags(tags, "genre", info.get("genres") or info.get("genre")) - _extend_tags(tags, "language", info.get("languages")) - _extend_tags(tags, "country", info.get("countries")) - creators = ( - info.get("directors") or info.get("director") or info.get("producers") - or info.get("writers") - ) - if creators: - _extend_tags(tags, "creator", creators) - info_episode = getattr(movie, "info_episode", None) - series_title = None - season = info.get("season") or info.get("series_season") - episode = info.get("episode") or info.get("series_episode") - if info_episode: - if hasattr(info_episode, "model_dump"): - episode_meta = info_episode.model_dump() - elif hasattr(info_episode, "dict"): - episode_meta = info_episode.dict() - else: - episode_meta = getattr(info_episode, - "__dict__", - {}) or {} - season = season or episode_meta.get("season") or episode_meta.get("season_n") - episode = episode or episode_meta.get("episode" - ) or episode_meta.get("episode_n") - series_title = episode_meta.get("series_title") - if not series_title: - series_title = getattr(getattr(movie, "series_info", None), "title", None) - if kind == "episode" and not season: - season = getattr(getattr(movie, "series_info", None), "season", None) - if season: - _add_tag(tags, "season", season) - if episode: - _add_tag(tags, "episode", episode) - series_title = ( - series_title or info.get("series_title") or info.get("series") - or getattr(getattr(movie, - "series_info", - None), - "title", - None) - ) - if series_title: - _add_tag(tags, "series", series_title) - summary = info.get("plot outline") or info.get("plot_outline") or info.get("plot") - if isinstance(summary, (list, tuple)): - summary = summary[0] if summary else None - if not summary and hasattr(movie, "plot_outline"): - summary = getattr(movie, "plot_outline") - if not summary: - summaries = info.get("summaries") - if isinstance(summaries, (list, tuple)) and summaries: - summary = summaries[0] - if summary: - _add_tag(tags, "summary", summary) - cast_sources = ( - info.get("cast") or info.get("actors") or info.get("cast_members") - or info.get("stars") - ) - cast_names: List[str] = [] - if cast_sources: - for name in _extract_from_sequence(cast_sources): - if name: - cast_names.append(name) - if len(cast_names) >= 10: - break - if cast_names: - _extend_tags(tags, "cast", cast_names) - return { - "source": "imdb", - "id": canonical_id, - "tag": tags - } - - -def fetch_musicbrainz_tags(mbid: str, entity: str) -> Dict[str, object]: - if not musicbrainzngs: - raise RuntimeError("musicbrainzngs package is not available") - entity = entity.lower() - if entity not in {"release", - "recording", - "artist"}: - raise ValueError("Unsupported MusicBrainz entity: %s" % entity) - - def _fetch_with_fallback(getter, key: str, includes: List[str]): - try: - return getter(mbid, includes=includes)[key] - except MusicBrainzRequestError as exc: - if "Bad includes" in str(exc) and "genres" in includes: - fallback = [inc for inc in includes if inc != "genres"] - return getter(mbid, includes=fallback)[key] - raise - - include = ["tags", "genres"] - match entity: - case "release": - include.extend(["artist-credits", "release-groups"]) - data = _fetch_with_fallback( - musicbrainzngs.get_release_by_id, - "release", - include - ) - case "recording": - include.extend(["artists", "releases"]) - data = _fetch_with_fallback( - musicbrainzngs.get_recording_by_id, - "recording", - 
include - ) - case _: - include.extend(["release-groups", "aliases"]) - data = _fetch_with_fallback( - musicbrainzngs.get_artist_by_id, - "artist", - include - ) - tags: List[str] = [] - _add_tag(tags, "musicbrainz", mbid) - _add_tag(tags, "entity", entity) - _add_tag(tags, "title", data.get("title")) - if entity != "artist": - date = data.get("date") or data.get("first-release-date") - if date: - _add_tag(tags, "date", date) - _add_tag(tags, "year", date[:4]) - if data.get("country"): - _add_tag(tags, "country", data["country"]) - if data.get("status"): - _add_tag(tags, "status", data["status"]) - artist_credit = data.get("artist-credit") or data.get("artists") - if artist_credit: - names = [] - for item in artist_credit: - if isinstance(item, dict): - name = item.get("name") or item.get("artist", - {}).get("name") - if name: - names.append(name) - _extend_tags(tags, "artist", names) - tag_list = data.get("tag-list") or data.get("tags") or [] - for tag in tag_list: - if isinstance(tag, dict) and tag.get("name"): - _add_tag(tags, "tag", tag["name"]) - genre_list = data.get("genre-list") or data.get("genres") or [] - for genre in genre_list: - if isinstance(genre, dict) and genre.get("name"): - _add_tag(tags, "genre", genre["name"]) - return { - "source": "musicbrainz", - "id": mbid, - "tag": tags, - "entity": entity - } - - -def _append_unique(target: List[str], seen: Set[str], value: Optional[str]) -> None: - """Append a single value if not already in seen set (deduplication).""" - if value is None: - return +def _append_unique(target: List[str], seen: Set[str], value: Any) -> None: normalized = value_normalize(str(value)) if not normalized or normalized in seen: return @@ -904,6 +35,13 @@ def _append_unique(target: List[str], seen: Set[str], value: Optional[str]) -> N target.append(normalized) +def _normalize_tag(tag: Any) -> Optional[str]: + if tag is None: + return None + normalized = value_normalize(tag) + return normalized or None + + def _extend_namespaced( target: List[str], seen: Set[str], @@ -1089,672 +227,151 @@ def resolve_remote_metadata(payload: Dict[str, Any]) -> Dict[str, Any]: return result -def _ensure_hydrus_client() -> None: - if (HydrusClient is None or HydrusRequestSpec - is None): # pragma: no cover - depends on optional module - raise RuntimeError("Hydrus helpers are unavailable") +def imdb_tag(imdb_id: str, timeout: float = 10.0) -> Dict[str, Any]: + """Fetch IMDb data using imdbinfo (no API key required). + Returns at minimum an imdb: tag. When imdbinfo is installed, enriches + with title/year/type/rating from the first search result for the id. 
+ """ + normalized = value_normalize(imdb_id) + if not normalized: + raise ValueError("imdb_id is required") + if not normalized.startswith("tt"): + normalized = f"tt{normalized}" -def _normalize_hash(value: Any) -> str: - candidate = str(value or "").strip().lower() - if not candidate: - raise ValueError("Hydrus hash is required") - if len(candidate) != 64 or any(ch not in "0123456789abcdef" for ch in candidate): - raise ValueError("Hydrus hash must be a 64-character hex string") - return candidate + tags: List[str] = [] + seen: Set[str] = set() + _append_unique(tags, seen, f"imdb:{normalized}") - -def _normalize_tag(tag: Any) -> Optional[str]: - if tag is None: - return None - if isinstance(tag, str): - candidate = tag.strip() - else: - candidate = str(tag).strip() - return candidate or None - - -def _extract_tag_services(entry: Dict[str, Any]) -> List[Dict[str, Any]]: - tags_section = entry.get("tags") - services: List[Dict[str, Any]] = [] - if not isinstance(tags_section, dict): - return services - names_map = tags_section.get("service_keys_to_names") - if not isinstance(names_map, dict): - names_map = {} - - def get_record(service_key: Optional[str], - service_name: Optional[str]) -> Dict[str, - Any]: - key_lower = service_key.lower() if isinstance(service_key, str) else None - name_lower = service_name.lower() if isinstance(service_name, str) else None - for record in services: - existing_key = record.get("service_key") - if key_lower and isinstance(existing_key, - str) and existing_key.lower() == key_lower: - if service_name and not record.get("service_name"): - record["service_name"] = service_name - return record - existing_name = record.get("service_name") - if (name_lower and isinstance(existing_name, - str) and existing_name.lower() == name_lower): - if service_key and not record.get("service_key"): - record["service_key"] = service_key - return record - record = { - "service_key": service_key, - "service_name": service_name, - "tags": [], - } - services.append(record) - return record - - def _iter_current_status_lists(container: Any) -> Iterable[List[Any]]: - if isinstance(container, dict): - for status_key, tags_list in container.items(): - if str(status_key) != "0": - continue - if isinstance(tags_list, list): - yield tags_list - elif isinstance(container, list): - yield container - - statuses_map = tags_section.get("service_keys_to_statuses_to_tags") - if isinstance(statuses_map, dict): - for service_key, status_map in statuses_map.items(): - record = get_record( - service_key if isinstance(service_key, - str) else None, - names_map.get(service_key) - ) - for tags_list in _iter_current_status_lists(status_map): - for tag in tags_list: - normalized = _normalize_tag(tag) - if normalized: - record["tags"].append(normalized) - - ignored_keys = { - "service_keys_to_statuses_to_tags", - "service_keys_to_statuses_to_display_tags", - "service_keys_to_display_friendly_tags", - "service_keys_to_names", - "tag_display_types_to_namespaces", - "namespace_display_string_lookup", - "tag_display_decoration_colour_lookup", + result: Dict[str, Any] = { + "id": normalized, + "tag": tags, } - for key, service in tags_section.items(): - if key in ignored_keys: - continue - if isinstance(service, dict): - service_key = service.get("service_key" - ) or (key if isinstance(key, - str) else None) - service_name = ( - service.get("service_name") or service.get("name") - or names_map.get(service_key) - ) - record = get_record( - service_key if isinstance(service_key, - str) else None, - service_name - 
) - storage = ( - service.get("storage_tags") or service.get("statuses_to_tags") - or service.get("tags") - ) - if isinstance(storage, dict): - for tags_list in _iter_current_status_lists(storage): - for tag in tags_list: - normalized = _normalize_tag(tag) - if normalized: - record["tags"].append(normalized) - elif isinstance(storage, list): - for tag in storage: - normalized = _normalize_tag(tag) - if normalized: - record["tags"].append(normalized) - - # Use canonical dedup function - for record in services: - record["tags"] = dedup_tags_by_namespace(record["tags"], keep_first=True) - return services - - -def _select_primary_tags( - services: List[Dict[str, - Any]], - aggregated: List[str], - prefer_service: Optional[str] -) -> Tuple[Optional[str], - List[str]]: - prefer_lower = prefer_service.lower() if isinstance(prefer_service, str) else None - if prefer_lower: - for record in services: - name = record.get("service_name") - if isinstance(name, - str) and name.lower() == prefer_lower and record["tags"]: - return record.get("service_key"), record["tags"] - for record in services: - if record["tags"]: - return record.get("service_key"), record["tags"] - return None, aggregated - - -def _derive_title( - tags_primary: List[str], - tags_aggregated: List[str], - entry: Dict[str, - Any] -) -> Optional[str]: - for source in (tags_primary, tags_aggregated): - for tag in source: - namespace, sep, value = tag.partition(":") - if sep and namespace and namespace.lower() == "title": - cleaned = value.strip() - if cleaned: - return cleaned - for key in ( - "title", - "display_name", - "pretty_name", - "original_display_filename", - "original_filename", - ): - value = entry.get(key) - if isinstance(value, str): - cleaned = value.strip() - if cleaned: - return cleaned - return None - - -def _derive_clip_time( - tags_primary: List[str], - tags_aggregated: List[str], - entry: Dict[str, - Any] -) -> Optional[str]: - namespaces = {"clip", - "clip_time", - "cliptime"} - for source in (tags_primary, tags_aggregated): - for tag in source: - namespace, sep, value = tag.partition(":") - if sep and namespace and namespace.lower() in namespaces: - cleaned = value.strip() - if cleaned: - return cleaned - clip_value = entry.get("clip_time") - if isinstance(clip_value, str): - cleaned_clip = clip_value.strip() - if cleaned_clip: - return cleaned_clip - return None - - -def _summarize_hydrus_entry( - entry: Dict[str, - Any], - prefer_service: Optional[str] -) -> Tuple[Dict[str, - Any], - List[str], - Optional[str], - Optional[str], - Optional[str]]: - services = _extract_tag_services(entry) - aggregated: List[str] = [] - seen: Set[str] = set() - for record in services: - for tag in record["tags"]: - if tag not in seen: - seen.add(tag) - aggregated.append(tag) - service_key, primary_tags = _select_primary_tags(services, aggregated, prefer_service) - title = _derive_title(primary_tags, aggregated, entry) - clip_time = _derive_clip_time(primary_tags, aggregated, entry) - summary = dict(entry) - if title and not summary.get("title"): - summary["title"] = title - if clip_time and not summary.get("clip_time"): - summary["clip_time"] = clip_time - summary["tag_service_key"] = service_key - summary["has_current_file_service"] = _has_current_file_service(entry) - if "is_local" not in summary: - summary["is_local"] = bool(entry.get("is_local")) - return summary, primary_tags, service_key, title, clip_time - - -def _looks_like_hash(value: Any) -> bool: - if not isinstance(value, str): - return False - candidate = 
value.strip().lower() - return len(candidate) == 64 and all(ch in "0123456789abcdef" for ch in candidate) - - -def _collect_relationship_hashes(payload: Any, accumulator: Set[str]) -> None: - if isinstance(payload, dict): - for value in payload.values(): - _collect_relationship_hashes(value, accumulator) - elif isinstance(payload, (list, tuple, set)): - for value in payload: - _collect_relationship_hashes(value, accumulator) - elif isinstance(payload, str) and _looks_like_hash(payload): - accumulator.add(payload) - - -def _build_hydrus_query( - hashes: Optional[Sequence[str]], - file_ids: Optional[Sequence[int]], - include_relationships: bool, - minimal: bool, -) -> Dict[str, - str]: - query: Dict[str, - str] = {} - if hashes: - query["hashes"] = json.dumps(list(hashes)) - if file_ids: - query["file_ids"] = json.dumps([int(value) for value in file_ids]) - if not query: - raise ValueError("hashes or file_ids must be provided") - query["include_service_keys_to_tags"] = json.dumps(True) - query["include_tag_services"] = json.dumps(True) - query["include_file_services"] = json.dumps(True) - if include_relationships: - query["include_file_relationships"] = json.dumps(True) - if not minimal: - extras = ( - "include_url", - "include_size", - "include_width", - "include_height", - "include_duration", - "include_mime", - "include_has_audio", - "include_is_trashed", - ) - for key in extras: - query[key] = json.dumps(True) - return query - - -def _fetch_hydrus_entries( - client: Any, - hashes: Optional[Sequence[str]], - file_ids: Optional[Sequence[int]], - include_relationships: bool, - minimal: bool, -) -> List[Dict[str, - Any]]: - if not hashes and not file_ids: - return [] - assert HydrusRequestSpec is not None - spec = HydrusRequestSpec( - method="GET", - endpoint="/get_files/file_metadata", - query=_build_hydrus_query(hashes, - file_ids, - include_relationships, - minimal), - ) - response = client._perform_request(spec) # type: ignore[attr-defined] - metadata = response.get("metadata") if isinstance(response, dict) else None - if isinstance(metadata, list): - return [entry for entry in metadata if isinstance(entry, dict)] - return [] - - -def _has_current_file_service(entry: Dict[str, Any]) -> bool: - services = entry.get("file_services") - if not isinstance(services, dict): - return False - current = services.get("current") - if isinstance(current, dict): - for value in current.values(): - if value: - return True - return False - if isinstance(current, list): - return len(current) > 0 - return False - - -def _compute_file_flags(entry: Dict[str, Any]) -> Tuple[bool, bool, bool]: - mime = entry.get("mime") - mime_lower = mime.lower() if isinstance(mime, str) else "" - is_video = mime_lower.startswith("video/") - is_audio = mime_lower.startswith("audio/") - is_deleted = False - if entry.get("is_trashed"): - is_deleted = True - file_services = entry.get("file_services") - if not is_deleted and isinstance(file_services, dict): - deleted = file_services.get("deleted") - if isinstance(deleted, dict) and deleted: - is_deleted = True - return is_video, is_audio, is_deleted - - -def fetch_hydrus_metadata(payload: Dict[str, Any]) -> Dict[str, Any]: - _ensure_hydrus_client() - assert HydrusClient is not None - hash_hex = None - raw_hash_value = payload.get("hash") - if raw_hash_value is not None: - hash_hex = _normalize_hash(raw_hash_value) - file_ids: List[int] = [] - raw_file_ids = payload.get("file_ids") - if isinstance(raw_file_ids, (list, tuple, set)): - for value in raw_file_ids: - try: - 
file_ids.append(int(value)) - except (TypeError, ValueError): - continue - elif raw_file_ids is not None: - try: - file_ids.append(int(raw_file_ids)) - except (TypeError, ValueError): - file_ids = [] - raw_file_id = payload.get("file_id") - if raw_file_id is not None: - try: - coerced = int(raw_file_id) - except (TypeError, ValueError): - coerced = None - if coerced is not None and coerced not in file_ids: - file_ids.append(coerced) - base_url = str(payload.get("api_url") or "").strip() - if not base_url: - raise ValueError("Hydrus api_url is required") - access_key = str(payload.get("access_key") or "").strip() - options_raw = payload.get("options") - options = options_raw if isinstance(options_raw, - dict) else {} - prefer_service = options.get("prefer_service_name") - if isinstance(prefer_service, str): - prefer_service = prefer_service.strip() - else: - prefer_service = None - include_relationships = bool(options.get("include_relationships")) - minimal = bool(options.get("minimal")) - timeout = float(options.get("timeout") or 60.0) - client = HydrusClient(base_url, access_key, timeout) - hashes: Optional[List[str]] = None - if hash_hex: - hashes = [hash_hex] - if not hashes and not file_ids: - raise ValueError("Hydrus hash or file id is required") - try: - entries = _fetch_hydrus_entries( - client, - hashes, - file_ids or None, - include_relationships, - minimal - ) - except HydrusRequestError as exc: # type: ignore[misc] - raise RuntimeError(str(exc)) - if not entries: - response: Dict[str, - Any] = { - "hash": hash_hex, - "metadata": {}, - "tags": [], - "warnings": - [f"No Hydrus metadata for {hash_hex or file_ids}"], - "error": "not_found", - } - if file_ids: - response["file_id"] = file_ids[0] - return response - entry = entries[0] - if not hash_hex: - entry_hash = entry.get("hash") - if isinstance(entry_hash, str) and entry_hash: - hash_hex = entry_hash - hashes = [hash_hex] - summary, primary_tags, service_key, title, clip_time = _summarize_hydrus_entry( - entry, prefer_service - ) - is_video, is_audio, is_deleted = _compute_file_flags(entry) - has_current_file_service = _has_current_file_service(entry) - is_local = bool(entry.get("is_local")) - size_bytes = entry.get("size") or entry.get("file_size") - filesize_mb = None - if isinstance(size_bytes, (int, float)) and size_bytes > 0: - filesize_mb = float(size_bytes) / (1024.0 * 1024.0) - duration = entry.get("duration") - if duration is None and isinstance(entry.get("duration_ms"), (int, float)): - duration = float(entry["duration_ms"]) / 1000.0 - warnings: List[str] = [] - if not primary_tags: - warnings.append("No tags returned for preferred service") - relationships = None - relationship_metadata: Dict[str, - Dict[str, - Any]] = {} - if include_relationships and hash_hex: - try: - assert HydrusRequestSpec is not None - rel_spec = HydrusRequestSpec( - method="GET", - endpoint="/manage_file_relationships/get_file_relationships", - query={ - "hash": hash_hex - }, - ) - relationships = client._perform_request( - rel_spec - ) # type: ignore[attr-defined] - except HydrusRequestError as exc: # type: ignore[misc] - warnings.append(f"Relationship lookup failed: {exc}") - relationships = None - if isinstance(relationships, dict): - related_hashes: Set[str] = set() - _collect_relationship_hashes(relationships, related_hashes) - related_hashes.discard(hash_hex) - if related_hashes: - try: - related_entries = _fetch_hydrus_entries( - client, - sorted(related_hashes), - None, - False, - True - ) - except HydrusRequestError as exc: # type: 
ignore[misc] - warnings.append(f"Relationship metadata fetch failed: {exc}") - else: - for rel_entry in related_entries: - rel_hash = rel_entry.get("hash") - if not isinstance(rel_hash, str): - continue - rel_summary, rel_tags, _, rel_title, rel_clip = _summarize_hydrus_entry( - rel_entry, prefer_service - ) - rel_summary["tags"] = rel_tags - if rel_title: - rel_summary["title"] = rel_title - if rel_clip: - rel_summary["clip_time"] = rel_clip - relationship_metadata[rel_hash] = rel_summary - result: Dict[str, - Any] = { - "hash": entry.get("hash") or hash_hex, - "metadata": summary, - "tags": primary_tags, - "tag_service_key": service_key, - "title": title, - "clip_time": clip_time, - "duration": duration, - "filesize_mb": filesize_mb, - "is_video": is_video, - "is_audio": is_audio, - "is_deleted": is_deleted, - "is_local": is_local, - "has_current_file_service": has_current_file_service, - "matched_hash": entry.get("hash") or hash_hex, - "swap_recommended": False, - } - file_id_value = entry.get("file_id") - if isinstance(file_id_value, (int, float)): - result["file_id"] = int(file_id_value) - if relationships is not None: - result["relationships"] = relationships - if relationship_metadata: - result["relationship_metadata"] = relationship_metadata - if warnings: - result["warnings"] = warnings - return result - - -def fetch_hydrus_metadata_by_url(payload: Dict[str, Any]) -> Dict[str, Any]: - _ensure_hydrus_client() - assert HydrusClient is not None - raw_url = payload.get("url") or payload.get("source_url") - url = str(raw_url or "").strip() - if not url: - raise ValueError("URL is required to fetch Hydrus metadata by URL") - base_url = str(payload.get("api_url") or "").strip() - if not base_url: - raise ValueError("Hydrus api_url is required") - access_key = str(payload.get("access_key") or "").strip() - options_raw = payload.get("options") - options = options_raw if isinstance(options_raw, - dict) else {} - timeout = float(options.get("timeout") or 60.0) - client = HydrusClient(base_url, access_key, timeout) - hashes: Optional[List[str]] = None - file_ids: Optional[List[int]] = None - matched_url = None - normalised_reported = None - seen: Set[str] = set() - queue = deque() - for variant in _generate_hydrus_url_variants(url): - queue.append(variant) - if not queue: - queue.append(url) - tried_variants: List[str] = [] - while queue: - candidate = queue.popleft() - candidate = str(candidate or "").strip() - if not candidate or candidate in seen: - continue - seen.add(candidate) - tried_variants.append(candidate) - assert HydrusRequestSpec is not None - spec = HydrusRequestSpec( - method="GET", - endpoint="/add_urls/get_url_files", - query={ - "url": candidate - }, - ) - try: - response = client._perform_request(spec) # type: ignore[attr-defined] - except HydrusRequestError as exc: # type: ignore[misc] - raise RuntimeError(str(exc)) - response_hashes_list: List[str] = [] - response_file_ids_list: List[int] = [] - if isinstance(response, dict): - normalised_value = response.get("normalised_url") - if isinstance(normalised_value, str): - trimmed = normalised_value.strip() - if trimmed: - normalised_reported = normalised_reported or trimmed - if trimmed not in seen: - queue.append(trimmed) - for redirect_key in ("redirect_url", "url"): - redirect_value = response.get(redirect_key) - if isinstance(redirect_value, str): - redirect_trimmed = redirect_value.strip() - if redirect_trimmed and redirect_trimmed not in seen: - queue.append(redirect_trimmed) - raw_hashes = response.get("hashes") or 
response.get("file_hashes") - if isinstance(raw_hashes, list): - for item in raw_hashes: - try: - normalized = _normalize_hash(item) - except ValueError: - continue - if normalized: - response_hashes_list.append(normalized) - raw_ids = response.get("file_ids") or response.get("file_id") - if isinstance(raw_ids, list): - for item in raw_ids: - try: - response_file_ids_list.append(int(item)) - except (TypeError, ValueError): - continue - elif raw_ids is not None: - try: - response_file_ids_list.append(int(raw_ids)) - except (TypeError, ValueError): - pass - statuses = response.get("url_file_statuses") - if isinstance(statuses, list): - for entry in statuses: - if not isinstance(entry, dict): - continue - status_hash = entry.get("hash") or entry.get("file_hash") - if status_hash: - try: - normalized = _normalize_hash(status_hash) - except ValueError: - normalized = None - if normalized: - response_hashes_list.append(normalized) - status_id = entry.get("file_id") or entry.get("fileid") - if status_id is not None: - try: - response_file_ids_list.append(int(status_id)) - except (TypeError, ValueError): - continue - if response_hashes_list: - hashes = response_hashes_list - if response_file_ids_list: - file_ids = response_file_ids_list - if hashes or file_ids: - matched_url = candidate - break - if not hashes and not file_ids: - result = { - "found": False, - "url": url, - "variants": tried_variants, - "metadata": {}, - "tags": [], - "warnings": [f"No Hydrus file found for {url}"], - "error": "not_found", - } - if normalised_reported: - result["normalised_url"] = normalised_reported + if search_title is None: + result["warnings"] = ["imdbinfo is not installed; returning minimal IMDb tag"] return result - hash_value = str(hashes[0]) if hashes else None - followup_payload: Dict[str, - Any] = { - "api_url": base_url, - "access_key": access_key, - "options": options, - } - if hash_value: - followup_payload["hash"] = hash_value - if file_ids: - followup_payload["file_id"] = file_ids[0] - result = fetch_hydrus_metadata(followup_payload) - result["found"] = True - result["url"] = url - if matched_url and matched_url != url: - result["matched_url"] = matched_url - if file_ids: - result["file_id"] = file_ids[0] - if normalised_reported: - result["normalised_url"] = normalised_reported - result["variants"] = tried_variants + + try: + search_result = search_title(normalized, timeout=timeout) + except Exception as exc: # pragma: no cover - network dependent + result["warnings"] = [f"IMDb lookup failed: {exc}"] + return result + + titles = getattr(search_result, "titles", None) or [] + if not titles: + result["warnings"] = ["IMDb lookup returned no data"] + return result + + entry = titles[0] + title = getattr(entry, "title", None) or getattr(entry, "title_localized", None) + year = getattr(entry, "year", None) + kind = getattr(entry, "kind", None) + rating = getattr(entry, "rating", None) + + if title: + _append_unique(tags, seen, f"title:{title}") + if year: + _append_unique(tags, seen, f"year:{year}") + if kind: + _append_unique(tags, seen, f"type:{kind}") + if rating: + _append_unique(tags, seen, f"rating:{rating}") + + result["metadata"] = { + "title": title, + "year": year, + "type": kind, + "rating": rating, + } + result["tag"] = tags return result +def normalize_urls(value: Any) -> List[str]: + """Normalize a URL field into a stable, deduplicated list. 
+ + Accepts: + - None + - a single URL string (optionally containing multiple URLs) + - a list/tuple/set of URL strings + + This helper is used by cmdlets/stores/pipeline to keep `url` consistent. + """ + + def _iter_raw_urls(raw: Any) -> Iterable[str]: + if raw is None: + return + + if isinstance(raw, str): + text = raw.strip() + if not text: + return + # Support legacy prefixes like "url:https://...". + if text.lower().startswith("url:"): + text = text.split(":", 1)[1].strip() + + # Prefer extracting obvious URLs to avoid splitting inside query strings. + matches = re.findall(r"https?://[^\s,]+", text, flags=re.IGNORECASE) + if matches: + for m in matches: + yield m + return + + # Fallback: split on commas/whitespace. + for token in text.replace("\n", + " ").replace("\r", + " ").replace(",", + " ").split(): + if token: + yield token + return + + if isinstance(raw, (list, tuple, set)): + for item in raw: + if item is None: + continue + if isinstance(item, str): + if item.strip(): + yield item + else: + text = str(item).strip() + if text: + yield text + return + + # Last resort: string-coerce. + text = str(raw).strip() + if text: + yield text + + def _canonicalize(url_text: str) -> Optional[str]: + u = str(url_text or "").strip() + if not u: + return None + + # Trim common wrappers and trailing punctuation. + u = u.strip("<>\"' ") + u = u.rstrip(')].,;"') + if not u: + return None + + # IMPORTANT: URLs can be case-sensitive in the path/query on some hosts + # (e.g., https://0x0.st/PzGY.webp). Do not lowercase or otherwise rewrite + # the URL here; preserve exact casing and percent-encoding. + return u + + seen: Set[str] = set() + out: List[str] = [] + for raw_url in _iter_raw_urls(value): + canonical = _canonicalize(raw_url) + if not canonical: + continue + if canonical in seen: + continue + seen.add(canonical) + out.append(canonical) + + return out def _normalise_string_list(values: Optional[Iterable[Any]]) -> List[str]: if not values: @@ -2136,122 +753,6 @@ def sync_sidecar(payload: Dict[str, Any]) -> Dict[str, Any]: } -def _build_hydrus_context( - payload: Dict[str, - Any] -) -> Tuple[Any, - str, - str, - float, - Optional[str]]: - _ensure_hydrus_client() - assert HydrusClient is not None - base_url = str(payload.get("api_url") or "").strip() - if not base_url: - raise ValueError("Hydrus api_url is required") - access_key = str(payload.get("access_key") or "").strip() - options_raw = payload.get("options") - options = options_raw if isinstance(options_raw, - dict) else {} - timeout = float(options.get("timeout") or payload.get("timeout") or 60.0) - prefer_service = payload.get("prefer_service_name" - ) or options.get("prefer_service_name") - if isinstance(prefer_service, str): - prefer_service = prefer_service.strip() or None - else: - prefer_service = None - client = HydrusClient(base_url, access_key, timeout) - return client, base_url, access_key, timeout, prefer_service - - -def _refetch_hydrus_summary( - base_url: str, - access_key: str, - hash_hex: str, - timeout: float, - prefer_service: Optional[str] -) -> Dict[str, - Any]: - payload: Dict[str, - Any] = { - "hash": hash_hex, - "api_url": base_url, - "access_key": access_key, - "options": { - "minimal": True, - "include_relationships": False, - "timeout": timeout, - }, - } - if prefer_service: - payload["options"]["prefer_service_name"] = prefer_service - return fetch_hydrus_metadata(payload) - - -def _apply_hydrus_tag_mutation( - payload: Dict[str, - Any], - add: Iterable[Any], - remove: Iterable[Any] -) -> Dict[str, - Any]: - 
client, base_url, access_key, timeout, prefer_service = _build_hydrus_context(payload) - hash_hex = _normalize_hash(payload.get("hash")) - add_list = [_normalize_tag(tag) for tag in add if _normalize_tag(tag)] - remove_list = [_normalize_tag(tag) for tag in remove if _normalize_tag(tag)] - if not add_list and not remove_list: - raise ValueError("No tag changes supplied") - service_key = payload.get("service_key") or payload.get("tag_service_key") - summary = None - if not service_key: - summary = _refetch_hydrus_summary( - base_url, - access_key, - hash_hex, - timeout, - prefer_service - ) - service_key = summary.get("tag_service_key") - if not isinstance(service_key, str) or not service_key: - raise RuntimeError("Unable to determine Hydrus tag service key") - actions: Dict[str, - List[str]] = {} - if add_list: - actions["0"] = [tag for tag in add_list if tag] - if remove_list: - actions["1"] = [tag for tag in remove_list if tag] - if not actions: - raise ValueError("Tag mutation produced no actionable changes") - request_payload = { - "hashes": [hash_hex], - "service_keys_to_actions_to_tags": { - service_key: actions, - }, - } - try: - assert HydrusRequestSpec is not None - tag_spec = HydrusRequestSpec( - method="POST", - endpoint="/add_tags/add_tags", - data=request_payload, - ) - client._perform_request(tag_spec) - except HydrusRequestError as exc: # type: ignore[misc] - raise RuntimeError(str(exc)) - summary_after = _refetch_hydrus_summary( - base_url, - access_key, - hash_hex, - timeout, - prefer_service - ) - result = dict(summary_after) - result["added_tags"] = actions.get("0", []) - result["removed_tags"] = actions.get("1", []) - result["tag_service_key"] = summary_after.get("tag_service_key") - return result - - def apply_tag_mutation(payload: Dict[str, Any], operation: str = "add") -> Dict[str, @@ -2274,13 +775,13 @@ def apply_tag_mutation(payload: Dict[str, new_tag = _normalize_tag(payload.get("new_tag")) if not new_tag: raise ValueError("new_tag is required") - result = _apply_hydrus_tag_mutation(payload, [new_tag], []) + result = apply_hydrus_tag_mutation(payload, [new_tag], []) result["added"] = True return result else: # update old_tag = _normalize_tag(payload.get("old_tag")) new_tag = _normalize_tag(payload.get("new_tag")) - result = _apply_hydrus_tag_mutation( + result = apply_hydrus_tag_mutation( payload, [new_tag] if new_tag else [], [old_tag] if old_tag else [] @@ -3683,7 +2184,7 @@ def enrich_playlist_entries(entries: list, extractor: str) -> list: List of enriched entry dicts """ # Import here to avoid circular dependency - from SYS.download import is_url_supported_by_ytdlp + from tool.ytdlp import is_url_supported_by_ytdlp if not entries: return entries diff --git a/SYS/tasks.py b/SYS/tasks.py deleted file mode 100644 index 1bd097c..0000000 --- a/SYS/tasks.py +++ /dev/null @@ -1,234 +0,0 @@ -"""Background task handling and IPC helpers for mpv integration.""" - -from __future__ import annotations -import errno -import json -import os -import socket -import subprocess -import sys - -from SYS.logger import log -import threading -import time -from typing import IO, Iterable - - -def connect_ipc(path: str, timeout: float = 5.0) -> IO[bytes] | None: - """Connect to the mpv IPC server located at *path*.""" - deadline = time.time() + timeout - if not path: - return None - if os.name == "nt": - # mpv exposes a named pipe on Windows. Keep retrying until it is ready. 
- while True: - try: - return open(path, "r+b", buffering=0) - except FileNotFoundError: - if time.time() > deadline: - return None - time.sleep(0.05) - except OSError as exc: # Pipe busy - # Windows named pipes can intermittently raise EINVAL while the pipe exists - # but is not ready/accepting connections yet. - if exc.errno not in (errno.ENOENT, - errno.EPIPE, - errno.EBUSY, - errno.EINVAL): - raise - if time.time() > deadline: - return None - time.sleep(0.05) - else: - sock = socket.socket(socket.AF_UNIX) - while True: - try: - sock.connect(path) - return sock.makefile("r+b", buffering=0) - except FileNotFoundError: - if time.time() > deadline: - return None - time.sleep(0.05) - except OSError as exc: - if exc.errno not in (errno.ENOENT, errno.ECONNREFUSED): - raise - if time.time() > deadline: - return None - time.sleep(0.05) - - -def ipc_sender(ipc: IO[bytes] | None): - """Create a helper function for sending script messages via IPC.""" - if ipc is None: - - def _noop(_event: str, _payload: dict) -> None: - return None - - return _noop - lock = threading.Lock() - - def _send(event: str, payload: dict) -> None: - message = json.dumps( - { - "command": ["script-message", - event, - json.dumps(payload)] - }, - ensure_ascii=False - ) - encoded = message.encode("utf-8") + b"\n" - with lock: - try: - ipc.write(encoded) - ipc.flush() - except OSError: - pass - - return _send - - -def iter_stream(stream: Iterable[str]) -> Iterable[str]: - for raw in stream: - yield raw.rstrip("\r\n") - - -def _run_task(args, parser) -> int: - if not args.command: - parser.error( - 'run-task requires a command to execute (use "--" before the command).' - ) - env = os.environ.copy() - for entry in args.env: - key, sep, value = entry.partition("=") - if not sep: - parser.error(f"Invalid environment variable definition: {entry!r}") - env[key] = value - command = list(args.command) - if command and command[0] == "--": - command.pop(0) - notifier = ipc_sender(connect_ipc(args.ipc, timeout=args.ipc_timeout)) - if not command: - notifier( - "downlow-task-event", - { - "id": args.task_id, - "event": "error", - "message": "No command provided after separator", - }, - ) - log("[downlow.py] No command provided for run-task", file=sys.stderr) - return 1 - if command and isinstance(command[0], str) and sys.executable: - first = command[0].lower() - if first in {"python", - "python3", - "py", - "python.exe", - "python3.exe", - "py.exe"}: - command[0] = sys.executable - if os.environ.get("DOWNLOW_DEBUG"): - log(f"Launching command: {command}", file=sys.stderr) - notifier( - "downlow-task-event", - { - "id": args.task_id, - "event": "start", - "command": command, - "cwd": args.cwd or os.getcwd(), - }, - ) - - popen_kwargs = {} - if os.name == "nt": - # Avoid flashing a console window when spawning console-subsystem executables. 
- flags = 0 - try: - flags |= int(getattr(subprocess, "CREATE_NO_WINDOW", 0x08000000)) - except Exception: - flags |= 0x08000000 - popen_kwargs["creationflags"] = flags - try: - si = subprocess.STARTUPINFO() - si.dwFlags |= subprocess.STARTF_USESHOWWINDOW - si.wShowWindow = subprocess.SW_HIDE - popen_kwargs["startupinfo"] = si - except Exception: - pass - try: - process = subprocess.Popen( - command, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - cwd=args.cwd or None, - env=env, - text=True, - bufsize=1, - universal_newlines=True, - **popen_kwargs, - ) - except FileNotFoundError as exc: - notifier( - "downlow-task-event", - { - "id": args.task_id, - "event": "error", - "message": f"Executable not found: {exc.filename}", - }, - ) - log(f"{exc}", file=sys.stderr) - return 1 - stdout_lines: list[str] = [] - stderr_lines: list[str] = [] - - def pump(stream: IO[str], label: str, sink: list[str]) -> None: - for line in iter_stream(stream): - sink.append(line) - notifier( - "downlow-task-event", - { - "id": args.task_id, - "event": label, - "line": line, - }, - ) - - threads = [] - if process.stdout: - t_out = threading.Thread( - target=pump, - args=(process.stdout, - "stdout", - stdout_lines), - daemon=True - ) - t_out.start() - threads.append(t_out) - if process.stderr: - t_err = threading.Thread( - target=pump, - args=(process.stderr, - "stderr", - stderr_lines), - daemon=True - ) - t_err.start() - threads.append(t_err) - return_code = process.wait() - for t in threads: - t.join(timeout=0.1) - notifier( - "downlow-task-event", - { - "id": args.task_id, - "event": "exit", - "returncode": return_code, - "success": return_code == 0, - }, - ) - # Also mirror aggregated output to stdout/stderr for compatibility when IPC is unavailable. - if stdout_lines: - log("\n".join(stdout_lines)) - if stderr_lines: - log("\n".join(stderr_lines), file=sys.stderr) - return return_code diff --git a/Store/registry.py b/Store/registry.py index c811be9..50464a8 100644 --- a/Store/registry.py +++ b/Store/registry.py @@ -142,6 +142,8 @@ class Store: BaseStore] = {} self._backend_errors: Dict[str, str] = {} + self._backend_types: Dict[str, + str] = {} self._load_backends() def _maybe_register_temp_alias( @@ -179,6 +181,7 @@ class Store: # Keep original name working, but add an alias. if backend_name != "temp": self._backends["temp"] = backend + self._backend_types["temp"] = store_type except Exception: return @@ -187,6 +190,7 @@ class Store: if not isinstance(store_cfg, dict): store_cfg = {} + self._backend_types = {} classes_by_type = _discover_store_classes() for raw_store_type, instances in store_cfg.items(): if not isinstance(instances, dict): @@ -232,6 +236,7 @@ class Store: backend_name = str(kwargs.get("NAME") or instance_name) self._backends[backend_name] = backend + self._backend_types[backend_name] = store_type # If this is the configured temp directory, also alias it as 'temp'. 
self._maybe_register_temp_alias( @@ -249,6 +254,47 @@ class Store: f"[Store] Failed to register {store_cls.__name__} instance '{instance_name}': {exc}" ) + def _resolve_backend_name(self, + backend_name: str) -> tuple[Optional[str], Optional[str]]: + requested = str(backend_name or "") + if requested in self._backends: + return requested, None + + requested_norm = _normalize_store_type(requested) + + ci_matches = [ + name for name in self._backends + if _normalize_store_type(name) == requested_norm + ] + if len(ci_matches) == 1: + return ci_matches[0], None + if len(ci_matches) > 1: + return None, f"Ambiguous store alias '{backend_name}' matches {ci_matches}" + + type_matches = [ + name for name, store_type in self._backend_types.items() + if store_type == requested_norm + ] + if len(type_matches) == 1: + return type_matches[0], None + if len(type_matches) > 1: + return None, ( + f"Ambiguous store alias '{backend_name}' matches type '{requested_norm}': {type_matches}" + ) + + prefix_matches = [ + name for name, store_type in self._backend_types.items() + if store_type.startswith(requested_norm) + ] + if len(prefix_matches) == 1: + return prefix_matches[0], None + if len(prefix_matches) > 1: + return None, ( + f"Ambiguous store alias '{backend_name}' matches type prefix '{requested_norm}': {prefix_matches}" + ) + + return None, None + def get_backend_error(self, backend_name: str) -> Optional[str]: return self._backend_errors.get(str(backend_name)) @@ -277,14 +323,20 @@ class Store: return sorted(chosen.values()) def __getitem__(self, backend_name: str) -> BaseStore: - if backend_name not in self._backends: + resolved, err = self._resolve_backend_name(backend_name) + if resolved: + return self._backends[resolved] + if err: raise KeyError( - f"Unknown store backend: {backend_name}. Available: {list(self._backends.keys())}" + f"Unknown store backend: {backend_name}. {err}" ) - return self._backends[backend_name] + raise KeyError( + f"Unknown store backend: {backend_name}. Available: {list(self._backends.keys())}" + ) def is_available(self, backend_name: str) -> bool: - return backend_name in self._backends + resolved, _err = self._resolve_backend_name(backend_name) + return resolved is not None def try_add_url_for_pipe_object(self, pipe_obj: Any, url: str) -> bool: """Best-effort helper: if `pipe_obj` contains `store` + `hash`, add `url` to that store backend. 
diff --git a/cmdlet/_shared.py b/cmdlet/_shared.py index 2772c02..40c8b42 100644 --- a/cmdlet/_shared.py +++ b/cmdlet/_shared.py @@ -244,7 +244,7 @@ class SharedArgs: description="Destination location", ) - DELETE_FLAG = CmdletArg( + DELETE = CmdletArg( "delete", type="flag", description="Delete the file and its .tag after successful operation.", @@ -2081,6 +2081,12 @@ def extract_url_from_result(result: Any) -> list[str]: _extend(result.metadata.get("url")) _extend(result.metadata.get("url")) _extend(result.metadata.get("url")) + if isinstance(getattr(result, "full_metadata", None), dict): + fm = getattr(result, "full_metadata", None) + if isinstance(fm, dict): + _extend(fm.get("url")) + _extend(fm.get("url")) + _extend(fm.get("url")) elif hasattr(result, "url") or hasattr(result, "url"): # Handle objects with url/url attribute _extend(getattr(result, "url", None)) @@ -2090,6 +2096,11 @@ def extract_url_from_result(result: Any) -> list[str]: _extend(result.get("url")) _extend(result.get("url")) _extend(result.get("url")) + fm = result.get("full_metadata") + if isinstance(fm, dict): + _extend(fm.get("url")) + _extend(fm.get("url")) + _extend(fm.get("url")) extra = result.get("extra") if isinstance(extra, dict): _extend(extra.get("url")) @@ -2531,6 +2542,30 @@ def resolve_tidal_manifest_path(item: Any) -> Optional[str]: metadata["_tidal_track_details_fetched"] = True except Exception: pass + if not metadata.get("url"): + try: + resp_info = httpx.get( + "https://tidal-api.binimum.org/info/", + params={"id": str(track_int)}, + timeout=10.0, + ) + resp_info.raise_for_status() + info_payload = resp_info.json() + info_data = info_payload.get("data") if isinstance(info_payload, dict) else None + if isinstance(info_data, dict) and info_data: + try: + for k, v in info_data.items(): + if k not in metadata: + metadata[k] = v + except Exception: + pass + try: + if info_data.get("url"): + metadata["url"] = info_data.get("url") + except Exception: + pass + except Exception: + pass except Exception: pass diff --git a/cmdlet/add_file.py b/cmdlet/add_file.py index 5d55256..8b36963 100644 --- a/cmdlet/add_file.py +++ b/cmdlet/add_file.py @@ -345,6 +345,14 @@ class Add_File(Cmdlet): else: items_to_process = [result] + total_items = len(items_to_process) if isinstance(items_to_process, list) else 0 + processed_items = 0 + try: + if total_items: + progress.set_percent(0) + except Exception: + pass + # Minimal step-based progress for single-item runs. # Many add-file flows don't emit intermediate items, so without steps the pipe can look "stuck". 
use_steps = False @@ -496,9 +504,25 @@ class Add_File(Cmdlet): and len(items_to_process) > 1 ) - for item in items_to_process: + for idx, item in enumerate(items_to_process, 1): pipe_obj = coerce_to_pipe_object(item, path_arg) + try: + label = pipe_obj.title or pipe_obj.name + if not label and pipe_obj.path: + try: + label = Path(str(pipe_obj.path)).name + except Exception: + label = pipe_obj.path + if not label: + label = "file" + if total_items: + pending_pct = int(round(((idx - 1) / max(1, total_items)) * 100)) + progress.set_percent(pending_pct) + progress.set_status(f"adding {idx}/{total_items}: {label}") + except Exception: + pass + temp_dir_to_cleanup: Optional[Path] = None delete_after_item = delete_after try: @@ -597,6 +621,14 @@ class Add_File(Cmdlet): shutil.rmtree(temp_dir_to_cleanup, ignore_errors=True) except Exception: pass + processed_items += 1 + try: + pct = int(round((processed_items / max(1, total_items)) * 100)) + progress.set_percent(pct) + if processed_items >= total_items: + progress.clear_status() + except Exception: + pass # Apply deferred url associations (bulk) before showing the final store table. if pending_url_associations: diff --git a/cmdlet/convert_file.py b/cmdlet/convert_file.py new file mode 100644 index 0000000..5040c23 --- /dev/null +++ b/cmdlet/convert_file.py @@ -0,0 +1,289 @@ +from __future__ import annotations + +from typing import Any, Dict, Sequence, Optional +from pathlib import Path +import sys +import shutil +import subprocess + +from SYS.logger import log, debug +from SYS.utils import sha256_file +from . import _shared as sh +from SYS import pipeline as ctx + +Cmdlet = sh.Cmdlet +CmdletArg = sh.CmdletArg +QueryArg = sh.QueryArg +SharedArgs = sh.SharedArgs +parse_cmdlet_args = sh.parse_cmdlet_args +normalize_result_input = sh.normalize_result_input +extract_title_from_result = sh.extract_title_from_result + + +VIDEO_EXTS = { + "mp4", + "mkv", + "webm", + "mov", + "avi", + "flv", + "mpeg", + "mpg", + "m4v", +} + +AUDIO_EXTS = { + "mp3", + "m4a", + "m4b", + "aac", + "flac", + "wav", + "ogg", + "opus", + "mka", +} + +IMAGE_EXTS = { + "png", + "jpg", + "jpeg", + "webp", + "bmp", + "tif", + "tiff", + "gif", +} + +DOC_EXTS = { + "pdf", + "mobi", + "epub", + "azw3", + "txt", + "rtf", + "html", + "htm", + "md", + "doc", + "docx", +} + + +def _detect_kind(ext: str) -> str: + e = ext.lower().lstrip(".") + if e in VIDEO_EXTS: + return "video" + if e in AUDIO_EXTS: + return "audio" + if e in IMAGE_EXTS: + return "image" + if e in DOC_EXTS: + return "doc" + return "unknown" + + +def _allowed(source_kind: str, target_kind: str) -> bool: + if source_kind == target_kind: + return True + if source_kind == "video" and target_kind == "audio": + return True + return False + + +def _ffmpeg_convert( + input_path: Path, + output_path: Path, + target_kind: str, + copy_metadata: bool, +) -> bool: + ffmpeg_path = shutil.which("ffmpeg") + if not ffmpeg_path: + log("ffmpeg not found in PATH", file=sys.stderr) + return False + + cmd = [ffmpeg_path, "-y", "-i", str(input_path)] + + if target_kind == "audio": + cmd.extend(["-vn"]) + + if copy_metadata: + cmd.extend(["-map_metadata", "0"]) + + cmd.append(str(output_path)) + + debug(f"[convert-file] Running ffmpeg: {' '.join(cmd)}") + proc = subprocess.run(cmd, capture_output=True, text=True) + if proc.returncode != 0: + log(f"ffmpeg error: {proc.stderr}", file=sys.stderr) + return False + return True + + +def _doc_convert(input_path: Path, output_path: Path) -> bool: + try: + import pypandoc # type: ignore + except 
Exception: + log("pypandoc is required for document conversion; install pypandoc-binary", file=sys.stderr) + return False + + target_fmt = output_path.suffix.lstrip(".").lower() or "pdf" + + try: + pypandoc.convert_file( + str(input_path), + to=target_fmt, + outputfile=str(output_path), + ) + except OSError as exc: + log(f"pandoc is missing or failed to run: {exc}", file=sys.stderr) + return False + except Exception as exc: + log(f"pypandoc conversion failed: {exc}", file=sys.stderr) + return False + + if not output_path.exists(): + log("pypandoc conversion did not produce an output file", file=sys.stderr) + return False + + return True + + +CMDLET = Cmdlet( + name="convert-file", + summary="Convert files between media/container formats (video, audio, image, documents).", + usage="convert-file -to [-path ] [-delete] [-query format:]", + arg=[ + QueryArg("to", key="format", query_only=False, required=True, + description="Target format/extension (e.g., mp4, mp3, wav, jpg, pdf)."), + SharedArgs.PATH, + SharedArgs.QUERY, + SharedArgs.DELETE, + ], + detail=[ + "Allows video↔video, audio↔audio, image↔image, doc↔doc, and video→audio conversions.", + "Disallows incompatible conversions (e.g., video→pdf).", + "Uses ffmpeg for media and pypandoc-binary (bundled pandoc) for document formats (mobi/epub→pdf/txt/etc).", + ], +) + + +def _resolve_output_path(input_path: Path, outdir: Optional[Path], target_ext: str) -> Path: + base = input_path.stem + directory = outdir if outdir is not None else input_path.parent + directory.mkdir(parents=True, exist_ok=True) + candidate = directory / f"{base}.{target_ext}" + if candidate.exists(): + for i in range(1, 1000): + alt = directory / f"{base}_{i}.{target_ext}" + if not alt.exists(): + candidate = alt + break + return candidate + + +def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: + parsed = parse_cmdlet_args(args, CMDLET) + + target_fmt_raw = parsed.get("to") or parsed.get("format") + if not target_fmt_raw: + log("-to is required", file=sys.stderr) + return 1 + target_fmt = str(target_fmt_raw).lower().lstrip(".") + target_kind = _detect_kind(target_fmt) + if target_kind == "unknown": + log(f"Unsupported target format: {target_fmt}", file=sys.stderr) + return 1 + + delete_src = bool(parsed.get("delete", False)) + + inputs = normalize_result_input(result) + path_arg = parsed.get("path") + + outdir_override: Optional[Path] = None + if path_arg: + try: + p = Path(str(path_arg)).expanduser() + if p.exists() and p.is_dir(): + outdir_override = p + else: + inputs.append({"path": p}) + except Exception: + inputs.append({"path": path_arg}) + + if not inputs: + log("No input provided to convert-file", file=sys.stderr) + return 1 + + success = 0 + + for item in inputs: + input_path: Optional[Path] = None + if isinstance(item, dict): + p = item.get("path") or item.get("target") + elif hasattr(item, "path"): + p = getattr(item, "path") + else: + p = item + + try: + input_path = Path(str(p)) if p else None + except Exception: + input_path = None + + if not input_path or not input_path.exists() or not input_path.is_file(): + log("convert-file: input path missing or not found", file=sys.stderr) + continue + + source_ext = input_path.suffix.lower().lstrip(".") + source_kind = _detect_kind(source_ext) + + if not _allowed(source_kind, target_kind): + log( + f"Conversion from {source_kind or 'unknown'} to {target_kind} is not allowed", + file=sys.stderr, + ) + continue + + output_path = _resolve_output_path(input_path, outdir_override, target_fmt) + + 
converted = False + if target_kind in {"video", "audio", "image"}: + converted = _ffmpeg_convert(input_path, output_path, target_kind, copy_metadata=True) + elif target_kind == "doc": + converted = _doc_convert(input_path, output_path) + else: + log(f"No converter for target kind {target_kind}", file=sys.stderr) + + if not converted: + continue + + try: + out_hash = sha256_file(output_path) + except Exception: + out_hash = None + + title = extract_title_from_result(item) or output_path.stem + + ctx.emit({ + "path": str(output_path), + "title": title, + "hash": out_hash, + "media_kind": target_kind, + "source_path": str(input_path), + }) + + if delete_src: + try: + input_path.unlink() + log(f"Deleted source file: {input_path}", file=sys.stderr) + except Exception as exc: + log(f"Failed to delete source {input_path}: {exc}", file=sys.stderr) + + success += 1 + + return 0 if success else 1 + + +CMDLET.exec = _run +CMDLET.register() diff --git a/cmdlet/download_file.py b/cmdlet/download_file.py index a0dda28..826d398 100644 --- a/cmdlet/download_file.py +++ b/cmdlet/download_file.py @@ -22,8 +22,8 @@ from Provider import internetarchive as ia_provider from Provider import alldebrid as ad_provider from Provider import openlibrary as ol_provider -from SYS.download import DownloadError, _download_direct_file -from SYS.models import DownloadOptions, DownloadMediaResult +from API.HTTP import _download_direct_file +from SYS.models import DownloadError, DownloadOptions, DownloadMediaResult from SYS.logger import log, debug from SYS.pipeline_progress import PipelineProgress from SYS.result_table import ResultTable @@ -890,7 +890,6 @@ class Download_File(Cmdlet): return expanded_items def _process_provider_items( - self, *, piped_items: Sequence[Any], final_output_dir: Path, @@ -900,8 +899,9 @@ class Download_File(Cmdlet): registry: Dict[str, Any], progress: PipelineProgress, - ) -> int: + ) -> tuple[int, int]: downloaded_count = 0 + queued_magnet_submissions = 0 get_search_provider = registry.get("get_search_provider") SearchResult = registry.get("SearchResult") @@ -911,8 +911,17 @@ class Download_File(Cmdlet): config=config ) + total_items = len(expanded_items) + processed_items = 0 + try: + if total_items: + progress.set_percent(0) + except Exception: + pass + for item in expanded_items: try: + label = "item" table = get_field(item, "table") title = get_field(item, "title") target = get_field(item, "path") or get_field(item, "url") @@ -933,6 +942,25 @@ class Download_File(Cmdlet): if isinstance(extra_md, dict): full_metadata = extra_md + try: + label = title or target + label = str(label or "item").strip() + if total_items: + pct = int(round((processed_items / max(1, total_items)) * 100)) + progress.set_percent(pct) + progress.set_status( + f"downloading {processed_items + 1}/{total_items}: {label}" + ) + except Exception: + pass + + transfer_label = label + if str(table or "").lower() == "hifi": + try: + progress.begin_transfer(label=transfer_label, total=None) + except Exception: + pass + # If this looks like a provider item and providers are available, prefer provider.download() downloaded_path: Optional[Path] = None attempted_provider_download = False @@ -1065,6 +1093,45 @@ class Download_File(Cmdlet): continue + # Magnet targets (e.g., torrent provider results) -> submit/download via AllDebrid + if downloaded_path is None and isinstance(target, str) and is_magnet_link(str(target)): + magnet_spec = ad_provider.resolve_magnet_spec(str(target)) + if magnet_spec: + + def _on_emit(path: Path, 
file_url: str, relpath: str, metadata: Dict[str, Any]) -> None: + title_hint = metadata.get("name") or relpath or title + self._emit_local_file( + downloaded_path=path, + source=file_url or target, + title_hint=title_hint, + tags_hint=None, + media_kind_hint="file", + full_metadata=metadata, + progress=progress, + config=config, + provider_hint="alldebrid", + ) + + downloaded, magnet_id = ad_provider.download_magnet( + magnet_spec, + str(target), + final_output_dir, + config, + progress, + quiet_mode, + self._path_from_download_result, + _on_emit, + ) + + if downloaded > 0: + downloaded_count += downloaded + continue + + # If queued but not yet ready, skip the generic unsupported-target error. + if magnet_id is not None: + queued_magnet_submissions += 1 + continue + # Fallback: if we have a direct HTTP URL, download it directly if (downloaded_path is None and isinstance(target, str) @@ -1080,6 +1147,7 @@ class Download_File(Cmdlet): file=sys.stderr, ) continue + debug( f"[download-file] Provider item looks like direct URL, downloading: {target}" ) @@ -1150,8 +1218,22 @@ class Download_File(Cmdlet): log(f"Download failed: {e}", file=sys.stderr) except Exception as e: log(f"Error downloading item: {e}", file=sys.stderr) + finally: + if str(table or "").lower() == "hifi": + try: + progress.finish_transfer(label=transfer_label) + except Exception: + pass + processed_items += 1 + try: + pct = int(round((processed_items / max(1, total_items)) * 100)) + progress.set_percent(pct) + if processed_items >= total_items: + progress.clear_status() + except Exception: + pass - return downloaded_count + return downloaded_count, queued_magnet_submissions # === Streaming helpers (yt-dlp) === @@ -2687,6 +2769,15 @@ class Download_File(Cmdlet): debug(f"Output directory: {final_output_dir}") + try: + PipelineProgress(pipeline_context).ensure_local_ui( + label="download-file", + total_items=len(supported_url), + items_preview=supported_url, + ) + except Exception: + pass + clip_spec = parsed.get("clip") query_spec = parsed.get("query") @@ -3572,7 +3663,7 @@ class Download_File(Cmdlet): if early_exit is not None: return int(early_exit) - downloaded_count += self._process_provider_items( + provider_downloaded, magnet_submissions = self._process_provider_items( piped_items=piped_items, final_output_dir=final_output_dir, config=config, @@ -3580,9 +3671,13 @@ class Download_File(Cmdlet): registry=registry, progress=progress, ) + downloaded_count += provider_downloaded - if downloaded_count > 0 or streaming_downloaded > 0: - debug(f"✓ Successfully processed {downloaded_count} file(s)") + if downloaded_count > 0 or streaming_downloaded > 0 or magnet_submissions > 0: + msg = f"✓ Successfully processed {downloaded_count} file(s)" + if magnet_submissions: + msg += f" and queued {magnet_submissions} magnet(s)" + debug(msg) return 0 if streaming_exit_code is not None: diff --git a/cmdlet/get_tag.py b/cmdlet/get_tag.py index a997f60..b881a9c 100644 --- a/cmdlet/get_tag.py +++ b/cmdlet/get_tag.py @@ -255,7 +255,7 @@ def _pick_supported_ytdlp_url(urls: List[str]) -> Optional[str]: # Prefer a true support check when the Python module is available. 
try: - from SYS.download import is_url_supported_by_ytdlp + from tool.ytdlp import is_url_supported_by_ytdlp for text in candidates: try: diff --git a/cmdlet/search_file.py b/cmdlet/search_file.py index ed25e56..02d08c8 100644 --- a/cmdlet/search_file.py +++ b/cmdlet/search_file.py @@ -246,7 +246,7 @@ class search_file(Cmdlet): else: table_title = f"{provider_label}: {query}".strip().rstrip(":") - preserve_order = provider_lower in {"youtube", "openlibrary", "loc"} + preserve_order = provider_lower in {"youtube", "openlibrary", "loc", "torrent"} table_type = provider_name table_meta: Dict[str, Any] = {"provider": provider_name} if provider_lower == "hifi": diff --git a/cmdnat/matrix.py b/cmdnat/matrix.py index fe6fcc6..670210b 100644 --- a/cmdnat/matrix.py +++ b/cmdnat/matrix.py @@ -444,7 +444,7 @@ def _resolve_upload_path(item: Any, config: Dict[str, Any]) -> Optional[str]: url = _maybe_unlock_alldebrid_url(url, config) try: - from SYS.download import _download_direct_file + from API.HTTP import _download_direct_file base_tmp = None if isinstance(config, dict): diff --git a/docs/provider_guide.md b/docs/provider_guide.md new file mode 100644 index 0000000..3e5b0e7 --- /dev/null +++ b/docs/provider_guide.md @@ -0,0 +1,165 @@ +# Provider Development Guide + +## 🎯 Purpose +This guide describes how to write, test, and register a provider so the application can discover and use it as a pluggable component. + +> Keep provider code small, focused, and well-tested. Use existing providers as examples. + +--- + +## 🔧 Anatomy of a Provider +A provider is a Python class that extends `ProviderCore.base.Provider` and implements a few key methods and attributes. + +Minimum expectations: +- `class MyProvider(Provider):` — subclass the base provider +- `URL` / `URL_DOMAINS` or `url_patterns()` — to let the registry route URLs +- `validate(self) -> bool` — return True when provider is configured and usable +- `search(self, query, limit=50, filters=None, **kwargs)` — return a list of `SearchResult` + +Optional but common: +- `download(self, result: SearchResult, output_dir: Path) -> Optional[Path]` — download a provider result +- `selector(self, selected_items, *, ctx, stage_is_last=True, **kwargs) -> bool` — handle `@N` selections +- `download_url(self, url, output_dir, progress_cb=None)` — direct URL-handling helper + +--- + +## 🧩 SearchResult +Use `ProviderCore.base.SearchResult` to describe results returned by `search()`. +Important fields: +- `table` (str) — provider table name +- `title` (str) — short human title +- `path` (str) — canonical URL / link the provider/dl may use +- `media_kind` (str) — `file`, `folder`, `book`, etc. +- `columns` (list[tuple[str,str]]) — extra key/value pairs to display +- `full_metadata` (dict) — provider-specific metadata for downstream stages +- `annotations` / `tag` — simple metadata for filtering + +Return a list of `SearchResult(...)` objects or simple dicts convertible with `.to_dict()`. + +--- + +## ✅ Implementing search() +- Parse and sanitize `query` and `filters`. +- Return no more than `limit` results. +- Use `columns` to provide table columns (TITLE, Seeds, Size, etc.). +- Keep `search()` fast and predictable (apply reasonable timeouts). 
+ +Example: + +```python +from ProviderCore.base import Provider, SearchResult + +class HelloProvider(Provider): + def search(self, query, limit=50, filters=None, **kwargs): + q = (query or "").strip() + if not q: + return [] + results = [] + # Build up results + results.append( + SearchResult( + table="hello", + title=f"Hit for {q}", + path=f"https://example/{q}", + columns=[("Info", "example")], + full_metadata={"source": "hello"}, + ) + ) + return results[:max(0, int(limit))] +``` + +--- + +## ⬇️ Implementing download() and download_url() +- Prefer provider `download(self, result, output_dir)` for piped provider items. +- For provider-provided URLs, implement `download_url` to allow `download-file` to route downloads through providers. +- Use the repo `_download_direct_file` helper for HTTP downloads when possible. + +Example download(): + +```python +def download(self, result: SearchResult, output_dir: Path) -> Optional[Path]: + # Validate config + url = getattr(result, "path", None) + if not url or not url.startswith("http"): + return None + # use existing helpers to fetch the file + return _download_direct_file(url, output_dir) +``` + +--- + +## 🧭 URL routing +Providers can declare: +- `URL = ("magnet:",)` or similar prefix list +- `URL_DOMAINS = ("example.com",)` to match hosts +- Or override `@classmethod def url_patterns(cls):` to combine static and dynamic patterns + +The registry uses these to match `download-file ` or to pick which provider should handle the URL. + +--- + +## 🛠 Selector (handling `@N` picks) +- Implement `selector(self, selected_items, *, ctx, stage_is_last=True)` to present a sub-table or to enqueue downloads. +- Use `ctx.set_last_result_table()` and `ctx.set_current_stage_table()` to display follow-ups. +- Return `True` when you handled the selection and the pipeline should pause or proceed accordingly. + +--- + +## 🧪 Testing providers +- Keep tests small and local. Create `tests/test_provider_.py`. +- Test `search()` with mock HTTP responses (use `requests-mock` or similar). +- Test `download()` using a temp directory and a small file server or by mocking `_download_direct_file`. +- Test `selector()` by constructing a fake result and `ctx` object. + +Example PowerShell commands to run tests (repo root): + +```powershell +# Run a single test file +pytest tests/test_provider_hello.py -q + +# Run all tests +pytest -q +``` + +--- + +## 📦 Registration & packaging +- Add your provider module under `Provider/` and ensure it is imported by module package initialization. Common approach: + - Place file `Provider/myprovider.py` + - Ensure `Provider/__init__.py` imports the module (or the registry auto-discovers by package import) +- If the project has a central provider registry, add lookup helpers there (e.g., `ProviderCore/registry.py`). Usually providers register themselves at import time. + +--- + +## 💡 Best practices & tips +- Use `debug()` / `log()` appropriately; avoid noisy stderr output in normal runs. +- Prefer returning `SearchResult` objects to provide consistent UX. +- Keep `search()` tolerant (timeouts, malformed responses) and avoid raising for expected network problems. +- Use `full_metadata` to pass non-display data to `download()` and `selector()`. +- Respect the `limit` parameter in `search()`. 
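+
+Tying together the routing notes above, here is a minimal, hypothetical sketch of a provider that declares static prefixes and hosts and overrides `url_patterns()` to merge them. The class name and hosts are illustrative, and the exact pattern format the registry expects should be checked against `ProviderCore/registry.py`; this sketch assumes a flat list of prefix strings.
+
+```python
+from ProviderCore.base import Provider
+
+class MirrorProvider(Provider):
+    # Static routing hints the registry can match directly (illustrative values).
+    URL = ("magnet:",)
+    URL_DOMAINS = ("example.com", "mirror.example.org")
+
+    @classmethod
+    def url_patterns(cls):
+        # Merge the static declarations into one flat list of prefix strings.
+        patterns = list(cls.URL)
+        patterns.extend(f"https://{host}/" for host in cls.URL_DOMAINS)
+        return patterns
+```
+
+Keeping the override a pure classmethod with no network or file I/O keeps URL routing cheap, regardless of whether the registry ultimately matches prefixes, hostnames, or regexes.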
+
+---
+
+## 🧾 Example provider checklist
+- [ ] Implement `search()` and return `SearchResult` items
+- [ ] Implement `validate()` to check essential config (API keys, credentials)
+- [ ] Provide `URL` / `URL_DOMAINS` or `url_patterns()` for routing
+- [ ] Add `download()` or `download_url()` for piped/passed URL downloads
+- [ ] Add tests under `tests/`
+- [ ] Add module to `Provider/` package and ensure import/registration
+
+---
+
+## 🔗 Further reading
+- See existing providers in `Provider/` for patterns and edge cases.
+- Check `API/` helpers for HTTP and debrid clients.
+
+---
+
+A minimal example provider ships with this repository as a template (`Provider/hello_provider.py`), together with matching unit tests (`tests/test_provider_hello.py`); use them as a starting point for new providers.
diff --git a/scripts/requirements.txt b/scripts/requirements.txt
index b02bdbb..bebb645 100644
--- a/scripts/requirements.txt
+++ b/scripts/requirements.txt
@@ -19,6 +19,7 @@ pypdf>=3.0.0
 mutagen>=1.46.0
 cbor2>=4.0
 zstandard>=0.23.0
+pypandoc-binary
 
 # Image and media support
 Pillow>=10.0.0
@@ -45,3 +46,4 @@ playwright>=1.40.0
 
 # Development and utilities
 python-dateutil>=2.8.0
+
diff --git a/tmp_trim_registry.py b/tmp_trim_registry.py
new file mode 100644
index 0000000..4267295
--- /dev/null
+++ b/tmp_trim_registry.py
@@ -0,0 +1,10 @@
+from pathlib import Path
+
+path = Path("ProviderCore/registry.py")
+text = path.read_text()
+marker = '"""Provider registry.'
+first = text.find(marker)
+second = text.find(marker, first + 1)
+if second != -1:
+    trimmed = text[:second].rstrip() + "\n"
+    path.write_text(trimmed, encoding="utf-8")
diff --git a/tmp_write_registry.py b/tmp_write_registry.py
new file mode 100644
index 0000000..2abda3f
--- /dev/null
+++ b/tmp_write_registry.py
@@ -0,0 +1,3 @@
+from pathlib import Path
+
+new_content = """"""
\ No newline at end of file
diff --git a/tool/ytdlp.py b/tool/ytdlp.py
index 7e691cd..4d0751c 100644
--- a/tool/ytdlp.py
+++ b/tool/ytdlp.py
@@ -29,6 +29,9 @@ from SYS.models import (
 from SYS.pipeline_progress import PipelineProgress
 from SYS.utils import ensure_directory, sha256_file
 
+_YTDLP_TRANSFER_STATE: Dict[str, Dict[str, Any]] = {}
+
+
 try:
     import yt_dlp  # type: ignore
     from yt_dlp.extractor import gen_extractors  # type: ignore
@@ -565,9 +568,35 @@ class YtDlpTool:
 
 
 # Progress + utility helpers for yt-dlp driven downloads (previously in cmdlet/download_media).
_YTDLP_PROGRESS_BAR = ProgressBar() +_YTDLP_TRANSFER_STATE: Dict[str, Dict[str, Any]] = {} _SUBTITLE_EXTS = (".vtt", ".srt", ".ass", ".ssa", ".lrc") +def _progress_label(status: Dict[str, Any]) -> str: + info_dict = status.get("info_dict") if isinstance(status.get("info_dict"), dict) else {} + + candidates = [ + status.get("filename"), + info_dict.get("_filename"), + info_dict.get("filepath"), + info_dict.get("title"), + info_dict.get("id"), + ] + + for cand in candidates: + if not cand: + continue + try: + name = Path(str(cand)).name + except Exception: + name = str(cand) + label = str(name or "").strip() + if label: + return label + + return "download" + + def _live_ui_and_pipe_index() -> tuple[Optional[Any], int]: ui = None try: @@ -937,19 +966,53 @@ def _extract_sha256(info: Dict[str, Any]) -> Optional[str]: def _progress_callback(status: Dict[str, Any]) -> None: + label = _progress_label(status) event = status.get("status") - if event == "downloading": - downloaded = status.get("downloaded_bytes") - total = status.get("total_bytes") or status.get("total_bytes_estimate") + downloaded = status.get("downloaded_bytes") + total = status.get("total_bytes") or status.get("total_bytes_estimate") - _YTDLP_PROGRESS_BAR.update( - downloaded=int(downloaded) if downloaded is not None else None, - total=int(total) if total is not None else None, - label="download", - file=sys.stderr, - ) + pipeline = PipelineProgress(pipeline_context) + live_ui, _ = pipeline.ui_and_pipe_index() + use_live = live_ui is not None + + def _total_bytes(value: Any) -> Optional[int]: + try: + if isinstance(value, (int, float)) and value > 0: + return int(value) + except Exception: + pass + return None + + if event == "downloading": + if use_live: + try: + if not _YTDLP_TRANSFER_STATE.get(label, {}).get("started"): + pipeline.begin_transfer(label=label, total=_total_bytes(total)) + _YTDLP_TRANSFER_STATE[label] = {"started": True} + pipeline.update_transfer( + label=label, + completed=int(downloaded) if downloaded is not None else None, + total=_total_bytes(total), + ) + except Exception: + pass + else: + _YTDLP_PROGRESS_BAR.update( + downloaded=int(downloaded) if downloaded is not None else None, + total=int(total) if total is not None else None, + label=label, + file=sys.stderr, + ) elif event == "finished": - _YTDLP_PROGRESS_BAR.finish() + if use_live: + try: + if _YTDLP_TRANSFER_STATE.get(label, {}).get("started"): + pipeline.finish_transfer(label=label) + except Exception: + pass + _YTDLP_TRANSFER_STATE.pop(label, None) + else: + _YTDLP_PROGRESS_BAR.finish() elif event in ("postprocessing", "processing"): return