From bd69119996f2ff83e856d5adbefdb900bf134a99 Mon Sep 17 00:00:00 2001 From: nose Date: Tue, 25 Nov 2025 20:09:33 -0800 Subject: [PATCH] AST --- .gitattributes | 42 + .gitignore copy | 219 ++ CLI.py | 1789 ++++++++++++++++ README copy.md | 64 + TUI/__init__.py | 1 + TUI/menu_actions.py | 105 + TUI/modalscreen/__init__.py | 7 + TUI/modalscreen/access.py | 139 ++ TUI/modalscreen/download.py | 1880 +++++++++++++++++ TUI/modalscreen/download.tcss | 183 ++ TUI/modalscreen/export.py | 512 +++++ TUI/modalscreen/export.tcss | 85 + TUI/modalscreen/search.py | 505 +++++ TUI/modalscreen/search.tcss | 121 ++ TUI/modalscreen/workers.py | 585 ++++++ TUI/modalscreen/workers.tcss | 119 ++ TUI/pipeline_runner.py | 356 ++++ TUI/tui.py | 332 +++ TUI/tui.tcss | 100 + cmdlets/__init__.py | 139 ++ cmdlets/_shared.py | 1229 +++++++++++ cmdlets/add_file.py | 910 ++++++++ cmdlets/add_note.py | 84 + cmdlets/add_relationship.py | 264 +++ cmdlets/add_tags.py | 276 +++ cmdlets/add_url.py | 78 + cmdlets/adjective.py | 148 ++ cmdlets/check_file_status.py | 153 ++ cmdlets/cleanup.py | 110 + cmdlets/delete_file.py | 242 +++ cmdlets/delete_note.py | 79 + cmdlets/delete_tag.py | 219 ++ cmdlets/delete_url.py | 82 + cmdlets/download_data.py | 2633 ++++++++++++++++++++++++ cmdlets/get_file.py | 1618 +++++++++++++++ cmdlets/get_metadata.py | 246 +++ cmdlets/get_note.py | 87 + cmdlets/get_relationship.py | 240 +++ cmdlets/get_tag.py | 1191 +++++++++++ cmdlets/get_url.py | 90 + cmdlets/manage_config.py | 138 ++ cmdlets/merge_file.py | 916 +++++++++ cmdlets/pipe.py | 335 +++ cmdlets/screen_shot.py | 739 +++++++ cmdlets/search_file.py | 351 ++++ cmdlets/worker.py | 325 +++ config.py | 360 ++++ helper/__init__.py | 92 + helper/adjective.json | 130 ++ helper/alldebrid.py | 829 ++++++++ helper/archive_client.py | 567 +++++ helper/download.py | 730 +++++++ helper/file_server.py | 180 ++ helper/file_storage.py | 1039 ++++++++++ helper/http_client.py | 579 ++++++ helper/hydrus.py | 1553 ++++++++++++++ helper/libgen_service.py | 377 ++++ helper/local_library.py | 1395 +++++++++++++ helper/logger.py | 70 + helper/mpv_file.py | 951 +++++++++ helper/progress.py | 143 ++ helper/query_parser.py | 159 ++ helper/search_provider.py | 1777 ++++++++++++++++ helper/tasks.py | 155 ++ helper/unified_book_downloader.py | 706 +++++++ helper/utils.py | 492 +++++ helper/utils_constant.py | 79 + helper/worker_manager.py | 655 ++++++ hydrus_health_check.py | 425 ++++ medeia_entry.py | 13 + medeia_macina/__init__.py | 2 + medeia_macina/cli_entry.py | 13 + metadata.py | 3199 +++++++++++++++++++++++++++++ models.py | 678 ++++++ pipeline.py | 679 ++++++ pyproject.toml | 183 ++ requirements-dev.txt | 29 + requirements.txt | 43 + result_table.py | 1228 +++++++++++ setup.py | 39 + 80 files changed, 39615 insertions(+) create mode 100644 .gitattributes create mode 100644 .gitignore copy create mode 100644 CLI.py create mode 100644 README copy.md create mode 100644 TUI/__init__.py create mode 100644 TUI/menu_actions.py create mode 100644 TUI/modalscreen/__init__.py create mode 100644 TUI/modalscreen/access.py create mode 100644 TUI/modalscreen/download.py create mode 100644 TUI/modalscreen/download.tcss create mode 100644 TUI/modalscreen/export.py create mode 100644 TUI/modalscreen/export.tcss create mode 100644 TUI/modalscreen/search.py create mode 100644 TUI/modalscreen/search.tcss create mode 100644 TUI/modalscreen/workers.py create mode 100644 TUI/modalscreen/workers.tcss create mode 100644 TUI/pipeline_runner.py create mode 100644 TUI/tui.py create mode 100644 
TUI/tui.tcss create mode 100644 cmdlets/__init__.py create mode 100644 cmdlets/_shared.py create mode 100644 cmdlets/add_file.py create mode 100644 cmdlets/add_note.py create mode 100644 cmdlets/add_relationship.py create mode 100644 cmdlets/add_tags.py create mode 100644 cmdlets/add_url.py create mode 100644 cmdlets/adjective.py create mode 100644 cmdlets/check_file_status.py create mode 100644 cmdlets/cleanup.py create mode 100644 cmdlets/delete_file.py create mode 100644 cmdlets/delete_note.py create mode 100644 cmdlets/delete_tag.py create mode 100644 cmdlets/delete_url.py create mode 100644 cmdlets/download_data.py create mode 100644 cmdlets/get_file.py create mode 100644 cmdlets/get_metadata.py create mode 100644 cmdlets/get_note.py create mode 100644 cmdlets/get_relationship.py create mode 100644 cmdlets/get_tag.py create mode 100644 cmdlets/get_url.py create mode 100644 cmdlets/manage_config.py create mode 100644 cmdlets/merge_file.py create mode 100644 cmdlets/pipe.py create mode 100644 cmdlets/screen_shot.py create mode 100644 cmdlets/search_file.py create mode 100644 cmdlets/worker.py create mode 100644 config.py create mode 100644 helper/__init__.py create mode 100644 helper/adjective.json create mode 100644 helper/alldebrid.py create mode 100644 helper/archive_client.py create mode 100644 helper/download.py create mode 100644 helper/file_server.py create mode 100644 helper/file_storage.py create mode 100644 helper/http_client.py create mode 100644 helper/hydrus.py create mode 100644 helper/libgen_service.py create mode 100644 helper/local_library.py create mode 100644 helper/logger.py create mode 100644 helper/mpv_file.py create mode 100644 helper/progress.py create mode 100644 helper/query_parser.py create mode 100644 helper/search_provider.py create mode 100644 helper/tasks.py create mode 100644 helper/unified_book_downloader.py create mode 100644 helper/utils.py create mode 100644 helper/utils_constant.py create mode 100644 helper/worker_manager.py create mode 100644 hydrus_health_check.py create mode 100644 medeia_entry.py create mode 100644 medeia_macina/__init__.py create mode 100644 medeia_macina/cli_entry.py create mode 100644 metadata.py create mode 100644 models.py create mode 100644 pipeline.py create mode 100644 pyproject.toml create mode 100644 requirements-dev.txt create mode 100644 requirements.txt create mode 100644 result_table.py create mode 100644 setup.py diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..c6e7763 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,42 @@ +# Auto detect text files and normalize line endings to LF +* text=auto + +# Python files +*.py text eol=lf +*.pyx text eol=lf +*.pyi text eol=lf + +# Shell scripts +*.sh text eol=lf +*.bash text eol=lf + +# Windows batch files +*.bat text eol=crlf +*.cmd text eol=crlf +*.ps1 text eol=crlf + +# Config files +*.json text eol=lf +*.toml text eol=lf +*.yaml text eol=lf +*.yml text eol=lf +*.ini text eol=lf +*.cfg text eol=lf + +# Documentation +*.md text eol=lf +README text eol=lf +LICENSE text eol=lf + +# Binary files +*.db binary +*.sqlite binary +*.png binary +*.jpg binary +*.jpeg binary +*.gif binary +*.webp binary +*.mov binary +*.mp4 binary +*.webm binary +*.pdf binary diff --git a/.gitignore copy b/.gitignore copy new file mode 100644 index 0000000..2adf715 --- /dev/null +++ b/.gitignore copy @@ -0,0 +1,219 @@ +# ---> Python +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class +config.json +# C extensions +*.so + +# Distribution / 
packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py +config.json +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +#uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc + +# ---> Lua +# Compiled Lua sources +luac.out + +# luarocks build files +*.src.rock +*.zip +*.tar.gz + +# Object files +*.o +*.os +*.ko +*.obj +*.elf + +# Precompiled Headers +*.gch +*.pch + +# Libraries +*.lib +*.a +*.la +*.lo +*.def +*.exp + +# Shared objects (inc. Windows DLLs) +*.dll +*.so +*.so.* +*.dylib + +# Executables +*.exe +*.out +*.app +*.i*86 +*.x86_64 +*.hex + + diff --git a/CLI.py b/CLI.py new file mode 100644 index 0000000..daee9b1 --- /dev/null +++ b/CLI.py @@ -0,0 +1,1789 @@ +from __future__ import annotations + +"""CLI REPL for Medeia-Macina with autocomplete support.""" + +import sys +import json +import re +import io +import uuid +import atexit +from copy import deepcopy +from importlib import import_module +from pathlib import Path +from typing import Any, Dict, List, Optional, Sequence, Set, TextIO, TYPE_CHECKING, cast + +try: + import typer +except ImportError: + typer = None + +try: + from result_table import ResultTable, format_result + RESULT_TABLE_AVAILABLE = True +except ImportError: + RESULT_TABLE_AVAILABLE = False + ResultTable = None # type: ignore + format_result = None # type: ignore + +try: + from prompt_toolkit import PromptSession + from prompt_toolkit.completion import Completer, Completion + from prompt_toolkit.document import Document + PROMPT_TOOLKIT_AVAILABLE = True +except ImportError: # pragma: no cover - optional dependency + PromptSession = None # type: ignore + Completer = None # type: ignore + Completion = None # type: ignore + Document = None # type: ignore + PROMPT_TOOLKIT_AVAILABLE = False + + +try: + from helper.worker_manager import WorkerManager +except ImportError: # pragma: no cover - optional dependency + WorkerManager = None # type: ignore + +if TYPE_CHECKING: # pragma: no cover - typing helper + from helper.worker_manager import WorkerManager as WorkerManagerType +else: + WorkerManagerType = Any + +from config import get_local_storage_path, load_config + + +class _WorkerOutputMirror(io.TextIOBase): + """Mirror stdout/stderr to worker manager while preserving console output.""" + + def __init__(self, original: TextIO, manager: WorkerManagerType, worker_id: str, channel: str): + self._original = original + self._manager = manager + self._worker_id = worker_id + self._channel = channel + self._pending: str = "" + + def write(self, data: str) -> int: + if not data: + return 0 + self._original.write(data) + self._buffer_text(data) + return len(data) + + def flush(self) -> None: + self._original.flush() + self._flush_pending(force=True) + + def isatty(self) -> bool: # pragma: no cover - passthrough + return bool(getattr(self._original, "isatty", lambda: False)()) + + def _buffer_text(self, data: str) -> None: + combined = self._pending + data + lines = combined.splitlines(keepends=True) + if not lines: + self._pending = combined + return + if lines[-1].endswith(("\n", "\r")): + complete = lines + self._pending = "" + else: + complete = lines[:-1] + self._pending = lines[-1] + for chunk in complete: + self._emit(chunk) + + def _flush_pending(self, force: bool = False) -> None: + if self._pending and force: + self._emit(self._pending) + self._pending = "" + + def _emit(self, text: str) -> None: + if not text: + return + try: + self._manager.append_stdout(self._worker_id, text, channel=self._channel) + except Exception: + pass + + @property + def encoding(self) -> str: # type: ignore[override] + return getattr(self._original, "encoding", "utf-8") + + +class 
_WorkerStageSession: + """Lifecycle helper for wrapping a CLI cmdlet execution in a worker record.""" + + def __init__( + self, + manager: WorkerManagerType, + worker_id: str, + orig_stdout: TextIO, + orig_stderr: TextIO, + stdout_proxy: _WorkerOutputMirror, + stderr_proxy: _WorkerOutputMirror, + config: Optional[Dict[str, Any]], + logging_enabled: bool, + completion_label: str, + error_label: str, + ) -> None: + self.manager = manager + self.worker_id = worker_id + self.orig_stdout = orig_stdout + self.orig_stderr = orig_stderr + self.stdout_proxy = stdout_proxy + self.stderr_proxy = stderr_proxy + self.config = config + self.logging_enabled = logging_enabled + self.closed = False + self._completion_label = completion_label + self._error_label = error_label + + def close(self, status: str = "completed", error_msg: str = "") -> None: + if self.closed: + return + try: + self.stdout_proxy.flush() + self.stderr_proxy.flush() + except Exception: + pass + sys.stdout = self.orig_stdout + sys.stderr = self.orig_stderr + if self.logging_enabled: + try: + self.manager.disable_logging_for_worker(self.worker_id) + except Exception: + pass + try: + if status == "completed": + self.manager.log_step(self.worker_id, self._completion_label) + else: + self.manager.log_step(self.worker_id, f"{self._error_label}: {error_msg or status}") + except Exception: + pass + try: + self.manager.finish_worker(self.worker_id, result=status or "completed", error_msg=error_msg or "") + except Exception: + pass + if self.config and self.config.get('_current_worker_id') == self.worker_id: + self.config.pop('_current_worker_id', None) + self.closed = True + + +_CLI_WORKER_MANAGER: Optional[WorkerManagerType] = None +_CLI_ORPHAN_CLEANUP_DONE = False +CLI_ROOT = Path(__file__).resolve().parent + + +def _load_cli_config() -> Dict[str, Any]: + """Load config.json relative to the CLI script location.""" + try: + return deepcopy(load_config(config_dir=CLI_ROOT)) + except Exception: + return {} + + +def _get_table_title_for_command(cmd_name: str, emitted_items: Optional[List[Any]] = None) -> str: + """Generate a dynamic table title based on the command and emitted items. 
+ + Args: + cmd_name: The command name (e.g., 'search-file', 'get-tag', 'get-file') + emitted_items: The items being displayed + + Returns: + A descriptive title for the result table + """ + # Mapping of commands to title templates + title_map = { + 'search-file': 'Results', + 'search_file': 'Results', + 'download-data': 'Downloads', + 'download_data': 'Downloads', + 'get-tag': 'Tags', + 'get_tag': 'Tags', + 'get-file': 'Results', + 'get_file': 'Results', + 'add-tag': 'Results', + 'add_tag': 'Results', + 'delete-tag': 'Results', + 'delete_tag': 'Results', + 'add-url': 'Results', + 'add_url': 'Results', + 'get-url': 'URLs', + 'get_url': 'URLs', + 'delete-url': 'Results', + 'delete_url': 'Results', + 'get-note': 'Notes', + 'get_note': 'Notes', + 'add-note': 'Results', + 'add_note': 'Results', + 'delete-note': 'Results', + 'delete_note': 'Results', + 'get-relationship': 'Relationships', + 'get_relationship': 'Relationships', + 'add-relationship': 'Results', + 'add_relationship': 'Results', + 'add-file': 'Results', + 'add_file': 'Results', + 'delete-file': 'Results', + 'delete_file': 'Results', + 'check-file-status': 'Status', + 'check_file_status': 'Status', + } + + return title_map.get(cmd_name, 'Results') + + +def _close_cli_worker_manager() -> None: + global _CLI_WORKER_MANAGER + if _CLI_WORKER_MANAGER: + try: + _CLI_WORKER_MANAGER.close() + except Exception: + pass + _CLI_WORKER_MANAGER = None + + +atexit.register(_close_cli_worker_manager) + + +def _ensure_worker_manager(config: Dict[str, Any]) -> Optional[WorkerManagerType]: + """Attach a WorkerManager to the CLI config for cmdlet execution.""" + global _CLI_WORKER_MANAGER, _CLI_ORPHAN_CLEANUP_DONE + if WorkerManager is None: + return None + if not isinstance(config, dict): + return None + existing = config.get('_worker_manager') + if isinstance(existing, WorkerManager): + return existing + library_root = get_local_storage_path(config) + if not library_root: + return None + try: + resolved_root = Path(library_root).resolve() + except Exception: + resolved_root = Path(library_root) + try: + if not _CLI_WORKER_MANAGER or Path(getattr(_CLI_WORKER_MANAGER, 'library_root', '')) != resolved_root: + if _CLI_WORKER_MANAGER: + try: + _CLI_WORKER_MANAGER.close() + except Exception: + pass + _CLI_WORKER_MANAGER = WorkerManager(resolved_root, auto_refresh_interval=0) + manager = _CLI_WORKER_MANAGER + config['_worker_manager'] = manager + if manager and not _CLI_ORPHAN_CLEANUP_DONE: + try: + manager.expire_running_workers( + older_than_seconds=120, + worker_id_prefix="cli_%", + reason="CLI session ended unexpectedly; marking worker as failed", + ) + except Exception: + pass + else: + _CLI_ORPHAN_CLEANUP_DONE = True + return manager + except Exception as exc: + print(f"[worker] Could not initialize worker manager: {exc}", file=sys.stderr) + return None + + +def _start_worker_session( + worker_manager: Optional[WorkerManagerType], + *, + worker_type: str, + title: str, + description: str, + pipe_text: str, + config: Optional[Dict[str, Any]], + completion_label: str, + error_label: str, + skip_logging_for: Optional[Set[str]] = None, +) -> Optional[_WorkerStageSession]: + """Create a worker session wrapper and mirror stdout/stderr.""" + if worker_manager is None: + return None + if skip_logging_for and worker_type in skip_logging_for: + return None + safe_type = worker_type or "cmd" + worker_id = f"cli_{safe_type[:8]}_{uuid.uuid4().hex[:6]}" + try: + tracked = worker_manager.track_worker( + worker_id, + worker_type=worker_type, + title=title, + 
description=description or "(no args)", + pipe=pipe_text, + ) + if not tracked: + return None + except Exception as exc: + print(f"[worker] Failed to track {worker_type}: {exc}", file=sys.stderr) + return None + logging_enabled = False + try: + handler = worker_manager.enable_logging_for_worker(worker_id) + logging_enabled = handler is not None + except Exception: + logging_enabled = False + orig_stdout = sys.stdout + orig_stderr = sys.stderr + stdout_proxy = _WorkerOutputMirror(orig_stdout, worker_manager, worker_id, 'stdout') + stderr_proxy = _WorkerOutputMirror(orig_stderr, worker_manager, worker_id, 'stderr') + sys.stdout = stdout_proxy + sys.stderr = stderr_proxy + if isinstance(config, dict): + config['_current_worker_id'] = worker_id + try: + worker_manager.log_step(worker_id, f"Started {worker_type}") + except Exception: + pass + return _WorkerStageSession( + manager=worker_manager, + worker_id=worker_id, + orig_stdout=orig_stdout, + orig_stderr=orig_stderr, + stdout_proxy=stdout_proxy, + stderr_proxy=stderr_proxy, + config=config, + logging_enabled=logging_enabled, + completion_label=completion_label, + error_label=error_label, + ) + + +def _begin_worker_stage( + worker_manager: Optional[WorkerManagerType], + cmd_name: str, + stage_tokens: Sequence[str], + config: Optional[Dict[str, Any]], + command_text: str, +) -> Optional[_WorkerStageSession]: + """Start a worker entry for an individual CLI stage.""" + description = " ".join(stage_tokens[1:]) if len(stage_tokens) > 1 else "(no args)" + return _start_worker_session( + worker_manager, + worker_type=cmd_name, + title=f"{cmd_name} stage", + description=description, + pipe_text=command_text, + config=config, + completion_label="Stage completed", + error_label="Stage error", + skip_logging_for={".worker", "worker", "workers"}, + ) + + +def _begin_pipeline_worker( + worker_manager: Optional[WorkerManagerType], + pipeline_text: str, + config: Optional[Dict[str, Any]], +) -> Optional[_WorkerStageSession]: + """Start a worker that represents the entire pipeline execution.""" + return _start_worker_session( + worker_manager, + worker_type="pipeline", + title="Pipeline run", + description=pipeline_text, + pipe_text=pipeline_text, + config=config, + completion_label="Pipeline completed", + error_label="Pipeline error", + ) + + +def _get_cmdlet_names() -> List[str]: + """Get list of all available cmdlet names.""" + try: + from cmdlets import REGISTRY + return sorted(set(REGISTRY.keys())) + except Exception: + return [] + + +def _get_cmdlet_args(cmd_name: str) -> List[str]: + """Get list of argument flags for a cmdlet (with - and -- prefixes).""" + try: + # Try to load CMDLET object from the module + mod_name = cmd_name.replace("-", "_") + + # Try importing as cmdlet first, then as root-level module + data = None + try: + mod = import_module(f"cmdlets.{mod_name}") + data = getattr(mod, "CMDLET", None) + except (ModuleNotFoundError, ImportError): + try: + # Try root-level modules like search_soulseek + mod = import_module(mod_name) + data = getattr(mod, "CMDLET", None) + except (ModuleNotFoundError, ImportError): + pass + + if data: + # If CMDLET is an object (not dict), use build_flag_registry if available + if not isinstance(data, dict) and hasattr(data, 'build_flag_registry'): + registry = data.build_flag_registry() + # Flatten all flags into a single list + all_flags = [] + for flag_set in registry.values(): + all_flags.extend(flag_set) + return sorted(all_flags) + + # Fallback for dict format or old style + args_list = data.get("args", 
[]) if isinstance(data, dict) else getattr(data, "args", []) + arg_names = [] + for arg in args_list: + if isinstance(arg, dict): + name = arg.get("name", "") + else: + name = getattr(arg, "name", "") + if name: + # Add both - and -- variants + arg_names.append(f"-{name}") + arg_names.append(f"--{name}") + return arg_names + return [] + except Exception: + return [] + + +def _get_arg_choices(cmd_name: str, arg_name: str) -> List[str]: + """Get list of valid choices for a specific cmdlet argument.""" + try: + mod_name = cmd_name.replace("-", "_") + try: + mod = import_module(f"cmdlets.{mod_name}") + data = getattr(mod, "CMDLET", None) + if data: + args_list = data.get("args", []) if isinstance(data, dict) else getattr(data, "args", []) + for arg in args_list: + if isinstance(arg, dict): + arg_obj_name = arg.get("name", "") + else: + arg_obj_name = getattr(arg, "name", "") + + if arg_obj_name == arg_name: + # Found matching arg, get choices + if isinstance(arg, dict): + return arg.get("choices", []) + else: + return getattr(arg, "choices", []) + except ModuleNotFoundError: + pass + return [] + except Exception: + return [] + + +if ( + PROMPT_TOOLKIT_AVAILABLE + and PromptSession is not None + and Completion is not None + and Completer is not None + and Document is not None +): + CompletionType = cast(Any, Completion) + + class CmdletCompleter(Completer): + """Custom completer for cmdlet REPL with autocomplete tied to cmdlet metadata.""" + + def __init__(self): + self.cmdlet_names = _get_cmdlet_names() + + def get_completions(self, document: Document, complete_event): # type: ignore[override] + """Generate completions for the current input.""" + text = document.text_before_cursor + tokens = text.split() + + if not tokens: + for cmd in self.cmdlet_names: + yield CompletionType(cmd, start_position=0) + elif len(tokens) == 1: + current = tokens[0].lower() + for cmd in self.cmdlet_names: + if cmd.startswith(current): + yield CompletionType(cmd, start_position=-len(current)) + for keyword in ["help", "exit", "quit"]: + if keyword.startswith(current): + yield CompletionType(keyword, start_position=-len(current)) + else: + cmd_name = tokens[0].replace("_", "-").lower() + current_token = tokens[-1].lower() + prev_token = tokens[-2].lower() if len(tokens) > 1 else "" + + choices = _get_arg_choices(cmd_name, prev_token) + if choices: + for choice in choices: + if choice.lower().startswith(current_token): + yield CompletionType(choice, start_position=-len(current_token)) + return + + arg_names = _get_cmdlet_args(cmd_name) + for arg in arg_names: + if arg.lower().startswith(current_token): + yield CompletionType(arg, start_position=-len(current_token)) + + if "--help".startswith(current_token): + yield CompletionType("--help", start_position=-len(current_token)) + + async def get_completions_async(self, document: Document, complete_event): # type: ignore[override] + for completion in self.get_completions(document, complete_event): + yield completion +else: # pragma: no cover - prompt toolkit unavailable + CmdletCompleter = None # type: ignore[assignment] + + +def _create_cmdlet_cli(): + """Create Typer CLI app for cmdlet-based commands.""" + if typer is None: + return None + + app = typer.Typer(help="Medeia-Macina CLI") + + @app.command("repl") + def repl(): + """Start interactive REPL for cmdlets with autocomplete.""" + banner = """ +Medeia-Macina +======================================= +Commands: help | exit | --help +Example: search-file --help + """ + print(banner) + + # Pre-acquire Hydrus session 
key at startup (like hub-ui does) + try: + config = _load_cli_config() + if config: + # Initialize debug logging + from helper.logger import set_debug, debug + debug_enabled = config.get("debug", False) + set_debug(debug_enabled) + if debug_enabled: + debug("✓ Debug logging enabled") + + try: + from helper.hydrus import get_client + get_client(config) # Pre-acquire and cache session key + debug("✓ Hydrus session key acquired") + except RuntimeError as e: + # Hydrus is not available - this is expected and normal + # Don't show a message, just continue without it + pass + except Exception as e: + debug(f"⚠ Could not pre-acquire Hydrus session key: {e}") + + # Check MPV availability at startup + try: + from hydrus_health_check import check_mpv_availability + check_mpv_availability() + except Exception as e: + debug(f"⚠ Could not check MPV availability: {e}") + except Exception: + pass # Silently ignore if config loading fails + + if PROMPT_TOOLKIT_AVAILABLE and PromptSession is not None and CmdletCompleter is not None: + completer = CmdletCompleter() + session = PromptSession(completer=cast(Any, completer)) + + def get_input(prompt: str = ">>>|") -> str: + return session.prompt(prompt) + + else: + def get_input(prompt: str = ">>>|") -> str: + return input(prompt) + + while True: + try: + user_input = get_input(">>>|").strip() + except (EOFError, KeyboardInterrupt): + print("\nGoodbye!") + break + + if not user_input: + continue + + low = user_input.lower() + if low in {"exit", "quit", "q"}: + print("Goodbye!") + break + + if low in {"help", "?"}: + _show_cmdlet_list() + continue + + pipeline_ctx_ref = None + try: + import pipeline as ctx # noqa: F401 + ctx.set_current_command_text(user_input) + pipeline_ctx_ref = ctx + except Exception: + pipeline_ctx_ref = None + + try: + import shlex + tokens = shlex.split(user_input) + except ValueError: + tokens = user_input.split() + + if not tokens: + continue + + # Handle special @.. 
selector to restore previous result table + if len(tokens) == 1 and tokens[0] == "@..": + try: + import pipeline as ctx + if ctx.restore_previous_result_table(): + # Check for overlay table first + if hasattr(ctx, 'get_display_table'): + last_table = ctx.get_display_table() + else: + last_table = None + + if last_table is None: + last_table = ctx.get_last_result_table() + + if last_table: + print() + # Also update current stage table so @N expansion works correctly + ctx.set_current_stage_table(last_table) + print(last_table.format_plain()) + else: + # Fallback to items if no table object + items = ctx.get_last_result_items() + if items: + # Clear current stage table if we only have items + ctx.set_current_stage_table(None) + print(f"Restored {len(items)} items (no table format available)") + else: + print("No previous result table in history") + else: + print("Result table history is empty") + except Exception as e: + print(f"Error restoring previous result table: {e}") + continue + + # Check for pipe operators to support chaining: cmd1 arg1 | cmd2 arg2 | cmd3 arg3 + # Also treat selection commands (@1, @*, etc) as pipelines so they can be expanded + try: + if '|' in tokens or (tokens and tokens[0].startswith('@')): + _execute_pipeline(tokens) + else: + cmd_name = tokens[0].replace("_", "-").lower() + is_help = any(arg in {"-help", "--help", "-h"} for arg in tokens[1:]) + + if is_help: + _show_cmdlet_help(cmd_name) + else: + # Execute the cmdlet + _execute_cmdlet(cmd_name, tokens[1:]) + finally: + if pipeline_ctx_ref: + pipeline_ctx_ref.clear_current_command_text() + + return app + + +def _execute_pipeline(tokens: list): + """Execute a pipeline of cmdlets separated by pipes (|). + + Example: cmd1 arg1 arg2 | cmd2 arg2 | cmd3 arg3 + """ + try: + from cmdlets import REGISTRY + import json + import pipeline as ctx + + # Split tokens by pipe operator + stages = [] + current_stage = [] + + for token in tokens: + if token == '|': + if current_stage: + stages.append(current_stage) + current_stage = [] + else: + current_stage.append(token) + + if current_stage: + stages.append(current_stage) + + if not stages: + print("Invalid pipeline syntax\n") + return + + # Load config relative to CLI root + config = _load_cli_config() + + # Check if the first stage has @ selection - if so, apply it before pipeline execution + first_stage_tokens = stages[0] if stages else [] + first_stage_selection_indices = [] + first_stage_had_extra_args = False + if first_stage_tokens: + # Look for @N, @N-M, @{N,M} in the first stage args + new_first_stage = [] + first_stage_select_all = False + for token in first_stage_tokens: + if token.startswith('@'): + selection = _parse_selection_syntax(token) + if selection is not None: + # This is a selection syntax - apply it to get initial piped_result + first_stage_selection_indices = sorted([i - 1 for i in selection]) + elif token == "@*": + # Special case: select all items + first_stage_select_all = True + else: + # Not a valid selection, keep as arg + new_first_stage.append(token) + else: + new_first_stage.append(token) + # Update first stage - if it's now empty (only had @N), keep the selection for later processing + if new_first_stage: + stages[0] = new_first_stage + # If we found selection indices but still have tokens, these are extra args + if first_stage_selection_indices or first_stage_select_all: + first_stage_had_extra_args = True + elif first_stage_selection_indices or first_stage_select_all: + # First stage was ONLY selection (@N or @*) - remove it and apply 
selection to next stage's input + stages.pop(0) + + # Execute each stage, threading results to the next + piped_result = None + worker_manager = _ensure_worker_manager(config) + pipeline_text = " | ".join(" ".join(stage) for stage in stages) + pipeline_session = _begin_pipeline_worker(worker_manager, pipeline_text, config) + pipeline_status = "completed" + pipeline_error = "" + + # Apply first-stage selection if present + if first_stage_selection_indices: + # Ensure we have a table context for expansion from previous command + if not ctx.get_current_stage_table_source_command(): + last_table = ctx.get_last_result_table() + if last_table: + ctx.set_current_stage_table(last_table) + + # Special check for YouTube search results BEFORE command expansion + # If we are selecting from a YouTube search, we want to force auto-piping to .pipe + # instead of trying to expand to a command (which search-file doesn't support well for re-execution) + source_cmd = ctx.get_current_stage_table_source_command() + source_args = ctx.get_current_stage_table_source_args() + + if source_cmd == 'search-file' and source_args and 'youtube' in source_args: + # Force fallback to item-based selection so we can auto-pipe + command_expanded = False + # Skip the command expansion block below + else: + # Try command-based expansion first if we have source command info + command_expanded = False + + if source_cmd: + # Try to find row args for the selected indices + selected_row_args = [] + for idx in first_stage_selection_indices: + row_args = ctx.get_current_stage_table_row_selection_args(idx) + if row_args: + selected_row_args.extend(row_args) + break # For now, take first selected row's args + + if selected_row_args: + # Success: Reconstruct the command with selection args + expanded_stage = [source_cmd] + source_args + selected_row_args + + if first_stage_had_extra_args: + # Append extra args from the first stage (e.g. @3 arg1 arg2) + expanded_stage += stages[0] + stages[0] = expanded_stage + else: + # Insert expanded command as first stage (it was popped earlier if it was only @N) + stages.insert(0, expanded_stage) + + log_msg = f"@N expansion: {source_cmd} + {' '.join(selected_row_args)}" + worker_manager.log_step(pipeline_session.worker_id, log_msg) if pipeline_session and worker_manager else None + + first_stage_selection_indices = [] # Clear, we've expanded it + command_expanded = True + + # If command-based expansion didn't work, fall back to item-based selection + if not command_expanded and first_stage_selection_indices: + # FALLBACK: Item-based selection (filter piped items directly) + last_piped_items = ctx.get_last_result_items() + if last_piped_items: + try: + filtered = [last_piped_items[i] for i in first_stage_selection_indices if 0 <= i < len(last_piped_items)] + if filtered: + piped_result = filtered if len(filtered) > 1 else filtered[0] + log_msg = f"Applied @N selection {' | '.join('@' + str(i+1) for i in first_stage_selection_indices)}" + worker_manager.log_step(pipeline_session.worker_id, log_msg) if pipeline_session and worker_manager else None + + # Special case for youtube search results in fallback mode: auto-pipe to .pipe + # This handles the case where @N is the ONLY stage (e.g. 
user typed "@1") + # In this case, stages is [['@1']], but we are in the fallback block because command_expanded is False + # We need to check if the source was youtube search + source_cmd = ctx.get_last_result_table_source_command() + source_args = ctx.get_last_result_table_source_args() + + if source_cmd == 'search-file' and source_args and 'youtube' in source_args: + print(f"Auto-piping YouTube selection to .pipe") + # We can't modify stages here easily as we are outside the loop or before it? + # Actually, this block runs BEFORE the loop if stages[0] is a selection. + # But wait, the loop iterates over stages. + # If we are here, it means we handled the selection by filtering `piped_result`. + # The loop will then execute stages starting from 0? + # No, `_execute_pipeline` logic is complex. + + # Let's look at where this block is. + # It is inside `_execute_pipeline`. + # It runs if `first_stage_selection_indices` is set (meaning stages[0] was a selection). + # And `command_expanded` is False (meaning we didn't replace stages[0] with a command). + + # If we are here, `piped_result` holds the selected item(s). + # The loop below iterates `for stage_index, stage_tokens in enumerate(stages):` + # But we removed the first stage from `stages`? No. + + # Wait, let's check how `first_stage_selection_indices` is used. + # It seems `stages` is modified earlier? + # "if stages and stages[0] and stages[0][0].startswith('@'): ... stages.pop(0)" + + # Yes, lines 750-760 (approx) pop the first stage if it is a selection. + # So `stages` now contains the REST of the pipeline. + # If user typed just `@1`, `stages` is now empty `[]`. + + # So if we want to pipe to `.pipe`, we should append `.pipe` to `stages`. + stages.append(['.pipe']) + + else: + print(f"No items matched selection in pipeline\n") + return + except (TypeError, IndexError) as e: + print(f"Error applying selection in pipeline: {e}\n") + return + else: + print(f"No previous results to select from\n") + return + + try: + for stage_index, stage_tokens in enumerate(stages): + if not stage_tokens: + continue + + cmd_name = stage_tokens[0].replace("_", "-").lower() + stage_args = stage_tokens[1:] + + # Check if this is a selection syntax (@N, @N-M, @{N,M,K}, @*, @3,5,7, @3-6,8) instead of a command + if cmd_name.startswith('@'): + selection = _parse_selection_syntax(cmd_name) + is_select_all = (cmd_name == "@*") + + if selection is not None or is_select_all: + # This is a selection stage + # Check if we should expand it to a full command instead of just filtering + should_expand_to_command = False + + # Check if piped_result contains format objects and we have expansion info + source_cmd = ctx.get_current_stage_table_source_command() + source_args = ctx.get_current_stage_table_source_args() + + if source_cmd == '.pipe' or source_cmd == '.adjective': + should_expand_to_command = True + elif source_cmd == 'search-file' and source_args and 'youtube' in source_args: + # Special case for youtube search results: @N expands to .pipe + if stage_index + 1 >= len(stages): + # Only auto-pipe if this is the last stage + print(f"Auto-piping YouTube selection to .pipe") + stages.append(['.pipe']) + # Force should_expand_to_command to False so we fall through to filtering + should_expand_to_command = False + + elif isinstance(piped_result, (list, tuple)): + first_item = piped_result[0] if piped_result else None + if isinstance(first_item, dict) and first_item.get('format_id') is not None: + # Format objects detected - check for source command + if 
source_cmd: + should_expand_to_command = True + elif isinstance(piped_result, dict) and piped_result.get('format_id') is not None: + # Single format object + if source_cmd: + should_expand_to_command = True + + # If expanding to command, replace this stage and re-execute + if should_expand_to_command and selection is not None: + source_cmd = ctx.get_current_stage_table_source_command() + source_args = ctx.get_current_stage_table_source_args() + selection_indices = sorted([i - 1 for i in selection]) + + # Get row args for first selected index + selected_row_args = [] + for idx in selection_indices: + row_args = ctx.get_current_stage_table_row_selection_args(idx) + if row_args: + selected_row_args.extend(row_args) + break + + if selected_row_args: + # Expand to full command + # Include any arguments passed to the selection command (e.g. @3 arg1 arg2) + extra_args = stage_tokens[1:] + expanded_stage = [source_cmd] + source_args + selected_row_args + extra_args + print(f"Expanding {cmd_name} to: {' '.join(expanded_stage)}") + + # Replace current stage and re-execute it + stages[stage_index] = expanded_stage + stage_tokens = expanded_stage + cmd_name = expanded_stage[0].replace("_", "-").lower() + stage_args = expanded_stage[1:] + + # Clear piped_result so the expanded command doesn't receive the format objects + piped_result = None + + # Don't continue - fall through to execute the expanded command + + # If not expanding, use as filter + if not should_expand_to_command: + # This is a selection stage - filter piped results + if piped_result is None: + print(f"No piped results to select from with {cmd_name}\n") + pipeline_status = "failed" + pipeline_error = f"Selection {cmd_name} without upstream results" + return + + # Normalize piped_result to always be a list for indexing + if isinstance(piped_result, dict) or not isinstance(piped_result, (list, tuple)): + piped_result_list = [piped_result] + else: + piped_result_list = piped_result + + # Get indices to select + if is_select_all: + # @* means select all items + selection_indices = list(range(len(piped_result_list))) + elif selection is not None: + # Convert to 0-based indices + selection_indices = sorted([i - 1 for i in selection]) + else: + selection_indices = [] + + try: + filtered = [piped_result_list[i] for i in selection_indices if 0 <= i < len(piped_result_list)] + if filtered: + piped_result = filtered if len(filtered) > 1 else filtered[0] + print(f"Selected {len(filtered)} item(s) using {cmd_name}") + continue + else: + print(f"No items matched selection {cmd_name}\n") + pipeline_status = "failed" + pipeline_error = f"Selection {cmd_name} matched nothing" + return + except (TypeError, IndexError) as e: + print(f"Error applying selection {cmd_name}: {e}\n") + pipeline_status = "failed" + pipeline_error = f"Selection error: {e}" + return + # If parse failed, treat as regular command name (will fail below) + + # Get the cmdlet function + cmd_fn = REGISTRY.get(cmd_name) + if not cmd_fn: + print(f"Unknown command in pipeline: {cmd_name}\n") + pipeline_status = "failed" + pipeline_error = f"Unknown command {cmd_name}" + return + + # Create pipeline context for this stage + is_last_stage = (stage_index == len(stages) - 1) + pipeline_ctx = ctx.PipelineStageContext(stage_index=stage_index, total_stages=len(stages)) + ctx.set_stage_context(pipeline_ctx) + ctx.set_active(True) + + # Execute the cmdlet with piped input + stage_session: Optional[_WorkerStageSession] = None + stage_status = "completed" + stage_error = "" + stage_label = f"[Stage 
{stage_index + 1}/{len(stages)}] {cmd_name}" + if pipeline_session and worker_manager: + try: + worker_manager.log_step(pipeline_session.worker_id, f"{stage_label} started") + except Exception: + pass + else: + stage_session = _begin_worker_stage( + worker_manager=worker_manager, + cmd_name=cmd_name, + stage_tokens=stage_tokens, + config=config, + command_text=" ".join(stage_tokens), + ) + try: + ret_code = cmd_fn(piped_result, stage_args, config) + + # Store emitted results for next stage (or display if last stage) + if pipeline_ctx.emits: + if is_last_stage: + # Last stage - display results + if RESULT_TABLE_AVAILABLE and ResultTable is not None and pipeline_ctx.emits: + table_title = _get_table_title_for_command(cmd_name, pipeline_ctx.emits) + + # Only set source_command for search/filter commands (not display-only or action commands) + # This preserves context so @N refers to the original search, not intermediate results + selectable_commands = { + 'search-file', 'download-data', 'search_file', 'download_data', + '.config', '.worker' + } + # Display-only commands (just show data, don't modify or search) + display_only_commands = { + 'get-url', 'get_url', 'get-note', 'get_note', + 'get-relationship', 'get_relationship', 'get-file', 'get_file', + 'check-file-status', 'check_file_status' + } + # Commands that manage their own table/history state (e.g. get-tag) + self_managing_commands = { + 'get-tag', 'get_tag', 'tags' + } + + if cmd_name in self_managing_commands: + # Command has already set the table and history + # Retrieve the table it set so we print the correct custom formatting + + # Check for overlay table first (e.g. get-tag) + if hasattr(ctx, 'get_display_table'): + table = ctx.get_display_table() + else: + table = None + + if table is None: + table = ctx.get_last_result_table() + + if table is None: + # Fallback if something went wrong + table = ResultTable(table_title) + for emitted in pipeline_ctx.emits: + table.add_result(emitted) + else: + table = ResultTable(table_title) + for emitted in pipeline_ctx.emits: + table.add_result(emitted) + + if cmd_name in selectable_commands: + table.set_source_command(cmd_name, stage_args) + ctx.set_last_result_table(table, pipeline_ctx.emits) + elif cmd_name in display_only_commands: + # Display-only: show table but preserve search context + ctx.set_last_result_items_only(pipeline_ctx.emits) + else: + # Action commands (add-*, delete-*): update items only, don't change table/history + ctx.set_last_result_items_only(pipeline_ctx.emits) + + print() + print(table.format_plain()) + else: + for emitted in pipeline_ctx.emits: + if isinstance(emitted, dict): + print(json.dumps(emitted, indent=2)) + else: + print(emitted) + # For display-only results, also preserve context by not calling set_last_result_table + else: + # Intermediate stage - thread to next stage + piped_result = pipeline_ctx.emits + ctx.set_last_result_table(None, pipeline_ctx.emits) + + if ret_code != 0: + stage_status = "failed" + stage_error = f"exit code {ret_code}" + print(f"[stage {stage_index} exit code: {ret_code}]\n") + if pipeline_session: + pipeline_status = "failed" + pipeline_error = f"{stage_label} failed ({stage_error})" + return + + except Exception as e: + stage_status = "failed" + stage_error = f"{type(e).__name__}: {e}" + print(f"[error in stage {stage_index} ({cmd_name})]: {type(e).__name__}: {e}\n") + import traceback + traceback.print_exc() + if pipeline_session: + pipeline_status = "failed" + pipeline_error = f"{stage_label} error: {e}" + return + finally: + 
if stage_session: + stage_session.close(status=stage_status, error_msg=stage_error) + elif pipeline_session and worker_manager: + try: + worker_manager.log_step( + pipeline_session.worker_id, + f"{stage_label} {'completed' if stage_status == 'completed' else 'failed'}", + ) + except Exception: + pass + + # If we have a result but no stages left (e.g. pure selection @3 that didn't expand to a command), display it + if not stages and piped_result is not None: + if RESULT_TABLE_AVAILABLE and ResultTable is not None: + # Create a simple table for the result + table = ResultTable("Selection Result") + + # Normalize to list + items = piped_result if isinstance(piped_result, list) else [piped_result] + + for item in items: + table.add_result(item) + + # Preserve context for further selection + ctx.set_last_result_items_only(items) + + print() + print(table.format_plain()) + else: + print(piped_result) + + except Exception as e: + pipeline_status = "failed" + pipeline_error = str(e) + print(f"[error] Failed to execute pipeline: {e}\n") + import traceback + traceback.print_exc() + finally: + if pipeline_session: + pipeline_session.close(status=pipeline_status, error_msg=pipeline_error) + + except Exception as e: + print(f"[error] Failed to execute pipeline: {e}\n") + import traceback + traceback.print_exc() + + +def _execute_cmdlet(cmd_name: str, args: list): + """Execute a cmdlet with the given arguments. + + Supports @ selection syntax for filtering results from previous commands: + - @2 - select row 2 + - @2-5 - select rows 2-5 + - @{1,3,5} - select rows 1, 3, 5 + """ + try: + from cmdlets import REGISTRY + import json + import pipeline as ctx + + # Get the cmdlet function + cmd_fn = REGISTRY.get(cmd_name) + if not cmd_fn: + print(f"Unknown command: {cmd_name}\n") + return + + # Load config relative to CLI root + config = _load_cli_config() + + # Check for @ selection syntax in arguments + # Extract @N, @N-M, @{N,M,P} syntax and remove from args + filtered_args = [] + selected_indices = [] + + for arg in args: + if arg.startswith('@'): + # Parse selection: @2, @2-5, @{1,3,5} + selection_str = arg[1:] # Remove @ + try: + if '{' in selection_str and '}' in selection_str: + # @{1,3,5} format + selection_str = selection_str.strip('{}') + selected_indices = [int(x.strip()) - 1 for x in selection_str.split(',')] + elif '-' in selection_str: + # @2-5 format + parts = selection_str.split('-') + start = int(parts[0]) - 1 + end = int(parts[1]) + selected_indices = list(range(start, end)) + else: + # @2 format + selected_indices = [int(selection_str) - 1] + except (ValueError, IndexError): + # Invalid format, treat as regular arg + # Special case: @"string" should be treated as "string" (stripping @) + # This allows adding new items via @"New Item" syntax + if selection_str.startswith('"') or selection_str.startswith("'"): + filtered_args.append(selection_str.strip('"\'')) + else: + filtered_args.append(arg) + else: + filtered_args.append(arg) + + # Get piped items from previous command results + piped_items = ctx.get_last_result_items() + pipeline_ctx = ctx.PipelineStageContext(stage_index=0, total_stages=1) + ctx.set_stage_context(pipeline_ctx) + ctx.set_active(True) + + # Create result object - pass full list (or filtered list if @ selection used) to cmdlet + result = None + if piped_items: + if selected_indices: + # Filter to selected indices only + result = [piped_items[idx] for idx in selected_indices if 0 <= idx < len(piped_items)] + else: + # No selection specified, pass all items (cmdlets handle 
lists via normalize_result_input) + result = piped_items + + worker_manager = _ensure_worker_manager(config) + stage_session = _begin_worker_stage( + worker_manager=worker_manager, + cmd_name=cmd_name, + stage_tokens=[cmd_name, *filtered_args], + config=config, + command_text=" ".join([cmd_name, *filtered_args]).strip() or cmd_name, + ) + stage_status = "completed" + stage_error = "" + + # Execute the cmdlet + ctx.set_last_selection(selected_indices) + try: + ret_code = cmd_fn(result, filtered_args, config) + + # Print emitted results using ResultTable for structured output + if pipeline_ctx.emits: + if RESULT_TABLE_AVAILABLE and ResultTable is not None and pipeline_ctx.emits: + # Check if these are format objects (from download-data format selection) + # Format objects have format_id and should not be displayed as a table + is_format_selection = False + if pipeline_ctx.emits and len(pipeline_ctx.emits) > 0: + first_emit = pipeline_ctx.emits[0] + if isinstance(first_emit, dict) and 'format_id' in first_emit: + is_format_selection = True + + # Skip table display for format selection - user will use @N to select + if is_format_selection: + # Store items for @N selection but don't display table + ctx.set_last_result_items_only(pipeline_ctx.emits) + else: + # Try to format as a table if we have search results + table_title = _get_table_title_for_command(cmd_name, pipeline_ctx.emits) + + # Only set source_command for search/filter commands (not display-only or action commands) + # This preserves context so @N refers to the original search, not intermediate results + selectable_commands = { + 'search-file', 'download-data', 'search_file', 'download_data', + '.config', '.worker' + } + # Display-only commands (excluding get-tag which manages its own table) + display_only_commands = { + 'get-url', 'get_url', 'get-note', 'get_note', + 'get-relationship', 'get_relationship', 'get-file', 'get_file', + 'check-file-status', 'check_file_status' + } + # Commands that manage their own table/history state (e.g. get-tag) + self_managing_commands = { + 'get-tag', 'get_tag', 'tags' + } + + if cmd_name in self_managing_commands: + # Command has already set the table and history + # Retrieve the table it set so we print the correct custom formatting + table = ctx.get_last_result_table() + if table is None: + # Fallback if something went wrong + table = ResultTable(table_title) + for emitted in pipeline_ctx.emits: + table.add_result(emitted) + else: + table = ResultTable(table_title) + for emitted in pipeline_ctx.emits: + table.add_result(emitted) + + if cmd_name in selectable_commands: + table.set_source_command(cmd_name, filtered_args) + ctx.set_last_result_table(table, pipeline_ctx.emits) + # Clear any stale current_stage_table (e.g. from previous download-data formats) + # This ensures @N refers to these new results, not old format selections + ctx.set_current_stage_table(None) + elif cmd_name in display_only_commands: + # Display-only: show table but preserve search context + ctx.set_last_result_items_only(pipeline_ctx.emits) + else: + # Action commands: update items only without changing current table or history + ctx.set_last_result_items_only(pipeline_ctx.emits) + + print() + print(table.format_plain()) + + # Special case: if this was a youtube search, print a hint about auto-piping + if cmd_name == 'search-file' and filtered_args and 'youtube' in filtered_args: + print("\n[Hint] Type @N to play a video in MPV (e.g. 
@1)") + else: + # Fallback to raw output if ResultTable not available + for emitted in pipeline_ctx.emits: + if isinstance(emitted, dict): + print(json.dumps(emitted, indent=2)) + else: + print(emitted) + + # Store emitted items for @ selection + selectable_commands = { + 'search-file', 'download-data', 'search_file', 'download_data', + '.config', '.worker' + } + display_only_commands = { + 'get-url', 'get_url', 'get-note', 'get_note', + 'get-relationship', 'get_relationship', 'get-file', 'get_file', + 'check-file-status', 'check_file_status' + } + self_managing_commands = { + 'get-tag', 'get_tag', 'tags' + } + + if cmd_name in self_managing_commands: + pass # Already handled by cmdlet + elif cmd_name in selectable_commands: + ctx.set_last_result_table(None, pipeline_ctx.emits) + elif cmd_name in display_only_commands: + ctx.set_last_result_items_only(pipeline_ctx.emits) + else: + # Action commands: items only, don't change table/history + ctx.set_last_result_items_only(pipeline_ctx.emits) + + if ret_code != 0: + stage_status = "failed" + stage_error = f"exit code {ret_code}" + print(f"[exit code: {ret_code}]\n") + except Exception as e: + stage_status = "failed" + stage_error = f"{type(e).__name__}: {e}" + print(f"[error] {type(e).__name__}: {e}\n") + finally: + ctx.clear_last_selection() + if stage_session: + stage_session.close(status=stage_status, error_msg=stage_error) + except Exception as e: + print(f"[error] Failed to execute cmdlet: {e}\n") + + +def _show_cmdlet_list(): + """Display available cmdlets with full metadata: cmd:name alias:aliases args:args.""" + try: + from cmdlets import REGISTRY + import os + + # Collect unique commands by scanning cmdlet modules + cmdlet_info = {} + cmdlets_dir = os.path.join(os.path.dirname(__file__), "cmdlets") + + # Iterate through cmdlet files + for filename in os.listdir(cmdlets_dir): + if filename.endswith(".py") and not filename.startswith("_"): + mod_name = filename[:-3] + try: + mod = import_module(f"cmdlets.{mod_name}") + if hasattr(mod, "CMDLET"): + cmdlet = getattr(mod, "CMDLET") + # Extract name, aliases, and args + if hasattr(cmdlet, "name"): + cmd_name = cmdlet.name + aliases = [] + if hasattr(cmdlet, "aliases"): + aliases = cmdlet.aliases + + # Extract argument names + arg_names = [] + if hasattr(cmdlet, "args"): + for arg in cmdlet.args: + if hasattr(arg, "name"): + arg_names.append(arg.name) + elif isinstance(arg, dict): + arg_names.append(arg.get("name", "")) + + # Store info (skip if already seen) + if cmd_name not in cmdlet_info: + cmdlet_info[cmd_name] = { + "aliases": aliases, + "args": arg_names, + } + except Exception: + # If we can't import the module, try to get info from REGISTRY + pass + + # Also check root-level cmdlets (search_*, etc) + # Note: search_libgen, search_soulseek, and search_debrid are consolidated into search-file with providers + for mod_name in ["select_cmdlet", "unlock_link"]: + try: + mod = import_module(mod_name) + if hasattr(mod, "CMDLET"): + cmdlet = getattr(mod, "CMDLET") + if hasattr(cmdlet, "name"): + cmd_name = cmdlet.name + aliases = [] + if hasattr(cmdlet, "aliases"): + aliases = cmdlet.aliases + + # Extract argument names + arg_names = [] + if hasattr(cmdlet, "args"): + for arg in cmdlet.args: + if hasattr(arg, "name"): + arg_names.append(arg.name) + elif isinstance(arg, dict): + arg_names.append(arg.get("name", "")) + + if cmd_name not in cmdlet_info: + cmdlet_info[cmd_name] = { + "aliases": aliases, + "args": arg_names, + } + except Exception: + pass + + # Fallback: Show registry entries 
that we don't have full metadata for + # This ensures all registered cmdlets are shown even if they have import errors + seen_names = set() + for cmd_name in cmdlet_info.keys(): + seen_names.add(cmd_name) + + # For aliases, add them too + for cmd_name in list(cmdlet_info.keys()): + for alias in cmdlet_info[cmd_name].get("aliases", []): + seen_names.add(alias) + + # Now check registry for any missing cmdlets + for reg_name in REGISTRY.keys(): + if reg_name not in seen_names: + # Add this as a basic cmdlet entry + # Try to find a matching primary name + found_match = False + for cmd_name in cmdlet_info.keys(): + if reg_name in cmdlet_info[cmd_name].get("aliases", []): + found_match = True + break + + if not found_match: + # This is a top-level cmdlet not in our collection + cmdlet_info[reg_name] = { + "aliases": [], + "args": [], + } + + print("\nAvailable cmdlets:") + for cmd_name in sorted(cmdlet_info.keys()): + info = cmdlet_info[cmd_name] + aliases = info["aliases"] + args = info["args"] + + # Build the display string + display = f" cmd:{cmd_name}" + + if aliases: + alias_str = ", ".join(aliases) + display += f" alias:{alias_str}" + + if args: + args_str = ", ".join(args) + display += f" args:{args_str}" + + print(display) + + print() + except Exception as e: + print(f"Error: {e}\n") + + +def _show_cmdlet_help(cmd_name: str): + """Display help for a cmdlet.""" + try: + mod_name = cmd_name.replace("-", "_") + try: + mod = import_module(f"cmdlets.{mod_name}") + data = getattr(mod, "CMDLET", None) + if data: + _print_metadata(cmd_name, data) + return + except ModuleNotFoundError: + pass + + from cmdlets import REGISTRY + cmd_fn = REGISTRY.get(cmd_name) + if cmd_fn: + owner = import_module(getattr(cmd_fn, "__module__", "")) + data = getattr(owner, "CMDLET", None) + if data: + _print_metadata(cmd_name, data) + return + + print(f"Unknown command: {cmd_name}\n") + except Exception as e: + print(f"Error: {e}\n") + + +def _print_metadata(cmd_name: str, data): + """Print cmdlet metadata in PowerShell-style format.""" + d = data.to_dict() if hasattr(data, "to_dict") else data + if not isinstance(d, dict): + print(f"Invalid metadata for {cmd_name}\n") + return + + name = d.get('name', cmd_name) + summary = d.get("summary", "") + usage = d.get("usage", "") + description = d.get("description", "") + args = d.get("args", []) + details = d.get("details", []) + + # NAME section + print(f"\nNAME") + print(f" {name}") + + # SYNOPSIS section + print(f"\nSYNOPSIS") + if usage: + # Format usage similar to PowerShell syntax + print(f" {usage}") + else: + print(f" {name}") + + # DESCRIPTION section + if summary or description: + print(f"\nDESCRIPTION") + if summary: + print(f" {summary}") + if description: + print(f" {description}") + + # PARAMETERS section + if args and isinstance(args, list): + print(f"\nPARAMETERS") + for arg in args: + if isinstance(arg, dict): + name_str = arg.get("name", "?") + typ = arg.get("type", "string") + required = arg.get("required", False) + desc = arg.get("description", "") + else: + name_str = getattr(arg, "name", "?") + typ = getattr(arg, "type", "string") + required = getattr(arg, "required", False) + desc = getattr(arg, "description", "") + + # Format: -Name [required flag] + req_marker = "[required]" if required else "[optional]" + print(f" -{name_str} <{typ}>") + if desc: + print(f" {desc}") + print(f" {req_marker}") + print() + + # REMARKS/DETAILS section + if details: + print(f"REMARKS") + for detail in details: + print(f" {detail}") + print() + + +# 
============================================================================ +# SELECTION UTILITIES - Consolidated from selection_syntax.py and select_utils.py +# ============================================================================ + +def _parse_selection_syntax(token: str) -> Optional[Set[int]]: + """Parse @ selection syntax into a set of 1-based indices. + + Args: + token: Token starting with @ (e.g., "@2", "@2-5", "@{1,3,5}", "@*", "@3,5,7", "@3-6,8") + + + Returns: + Set of 1-based indices (for concrete selections like @1, @2-5, @3,5,7) + None for special cases: @* (all), @.. (restore previous) + None for invalid format + + Special handling: + - @* returns None and should be handled as "select all current items" + - @.. returns None and is handled as "restore previous table" (separate code path) + - Invalid selections like @-1 or @a return None and are treated as invalid args + + Examples: + "@2" → {2} + "@2-5" → {2, 3, 4, 5} + "@{2,5,6}" → {2, 5, 6} + "@2,5,6" → {2, 5, 6} + "@2-5,8,10-12" → {2, 3, 4, 5, 8, 10, 11, 12} + "@*" → None (caller checks token=="@*" to handle as "all") + "@.." → None (separate code path) + """ + if not token.startswith("@"): + return None + + selector = token[1:].strip() + + # Special case: @.. means restore previous result table (handled separately) + # Special case: @* means all items (should be converted to actual list by caller) + if selector in (".", "*"): + return None + + indices = set() + + # Handle set notation: @{2,5,6,7} (convert to standard format) + if selector.startswith("{") and selector.endswith("}"): + selector = selector[1:-1] + + # Handle mixed comma and range notation: @2,5,7-9,10 or @2-5,8,10-12 + parts = selector.split(",") + + for part in parts: + part = part.strip() + if not part: + continue + + try: + if "-" in part: + # Range notation: 2-5 or 7-9 + range_parts = part.split("-", 1) # Split on first - only (in case of negative numbers) + if len(range_parts) == 2: + start_str = range_parts[0].strip() + end_str = range_parts[1].strip() + + # Make sure both are valid positive integers + if start_str and end_str: + start = int(start_str) + end = int(end_str) + if start > 0 and end > 0 and start <= end: + indices.update(range(start, end + 1)) + else: + return None # Invalid range + else: + return None + else: + return None + else: + # Single number + num = int(part) + if num > 0: + indices.add(num) + else: + return None + except (ValueError, AttributeError): + return None + + return indices if indices else None + + +def _filter_items_by_selection(items: List, selection: Optional[Set[int]]) -> List: + """Filter items by 1-based selection indices. + + Args: + items: List of items to filter + selection: Set of 1-based indices, or None for all items + + Returns: + Filtered list of items in original order + + Examples: + _filter_items_by_selection([a, b, c, d], {2, 4}) → [b, d] + _filter_items_by_selection([a, b, c, d], None) → [a, b, c, d] + """ + if selection is None or len(selection) == 0: + return items + + filtered = [] + for i, item in enumerate(items, start=1): + if i in selection: + filtered.append(item) + + return filtered + + +def _parse_line_selection(args: Sequence[str]) -> Set[int]: + """Parse selection arguments to indices. 
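+
+    Example (illustrative):
+        ["1-3", "5"] → {0, 1, 2, 4}  (1-based input, 0-based output)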
+ + Args: + args: Line numbers and ranges (1-indexed) + Examples: ["3"], ["1", "3", "5"], ["1-3"] + + Returns: + Set of 0-indexed line numbers to select + + Raises: + ValueError: If selection is invalid + """ + selected_indices: Set[int] = set() + + for arg in args: + arg = str(arg).strip() + + # Check if it's a range (e.g., "1-3") + if '-' in arg and not arg.startswith('-'): + try: + parts = arg.split('-') + if len(parts) == 2: + start = int(parts[0]) - 1 # Convert to 0-indexed + end = int(parts[1]) # End is exclusive in range + for i in range(start, end): + selected_indices.add(i) + else: + raise ValueError(f"Invalid range format: {arg}") + except ValueError as e: + raise ValueError(f"Invalid range: {arg}") from e + else: + # Single line number (1-indexed) + try: + line_num = int(arg) + idx = line_num - 1 # Convert to 0-indexed + selected_indices.add(idx) + except ValueError: + raise ValueError(f"Invalid line number: {arg}") + + return selected_indices + + +def _validate_indices(selected_indices: Set[int], total_lines: int) -> List[str]: + """Validate indices are within bounds. + + Args: + selected_indices: Set of 0-indexed line numbers + total_lines: Total number of available lines + + Returns: + List of error messages (empty if all valid) + """ + errors = [] + for idx in selected_indices: + if idx < 0 or idx >= total_lines: + errors.append(f"Line {idx + 1} out of range (1-{total_lines})") + return errors + + +def _select_lines(lines: List[str], selected_indices: Set[int]) -> List[str]: + """Select specific lines from input. + + Args: + lines: List of input lines + selected_indices: Set of 0-indexed line numbers to select + + Returns: + List of selected lines in order + """ + selected_indices_sorted = sorted(selected_indices) + return [lines[idx] for idx in selected_indices_sorted] + + +# Keep helper references so static analyzers treat them as used in this module. +_SELECTION_HELPERS = ( + _filter_items_by_selection, + _parse_line_selection, + _validate_indices, + _select_lines, +) + + +def main(): + """Entry point for the CLI.""" + app = _create_cmdlet_cli() + if app: + app() + else: + print("Typer not available") + + +if __name__ == "__main__": + main() diff --git a/README copy.md b/README copy.md new file mode 100644 index 0000000..57480fa --- /dev/null +++ b/README copy.md @@ -0,0 +1,64 @@ +# Medeia-Macina + +A powerful CLI media management and search platform integrating local files, Hydrus, torrents, books, and P2P networks. + +## Key Features +* **Unified Search**: Search across Local, Hydrus, LibGen, Soulseek, and Debrid. +* **Pipeline Architecture**: Chain commands like PowerShell (e.g., `search | filter | download`). +* **Smart Selection**: Use `@N` syntax to interact with results. +* **Metadata Management**: Tagging, notes, and relationships. + +## Installation +1. Install Python 3.9+ and [Deno](https://deno.com/) (for YouTube support). +2. Install dependencies: `pip install -r requirements.txt` +3. Run the CLI: `python CLI.py` + +## Command Examples + +### Search & Download +```powershell +# Search and download the first result +search-file "daughter" | @1 | download-data + +# Search specific provider and download +search-file -provider libgen "dune" | @1 | download-data + +# Download YouTube video (auto-probes formats) +download-data "https://youtube.com/watch?v=..." 
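+# (the probe emits the format list used by the @ selection below)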
+# Select format #2 from the list +@2 | download-data +``` + +### File Management +```powershell +# Add file to Hydrus +add-file -path "C:\Videos\movie.mp4" -storage hydrus + +# Upload to 0x0.st and associate URL with Hydrus file +search-file "my_video" | @1 | add-file -provider 0x0 + +# Add tags to a file +search-file "video" | @1 | add-tag "creator:someone, character:hero" + +# Use tag lists (from helper/adjective.json) +@1 | add-tag "{gnostic}" +``` + +### Metadata & Notes +```powershell +# Add a note +search-file "doc" | @1 | add-note "comment" "This is important" + +# Get tags +search-file "image" | @1 | get-tag +``` + +### Pipeline Syntax +* `|` : Pipe results from one command to another. +* `@N` : Select the Nth item from the previous result (e.g., `@1`). +* `@N-M` : Select a range (e.g., `@1-5`). +* `@{1,3,5}` : Select specific items. +* `@*` : Select all items. + +## Configuration +Edit `config.json` to set API keys (AllDebrid, OpenAI), storage paths, and Hydrus credentials. diff --git a/TUI/__init__.py b/TUI/__init__.py new file mode 100644 index 0000000..f6b5106 --- /dev/null +++ b/TUI/__init__.py @@ -0,0 +1 @@ +"""Medeia-Macina TUI - Terminal User Interface.""" diff --git a/TUI/menu_actions.py b/TUI/menu_actions.py new file mode 100644 index 0000000..7c882c2 --- /dev/null +++ b/TUI/menu_actions.py @@ -0,0 +1,105 @@ +"""Utilities that drive the modern Textual UI menus and presets.""" + +from __future__ import annotations + +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional, Sequence + +BASE_DIR = Path(__file__).resolve().parent +ROOT_DIR = BASE_DIR.parent +for path in (ROOT_DIR, BASE_DIR): + str_path = str(path) + if str_path not in sys.path: + sys.path.insert(0, str_path) + +import metadata + + +@dataclass(slots=True) +class PipelinePreset: + """Simple descriptor for a reusable pipeline.""" + + label: str + description: str + pipeline: str + + +PIPELINE_PRESETS: List[PipelinePreset] = [ + PipelinePreset( + label="Download → Merge → Local", + description="Use download-data with playlist auto-selection, merge the pieces, tag, then import into local storage.", + pipeline='download-data "" | merge-file | add-tag | add-file -storage local', + ), + PipelinePreset( + label="Download → Hydrus", + description="Fetch media, auto-tag, and push directly into Hydrus.", + pipeline='download-data "" | merge-file | add-tag | add-file -storage hydrus', + ), + PipelinePreset( + label="Search Local Library", + description="Run search-file against the local library and emit a result table for further piping.", + pipeline='search-file -library local -query ""', + ), +] + + +def load_tags(file_path: Path) -> List[str]: + """Read tags for a file using metadata.py as the single source of truth.""" + + try: + return metadata.read_tags_from_file(file_path) + except Exception: + return [] + + +def group_tags_by_namespace(tags: Sequence[str]) -> Dict[str, List[str]]: + """Return tags grouped by namespace for quick UI summaries.""" + + grouped: Dict[str, List[str]] = {} + for tag in metadata.normalize_tags(list(tags)): + namespace, value = metadata.split_tag(tag) + key = namespace or "_untagged" + grouped.setdefault(key, []).append(value) + + for items in grouped.values(): + items.sort() + return grouped + + +def build_metadata_snapshot(file_path: Path) -> Dict[str, Any]: + """Load any available sidecar metadata for the selected file.""" + + snapshot: Dict[str, Any] = { + "file": str(file_path), + "tags": 
group_tags_by_namespace(load_tags(file_path)), + } + + try: + sidecar = metadata._derive_sidecar_path(file_path) + if sidecar.is_file(): + title, tags, notes = metadata._read_sidecar_metadata(sidecar) + snapshot["sidecar"] = { + "title": title, + "tags": group_tags_by_namespace(tags), + "notes": notes, + } + except Exception: + snapshot["sidecar"] = None + + return snapshot + + +def summarize_result(result: Dict[str, Any]) -> str: + """Build a one-line summary for a pipeline result row.""" + + title = result.get("title") or result.get("identifier") or result.get("file_path") + source = result.get("source") or result.get("cmdlet") or "result" + return f"{source}: {title}" if title else source + + +def normalize_tags(tags: Iterable[str]) -> List[str]: + """Expose metadata.normalize_tags for callers that imported the old helper.""" + + return metadata.normalize_tags(list(tags)) diff --git a/TUI/modalscreen/__init__.py b/TUI/modalscreen/__init__.py new file mode 100644 index 0000000..1cd127c --- /dev/null +++ b/TUI/modalscreen/__init__.py @@ -0,0 +1,7 @@ +"""Modal screens for the Downlow Hub UI application.""" + +from .export import ExportModal +from .search import SearchModal +from .workers import WorkersModal + +__all__ = ["ExportModal", "SearchModal", "WorkersModal"] diff --git a/TUI/modalscreen/access.py b/TUI/modalscreen/access.py new file mode 100644 index 0000000..1f00cf8 --- /dev/null +++ b/TUI/modalscreen/access.py @@ -0,0 +1,139 @@ +"""Modal for displaying files/URLs to access in web mode.""" + +from textual.screen import ModalScreen +from textual.containers import Container, Vertical, Horizontal +from textual.widgets import Static, Button, Label +from textual.app import ComposeResult +import logging + +logger = logging.getLogger(__name__) + + +class AccessModal(ModalScreen): + """Modal to display a file/URL that can be accessed from phone browser.""" + + CSS = """ + Screen { + align: center middle; + } + + #access-container { + width: 80; + height: auto; + border: thick $primary; + background: $surface; + } + + #access-header { + dock: top; + height: 3; + background: $boost; + border-bottom: solid $accent; + content-align: center middle; + } + + #access-content { + height: auto; + width: 1fr; + padding: 1 2; + border-bottom: solid $accent; + } + + #access-footer { + dock: bottom; + height: 3; + background: $boost; + border-top: solid $accent; + align: center middle; + } + + .access-url { + width: 1fr; + height: auto; + margin-bottom: 1; + border: solid $accent; + padding: 1; + } + + .access-label { + width: 1fr; + height: auto; + margin-bottom: 1; + } + + Button { + margin-right: 1; + } + """ + + def __init__(self, title: str, content: str, is_url: bool = False): + """Initialize access modal. 
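+
+        Example (illustrative; hypothetical URL):
+            AccessModal("My Video", "http://192.168.0.10:8000/files/abc.mp4", is_url=True)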
+ + Args: + title: Title of the item being accessed + content: The URL or file path + is_url: Whether this is a URL (True) or file path (False) + """ + super().__init__() + self.item_title = title + self.item_content = content + self.is_url = is_url + + def compose(self) -> ComposeResult: + """Create the modal layout.""" + with Container(id="access-container"): + with Vertical(id="access-header"): + yield Label(f"[bold]{self.item_title}[/bold]") + yield Label("[dim]Click link below to open in your browser[/dim]") + + with Vertical(id="access-content"): + if self.is_url: + yield Label("[bold cyan]Link:[/bold cyan]", classes="access-label") + else: + yield Label("[bold cyan]File:[/bold cyan]", classes="access-label") + + # Display as clickable link using HTML link element for web mode + # Rich link markup `[link=URL]` has parsing issues with URLs containing special chars + # Instead, use the HTML link markup that Textual-serve renders as tag + # Format: [link=URL "tooltip"]text[/link] - the quotes help with parsing + link_text = f'[link="{self.item_content}"]Open in Browser[/link]' + content_box = Static(link_text, classes="access-url") + yield content_box + + # Also show the URL for reference/copying + yield Label(self.item_content, classes="access-label") + + yield Label("\n[yellow]↑ Click the link above to open on your device[/yellow]", classes="access-label") + + with Horizontal(id="access-footer"): + yield Button("Copy URL", id="copy-btn", variant="primary") + yield Button("Close", id="close-btn", variant="default") + + def on_button_pressed(self, event: Button.Pressed) -> None: + """Handle button presses.""" + if event.button.id == "copy-btn": + # Copy to clipboard (optional - not critical if fails) + logger.info(f"Attempting to copy: {self.item_content}") + try: + # Try to use pyperclip if available + try: + import pyperclip + pyperclip.copy(self.item_content) + logger.info("URL copied to clipboard via pyperclip") + except ImportError: + # Fallback: try xclip on Linux or pbcopy on Mac + import subprocess + import sys + if sys.platform == "win32": + # Windows: use clipboard via pyperclip (already tried) + logger.debug("Windows clipboard not available without pyperclip") + else: + # Linux/Mac + process = subprocess.Popen(['xclip', '-selection', 'clipboard'], stdin=subprocess.PIPE) + process.communicate(self.item_content.encode('utf-8')) + logger.info("URL copied to clipboard via xclip") + except Exception as e: + logger.debug(f"Clipboard copy not available: {e}") + # Not critical - just informational + elif event.button.id == "close-btn": + self.dismiss() diff --git a/TUI/modalscreen/download.py b/TUI/modalscreen/download.py new file mode 100644 index 0000000..32cb763 --- /dev/null +++ b/TUI/modalscreen/download.py @@ -0,0 +1,1880 @@ +"""Download request modal screen for initiating new downloads. + +This modal allows users to specify: +- URL or search query (paragraph) +- Tags to apply +- Source (Hydrus, local, AllDebrid, etc.) 
+- Actions (download, screenshot) +""" + +from textual.app import ComposeResult +from textual.screen import ModalScreen +from textual.containers import Container, Horizontal, Vertical, ScrollableContainer +from textual.widgets import Static, Button, Label, Select, Checkbox, TextArea, ProgressBar, Tree, Input +from textual.binding import Binding +from textual import work +import logging +from typing import Optional, Callable, Any +from pathlib import Path +import sys + +from helper.logger import log +import json + +# Add parent directory to path for imports +sys.path.insert(0, str(Path(__file__).parent.parent)) + +# Import cmdlets system to call get-tag +try: + from cmdlets import get as get_cmdlet +except ImportError: + get_cmdlet = None + +# Import tag processing helpers +try: + from metadata import expand_tag_lists, process_tags_from_string +except ImportError: + expand_tag_lists = None + process_tags_from_string = None + +logger = logging.getLogger(__name__) + + +class DownloadModal(ModalScreen): + """Modal screen for initiating new download requests.""" + + BINDINGS = [ + Binding("escape", "cancel", "Cancel"), + Binding("ctrl+enter", "submit", "Submit"), + ] + + CSS_PATH = "download.tcss" + + def __init__( + self, + on_submit: Optional[Callable[[dict], None]] = None, + available_sources: Optional[list] = None, + config: Optional[dict] = None + ): + """Initialize the download modal. + + Args: + on_submit: Callback function that receives download request dict + available_sources: List of available source names (e.g., ['hydrus', 'local', 'alldebrid']) + config: Configuration dict with download settings + """ + super().__init__() + self.on_submit = on_submit + self.available_sources = available_sources or ['hydrus', 'local', 'alldebrid'] + self.config = config or {} + + # UI Component references + self.paragraph_textarea: TextArea = None # type: ignore + self.tags_textarea: TextArea = None # type: ignore + self.source_select: Select = None # type: ignore + self.files_select: Select = None # type: ignore + self.download_checkbox: Checkbox = None # type: ignore + self.screenshot_checkbox: Checkbox = None # type: ignore + self.progress_bar: ProgressBar = None # type: ignore + self.selected_files: set = set() # Track selected files + + # Playlist support + self.playlist_tree: Tree = None # type: ignore + self.playlist_input: Input = None # type: ignore + self.playlist_merge_checkbox: Checkbox = None # type: ignore + self.is_playlist: bool = False # Track if current URL is a playlist + self.playlist_items: list = [] # Store playlist items + + + def compose(self) -> ComposeResult: + """Compose the download request modal.""" + yield Vertical( + # Title + Static("📥 New Download Request", id="download_title"), + + # Main layout: Horizontal split into left and right columns + Horizontal( + # Left column: URL (top) and Tags (bottom) + Vertical( + Container( + TextArea( + id="paragraph_textarea", + language="", + show_line_numbers=True, + ), + id="url_container", + classes="grid_container", + ), + Container( + TextArea( + id="tags_textarea", + language="", + show_line_numbers=True, + ), + id="tags_container", + classes="grid_container" + ), + id="left_column" + ), + + # Right column: Files/Playlist + Vertical( + # Formats Select (for single files) + Container( + Select( + id="files_select", + options=[], # Populated dynamically + ), + id="files_container", + classes="grid_container" + ), + + # Playlist Tree + Input + Merge (for playlists) + Container( + Vertical( + Tree( + "Playlist", + 
id="playlist_tree", + ), + Horizontal( + Input( + placeholder="Track selection (e.g., 1-3, all, merge, 1 5 8)", + id="playlist_input", + ), + Checkbox( + label="Merge", + id="playlist_merge_checkbox", + value=False, + ), + id="playlist_input_row" + ), + ), + id="playlist_container", + classes="grid_container" + ), + + id="right_column" + ), + + id="main_layout" + ), + + # Footer: All on one row - Checkboxes left, Source middle, Buttons right + Horizontal( + # Left: Checkboxes + Container( + Checkbox(label="Download", id="download_checkbox"), + Checkbox(label="Screenshot", id="screenshot_checkbox"), + id="checkbox_row" + ), + + # Middle: Source selector + Select( + id="source_select", + options=self._build_source_options() + ), + + # Progress bar (shown during download) + ProgressBar(id="progress_bar"), + + # Right: Buttons + Horizontal( + Button("Cancel", id="cancel_btn", variant="default"), + Button("Submit", id="submit_btn", variant="primary"), + id="button_row" + ), + + id="footer_layout", + classes="modal_footer" + ), + + id="download_modal", + classes="modal_vertical" + ) + + def _build_source_options(self) -> list[tuple[str, str]]: + """Build source select options. + + Returns: + List of (label, value) tuples for Select widget + """ + source_icons = { + 'hydrus': '🗃️ Hydrus', + 'local': '📁 Local', + 'alldebrid': '☁️ AllDebrid', + 'debrid': '☁️ Debrid', + 'soulseek': '🎵 Soulseek', + 'libgen': '📚 LibGen', + } + + options = [] + for source in self.available_sources: + label = source_icons.get(source.lower(), source) + options.append((label, source)) + + return options + + def on_mount(self) -> None: + """Called when the modal is mounted.""" + # Get references to widgets + self.paragraph_textarea = self.query_one("#paragraph_textarea", TextArea) + self.tags_textarea = self.query_one("#tags_textarea", TextArea) + self.source_select = self.query_one("#source_select", Select) + self.files_select = self.query_one("#files_select", Select) + self.download_checkbox = self.query_one("#download_checkbox", Checkbox) + self.screenshot_checkbox = self.query_one("#screenshot_checkbox", Checkbox) + self.progress_bar = self.query_one("#progress_bar", ProgressBar) + self.playlist_tree = self.query_one("#playlist_tree", Tree) + self.playlist_input = self.query_one("#playlist_input", Input) + self.playlist_merge_checkbox = self.query_one("#playlist_merge_checkbox", Checkbox) + + # Set default actions + self.download_checkbox.value = True + self.screenshot_checkbox.value = False + self.playlist_merge_checkbox.value = False + + # Initialize PDF playlist URLs (set by _handle_pdf_playlist) + self.pdf_urls = [] + self.is_pdf_playlist = False + + # Hide playlist by default (show format select) + self._show_format_select() + + # Focus on tags textarea + self.tags_textarea.focus() + + logger.debug("Download modal mounted") + + def action_submit(self) -> None: + """Submit the download request by executing cmdlet pipeline in background.""" + # Validate and get values first (on main thread) + url = self.paragraph_textarea.text.strip() + tags_str = self.tags_textarea.text.strip() + source = self.source_select.value or 'local' + download_enabled = self.download_checkbox.value + merge_enabled = self.playlist_merge_checkbox.value if self.is_playlist else False + + if not url: + logger.warning("Download request missing URL") + self.app.notify( + "URL is required", + title="Missing Input", + severity="warning" + ) + return + + # Parse tags (one per line) + tags = [] + if tags_str: + tags = [tag.strip() for tag in 
tags_str.split('\n') if tag.strip()] + + # Get playlist selection if this is a playlist + playlist_selection = "" + if self.is_playlist and not self.is_pdf_playlist: + # Regular playlist (non-PDF) + playlist_selection = self.playlist_input.value.strip() + if not playlist_selection: + # No selection provided - default to downloading all tracks + playlist_selection = f"1-{len(self.playlist_items)}" + logger.info(f"No selection provided, defaulting to all tracks: {playlist_selection}") + elif self.is_playlist and self.is_pdf_playlist: + # PDF playlist - handle selection + playlist_selection = self.playlist_input.value.strip() + if not playlist_selection: + # No selection provided - default to all PDFs + playlist_selection = f"1-{len(self.playlist_items)}" + logger.info(f"PDF playlist: no selection provided, defaulting to all PDFs: {playlist_selection}") + merge_enabled = True # Always merge PDFs if multiple selected + + # Launch the background worker with PDF playlist info + self._submit_worker(url, tags, source, download_enabled, playlist_selection, merge_enabled, + is_pdf_playlist=self.is_pdf_playlist, pdf_urls=self.pdf_urls if self.is_pdf_playlist else []) + + @work(thread=True) + def _submit_worker(self, url: str, tags: list, source: str, download_enabled: bool, playlist_selection: str = "", merge_enabled: bool = False, is_pdf_playlist: bool = False, pdf_urls: Optional[list] = None) -> None: + """Background worker to execute the cmdlet pipeline. + + Args: + url: URL to download + tags: List of tags to apply + source: Source for metadata + download_enabled: Whether to download the file + playlist_selection: Playlist track selection (e.g., "1-3", "all", "merge") + merge_enabled: Whether to merge playlist files after download + is_pdf_playlist: Whether this is a PDF pseudo-playlist + pdf_urls: List of PDF URLs if is_pdf_playlist is True + """ + if pdf_urls is None: + pdf_urls = [] + + # Initialize worker to None so outer exception handler can check it + worker = None + try: + # Show progress bar on main thread + self.app.call_from_thread(self._show_progress) + + logger.info(f"Building cmdlet pipeline: URL={url}, tags={len(tags)}, source={source}, download={download_enabled}, playlist_selection={playlist_selection}") + + # Create a worker instance using the app's helper method + worker = None + try: + if hasattr(self.app, 'create_worker'): + worker = self.app.create_worker( + 'download', + title=f"Download: {url[:50]}", + description=f"Tags: {', '.join(tags) if tags else 'None'}" + ) + else: + # Fallback if helper not available + import uuid + from helper.worker_manager import Worker + worker_id = f"dl_{uuid.uuid4().hex[:8]}" + worker = Worker(worker_id, "download", f"Download: {url[:50]}", + f"Tags: {', '.join(tags) if tags else 'None'}", None) + except Exception as e: + logger.error(f"Error creating worker: {e}") + worker = None + + # Log initial step + if worker: + worker.log_step("Download initiated") + + # Handle PDF playlist specially + if is_pdf_playlist and pdf_urls: + logger.info(f"Processing PDF playlist with {len(pdf_urls)} PDFs") + self._handle_pdf_playlist_download(pdf_urls, tags, playlist_selection, merge_enabled) + self.app.call_from_thread(self._hide_progress) + self.app.call_from_thread(self.dismiss) + return + + # Build the cmdlet pipeline + # Start with URL as initial object + result_obj = self._create_url_result(url) + + # Import cmdlet system + if not get_cmdlet: + logger.error("cmdlets module not available") + self.app.call_from_thread( + self.app.notify, + "Cmdlets 
system unavailable", + title="Error", + severity="error" + ) + self.app.call_from_thread(self._hide_progress) + return + + # Stage 1: Download data if enabled + download_succeeded = False + download_stderr_text = "" # Store for merge stage + if download_enabled: + download_cmdlet = get_cmdlet("download-data") + if download_cmdlet: + logger.info("📥 Executing download-data stage") + logger.info(f"download_cmdlet object: {download_cmdlet}") + logger.info(f"result_obj: {result_obj}") + + # Log step to worker + if worker: + worker.log_step("Starting download-data stage...") + + # Build arguments for download-data + cmdlet_args = [] + if self.is_playlist: + # Always use yt-dlp's native --playlist-items for playlists + if playlist_selection: + # User provided specific selection + ytdlp_selection = self._convert_selection_to_ytdlp(playlist_selection) + logger.info(f"Playlist with user selection: {playlist_selection} → {ytdlp_selection}") + else: + # No selection provided, download all + ytdlp_selection = f"1-{len(self.playlist_items)}" + logger.info(f"Playlist mode: downloading all {len(self.playlist_items)} items") + cmdlet_args = ["--playlist-items", ytdlp_selection] + + logger.info(f"Built cmdlet_args: {cmdlet_args}") + logger.info(f"About to call download_cmdlet({result_obj}, {cmdlet_args}, {type(self.config).__name__})") + + if worker: + worker.append_stdout(f"📥 Downloading from: {url}\n") + if cmdlet_args: + worker.append_stdout(f" Args: {cmdlet_args}\n") + + try: + # Capture output from the cmdlet using temp files (more reliable than redirect) + import tempfile + import subprocess + + # Try normal redirect first + import io + from contextlib import redirect_stdout, redirect_stderr + + stdout_buf = io.StringIO() + stderr_buf = io.StringIO() + + # Always capture output + try: + with redirect_stdout(stdout_buf), redirect_stderr(stderr_buf): + logger.info(f"Calling download_cmdlet...") + returncode = download_cmdlet(result_obj, cmdlet_args, self.config) + logger.info(f"download_cmdlet returned: {returncode}") + except Exception as cmdlet_error: + # If cmdlet throws an exception, log it + logger.error(f"❌ download-cmdlet exception: {cmdlet_error}", exc_info=True) + if worker: + import traceback + worker.append_stdout(f"❌ download-cmdlet exception: {cmdlet_error}\n{traceback.format_exc()}\n") + returncode = 1 + + stdout_text = stdout_buf.getvalue() + stderr_text = stderr_buf.getvalue() + download_stderr_text = stderr_text # Save for merge stage + + # Log raw output + logger.info(f"download-cmdlet returncode: {returncode}") + logger.info(f"stdout ({len(stdout_text)} chars): {stdout_text[:200] if stdout_text else '(empty)'}") + logger.info(f"stderr ({len(stderr_text)} chars): {stderr_text[:200] if stderr_text else '(empty)'}") + + # Always append output to worker for debugging + if worker: + if stdout_text: + worker.append_stdout(f"[download-data stdout]\n{stdout_text}\n") + if stderr_text: + worker.append_stdout(f"[download-data stderr]\n{stderr_text}\n") + + # Log the output so it gets captured by WorkerLoggingHandler + if stdout_text: + logger.info(f"[download-data output]\n{stdout_text}") + if stderr_text: + logger.info(f"[download-data stderr]\n{stderr_text}") + if returncode != 0: + download_failed_msg = f"❌ download-data stage failed with code {returncode}\nstdout: {stdout_text}\nstderr: {stderr_text}" + logger.error(download_failed_msg) + if worker: + worker.append_stdout(f"\n{download_failed_msg}\n") + worker.finish("error", "Download stage failed - see logs above for details") + + # Log 
to stderr as well so it shows in terminal + log(f"Return code: {returncode}", file=sys.stderr) + log(f"stdout:\n{stdout_text}", file=sys.stderr) + log(f"stderr:\n{stderr_text}", file=sys.stderr) + + # Extract error reason from stderr/stdout for user notification + # Try to extract meaningful error from yt-dlp output + error_reason = "Unknown error" + + # Search for yt-dlp error patterns (case-insensitive) + error_text = (stderr_text + "\n" + stdout_text).lower() + + # Look for specific error keywords in priority order + if "http error 403" in error_text or "error 403" in error_text: + error_reason = "HTTP 403: Access forbidden (YouTube blocked download, may be georestricted or SABR issue)" + elif "http error 401" in error_text or "error 401" in error_text: + error_reason = "HTTP 401: Authentication required (may need login credentials)" + elif "http error 404" in error_text or "error 404" in error_text: + error_reason = "HTTP 404: URL not found (video/content may have been deleted)" + elif "http error" in error_text: + # Extract the actual HTTP error code + import re + http_match = re.search(r'HTTP Error (\d{3})', stderr_text + stdout_text, re.IGNORECASE) + if http_match: + error_reason = f"HTTP Error {http_match.group(1)}: Server returned an error" + else: + error_reason = "HTTP error from server" + elif "no such file or directory" in error_text or "file not found" in error_text: + error_reason = "File not found (yt-dlp may not be installed or not in PATH)" + elif "unable to download" in error_text: + error_reason = "Unable to download video (network issue or content unavailable)" + elif "connection" in error_text or "timeout" in error_text or "timed out" in error_text: + error_reason = "Network connection failed or timed out" + elif "permission" in error_text or "access denied" in error_text: + error_reason = "Permission denied (may need elevated privileges or login)" + elif "private video" in error_text or "private" in error_text: + error_reason = "Video is private (not accessible)" + elif "age restricted" in error_text or "age gate" in error_text: + error_reason = "Video is age-restricted and requires login" + elif "region restricted" in error_text or "georestrict" in error_text: + error_reason = "Video is region-restricted (not available in your country)" + elif "member-only" in error_text or "members only" in error_text: + error_reason = "Video is available to members only" + + # If still unknown, try to extract last line of stderr as it often contains the actual error + if error_reason == "Unknown error": + stderr_lines = [line.strip() for line in stderr_text.split('\n') if line.strip()] + if stderr_lines: + # Look for error-like lines (usually contain "error", "failed", "ERROR", etc) + for line in reversed(stderr_lines): + if any(keyword in line.lower() for keyword in ["error", "failed", "exception", "traceback", "warning"]): + error_reason = line[:150] # Limit to 150 chars + break + # If no error keyword found, use the last line + if error_reason == "Unknown error": + error_reason = stderr_lines[-1][:150] + + # Log the extracted error reason for debugging + logger.error(f"Extracted error reason: {error_reason}") + + self.app.call_from_thread( + self.app.notify, + f"Download failed: {error_reason}", + title="Download Error", + severity="error" + ) + # Finish worker with error status + try: + self.app.call_from_thread( + self.app.finish_worker, + worker_id, + "error", + f"Download failed: {error_reason}" + ) + except Exception: + pass + + # Also append detailed error info to worker 
stdout for visibility + if worker: + worker.append_stdout(f"\n❌ DOWNLOAD FAILED\n") + worker.append_stdout(f"Reason: {error_reason}\n") + if stderr_text and stderr_text.strip(): + worker.append_stdout(f"\nFull error output:\n{stderr_text}\n") + if stdout_text and stdout_text.strip(): + worker.append_stdout(f"\nStandard output:\n{stdout_text}\n") + # Don't try to tag if download failed + self.app.call_from_thread(self._hide_progress) + self.app.call_from_thread(self.dismiss) + return + else: + download_succeeded = True + # Always log output at INFO level so we can see what happened + logger.info(f"download-data stage completed successfully") + if stdout_text: + logger.info(f"download-data stdout:\n{stdout_text}") + if stderr_text: + logger.info(f"download-data stderr:\n{stderr_text}") + + # Log step to worker + if worker: + worker.log_step(f"Download completed: {len(stdout_text.split('Saved to')) - 1} items downloaded") + + # For playlists with merge enabled, scan the output directory for ALL downloaded files + # instead of trying to parse individual "Saved to" lines + downloaded_files = [] + if self.is_playlist and merge_enabled: + # Get output directory + from pathlib import Path + from config import resolve_output_dir + output_dir = resolve_output_dir(self.config) + logger.info(f"Merge enabled: scanning {output_dir} for downloaded files") + + # First, try to extract filenames from download output + # Look for patterns like "→ filename.mp3" from yt-dlp output + extracted_files = [] + for line in stdout_text.split('\n'): + if '→' in line: + # Extract filename from arrow marker + parts = line.split('→') + if len(parts) > 1: + filename = parts[1].strip() + if filename: + full_path = output_dir / filename + if full_path.exists(): + extracted_files.append(str(full_path)) + logger.debug(f"Found downloaded file from output: {filename}") + + if extracted_files: + downloaded_files = extracted_files + logger.info(f"Found {len(downloaded_files)} downloaded files from output markers") + else: + # Fallback: List all recent mp3/m4a files in output directory + if output_dir.exists(): + import time + current_time = time.time() + recent_files = [] + for f in list(output_dir.glob("*.mp3")) + list(output_dir.glob("*.m4a")) + list(output_dir.glob("*.mp4")): + # Files modified in last 30 minutes (extended window) + if current_time - f.stat().st_mtime < 1800: + recent_files.append((f, f.stat().st_mtime)) + + # Sort by modification time to preserve order + recent_files.sort(key=lambda x: x[1]) + downloaded_files = [str(f[0]) for f in recent_files] + logger.info(f"Found {len(downloaded_files)} recently modified files in directory (fallback)") + + if downloaded_files: + logger.info(f"Found {len(downloaded_files)} files to merge") + if downloaded_files: + logger.info(f"Files to merge: {downloaded_files[:3]}... 
(showing first 3)") + else: + # For non-merge or non-playlist, just look for "Saved to" pattern + combined_output = stdout_text + "\n" + stderr_text + for line in combined_output.split('\n'): + if 'Saved to' in line: + # Extract path after "Saved to " + saved_idx = line.find('Saved to') + if saved_idx != -1: + path = line[saved_idx + 8:].strip() + if path: + downloaded_files.append(path) + logger.debug(f"Found downloaded file: {path}") + + # For merge scenarios, DON'T set to first file yet - merge first, then tag + # For non-merge, set to first file for tagging + if downloaded_files: + if not (self.is_playlist and merge_enabled): + # Non-merge case: set to first file for tagging + first_file = downloaded_files[0] + result_obj.target = first_file + result_obj.path = first_file + logger.info(f"Set result target/path to first file: {first_file}") + else: + # Merge case: save all files, will set to merged file after merge + logger.info(f"Merge enabled - will merge {len(downloaded_files)} files before tagging") + download_stderr_text = f"DOWNLOADED_FILES:{','.join(downloaded_files)}\n" + download_stderr_text + + logger.info("download-data stage completed successfully") + except Exception as e: + logger.error(f"download-data execution error: {e}", exc_info=True) + self.app.call_from_thread( + self.app.notify, + f"Download error: {e}", + title="Download Error", + severity="error" + ) + # Finish worker with error status + try: + self.app.call_from_thread( + self.app.finish_worker, + worker_id, + "error", + f"Download error: {str(e)}" + ) + except Exception: + pass + self.app.call_from_thread(self._hide_progress) + self.app.call_from_thread(self.dismiss) + return + + # Stage 2: Merge files if enabled and this is a playlist (BEFORE tagging) + merged_file_path = None + if merge_enabled and download_succeeded and self.is_playlist: + merge_cmdlet = get_cmdlet("merge-file") + if merge_cmdlet: + from pathlib import Path + logger.info("Executing merge-file stage") + + # Log step to worker + if worker: + worker.log_step("Starting merge-file stage...") + + merge_args = ["-delete", "-format", "mka"] # Delete source files, use MKA for speed (stream copy) and chapters + + try: + # For merge, we pass a list of result objects + # The merge-file cmdlet expects objects with 'target' attribute + files_to_merge = [] + + # Check if we have the special marker with downloaded files + if download_stderr_text.startswith("DOWNLOADED_FILES:"): + # Extract file list from marker + files_line = download_stderr_text.split('\n')[0] + if files_line.startswith("DOWNLOADED_FILES:"): + files_str = files_line[len("DOWNLOADED_FILES:"):] + file_list = [f.strip() for f in files_str.split(',') if f.strip()] + logger.info(f"Found {len(file_list)} downloaded files from marker") + + # Create result objects with proper attributes + for filepath in file_list: + filepath_obj = Path(filepath) + file_result = type('FileResult', (), { + 'target': str(filepath), + 'path': str(filepath), + 'media_kind': 'audio', + 'hash_hex': None, + 'hash': None, + 'known_urls': [], + 'title': filepath_obj.stem + })() + files_to_merge.append(file_result) + + if files_to_merge: + logger.info(f"Merging {len(files_to_merge)} files: {[f.target for f in files_to_merge]}") + + # Call merge-file with list of results + import io + from contextlib import redirect_stdout, redirect_stderr + + stdout_buf = io.StringIO() + stderr_buf = io.StringIO() + + with redirect_stdout(stdout_buf), redirect_stderr(stderr_buf): + # Pass the list of file results to merge-file + 
merge_returncode = merge_cmdlet(files_to_merge, merge_args, self.config) + + merge_stdout = stdout_buf.getvalue() + merge_stderr = stderr_buf.getvalue() + + # Log the merge output so it gets captured by WorkerLoggingHandler + if merge_stdout: + logger.info(f"[merge-file output]\n{merge_stdout}") + if merge_stderr: + logger.info(f"[merge-file stderr]\n{merge_stderr}") + + if merge_returncode != 0: + logger.error(f"merge-file stage failed with code {merge_returncode}") + logger.error(f" stderr: {merge_stderr}") + self.app.call_from_thread( + self.app.notify, + f"Merge failed: {merge_stderr[:100] if merge_stderr else 'unknown error'}", + title="Merge Error", + severity="warning" + ) + # Don't fail entirely - files were downloaded + else: + logger.info("merge-file stage completed successfully") + if merge_stdout: + logger.info(f"merge-file stdout: {merge_stdout}") + if merge_stderr: + logger.info(f"merge-file stderr: {merge_stderr}") + + # Log step to worker + if worker: + worker.log_step("Merge completed successfully") + + # Extract merged file path from stderr + # The merge-file cmdlet outputs: "[merge-file] Merged N files into: /path/to/merged.mp3" + for line in merge_stderr.split('\n'): + if 'Merged' in line and 'into:' in line: + # Extract path after "into: " + into_idx = line.find('into:') + if into_idx != -1: + merged_file_path = line[into_idx + 5:].strip() + if merged_file_path: + logger.info(f"Detected merged file path: {merged_file_path}") + break + + # If not found in stderr, try stdout + if not merged_file_path: + for line in merge_stdout.split('\n'): + if 'merged' in line.lower() or line.endswith('.mp3') or line.endswith('.m4a'): + merged_file_path = line.strip() + if merged_file_path and not merged_file_path.startswith('['): + logger.info(f"Detected merged file path: {merged_file_path}") + break + + # If we found the merged file, update result_obj to point to it + if merged_file_path: + result_obj.target = merged_file_path + result_obj.path = merged_file_path + logger.info(f"Updated result object to point to merged file: {merged_file_path}") + else: + logger.warning(f"No files found to merge. 
download_stderr_text length: {len(download_stderr_text)}, content preview: {download_stderr_text[:100]}") + except Exception as e: + logger.error(f"merge-file execution error: {e}", exc_info=True) + self.app.call_from_thread( + self.app.notify, + f"Merge error: {e}", + title="Merge Error", + severity="warning" + ) + # Don't fail entirely - files were downloaded + else: + logger.info("merge-file cmdlet not found") + + # Stage 3: Add tags (now after merge, if merge happened) + # If merge succeeded, result_obj now points to merged file + if tags and (download_succeeded or not download_enabled): + add_tags_cmdlet = get_cmdlet("add-tag") + if add_tags_cmdlet: + logger.info(f"Executing add-tag stage with {len(tags)} tags") + logger.info(f" Tags: {tags}") + logger.info(f" Source: {source}") + logger.info(f" Result path: {result_obj.path}") + logger.info(f" Result hash: {result_obj.hash_hex}") + + # Log step to worker + if worker: + worker.log_step(f"Starting add-tag stage with {len(tags)} tags...") + + # Build add-tag arguments: tag1 tag2 tag3 --source + tag_args = [str(t) for t in tags] + ["--source", str(source)] + logger.info(f" Tag args: {tag_args}") + logger.info(f" Result object attributes: target={getattr(result_obj, 'target', 'MISSING')}, path={getattr(result_obj, 'path', 'MISSING')}, hash_hex={getattr(result_obj, 'hash_hex', 'MISSING')}") + + try: + # Capture output from the cmdlet + import io + from contextlib import redirect_stdout, redirect_stderr + + stdout_buf = io.StringIO() + stderr_buf = io.StringIO() + + with redirect_stdout(stdout_buf), redirect_stderr(stderr_buf): + returncode = add_tags_cmdlet(result_obj, tag_args, self.config) + + stdout_text = stdout_buf.getvalue() + stderr_text = stderr_buf.getvalue() + + # Log the tag output so it gets captured by WorkerLoggingHandler + if stdout_text: + logger.info(f"[add-tag output]\n{stdout_text}") + if stderr_text: + logger.info(f"[add-tag stderr]\n{stderr_text}") + + if returncode != 0: + logger.error(f"add-tag stage failed with code {returncode}") + logger.error(f" stdout: {stdout_text}") + logger.error(f" stderr: {stderr_text}") + self.app.call_from_thread( + self.app.notify, + f"Failed to add tags: {stderr_text[:100] if stderr_text else stdout_text[:100] if stdout_text else 'unknown error'}", + title="Error", + severity="error" + ) + # Don't dismiss on tag failure - let user retry or cancel, but hide progress + self.app.call_from_thread(self._hide_progress) + return + else: + if stdout_text: + logger.debug(f"add-tag stdout: {stdout_text}") + if stderr_text: + logger.debug(f"add-tag stderr: {stderr_text}") + logger.info("add-tag stage completed successfully") + + # Log step to worker + if worker: + worker.log_step(f"Successfully added {len(tags)} tags") + except Exception as e: + logger.error(f"add-tag execution error: {e}", exc_info=True) + self.app.call_from_thread( + self.app.notify, + f"Error adding tags: {e}", + title="Error", + severity="error" + ) + self.app.call_from_thread(self._hide_progress) + return + else: + logger.error("add-tag cmdlet not found") + else: + if tags and download_enabled and not download_succeeded: + skip_msg = "⚠️ Skipping add-tag stage because download failed" + logger.info(skip_msg) + if worker: + worker.append_stdout(f"\n{skip_msg}\n") + worker.finish("error", "Download stage failed - see logs above for details") + elif tags: + logger.info("No tags to add (tags list is empty)") + + + # Success notification + self.app.call_from_thread( + self.app.notify, + f"Download request processed: {url}", + 
title="Success", + severity="information", + timeout=2 + ) + + # Finish worker with success status + if worker: + worker.finish("completed", "Download completed successfully") + + logger.info("Download request processing complete") + + # Hide progress and dismiss the modal + self.app.call_from_thread(self._hide_progress) + self.app.call_from_thread(self.dismiss) + + except Exception as e: + logger.error(f"Error in download submit: {e}", exc_info=True) + # Ensure worker is marked as finished even on exception + if worker: + try: + worker.finish("error", f"Download failed: {str(e)}") + except Exception: + pass + self.app.call_from_thread(self._hide_progress) + self.app.call_from_thread( + self.app.notify, + f"Error: {e}", + title="Error", + severity="error" + ) + + def _create_url_result(self, url: str): + """Create a result object from a URL for cmdlet processing.""" + class URLDownloadResult: + def __init__(self, u): + self.target = u + self.url = u + self.path: str | None = None + self.hash_hex: str | None = None + self.media_kind = "url" + + return URLDownloadResult(url) + + def action_cancel(self) -> None: + """Cancel the download request.""" + self.dismiss() + + def on_key(self, event) -> None: + """Handle key presses to implement context-sensitive Ctrl+T.""" + if event.key == "ctrl+t": + # Check which widget has focus + focused_widget = self.app.focused + if focused_widget and focused_widget.id == "paragraph_textarea": + # URL textarea: scrape fresh metadata, wipe tags and source + self._action_scrape_url_metadata() + event.prevent_default() + elif focused_widget and focused_widget.id == "tags_textarea": + # Tags textarea: scrape special fields and adjectives + self._action_scrape_tags() + event.prevent_default() + + def _action_scrape_url_metadata(self) -> None: + """Scrape metadata from URL(s) in URL textarea - wipes tags and source. + + This is triggered by Ctrl+T when URL textarea is focused. + Supports single URL or multiple URLs (newline/comma-separated). + For multiple PDF URLs, creates pseudo-playlist for merge workflow. + """ + try: + text = self.paragraph_textarea.text.strip() + if not text: + logger.warning("No URL to scrape metadata from") + return + + # Parse multiple URLs (newline or comma-separated) + urls = [] + for line in text.split('\n'): + line = line.strip() + if line: + # Handle comma-separated URLs within a line + for url in line.split(','): + url = url.strip() + if url: + urls.append(url) + + # Check if multiple URLs provided + if len(urls) > 1: + logger.info(f"Detected {len(urls)} URLs - checking for PDF pseudo-playlist") + # Check if all URLs appear to be PDFs + all_pdfs = all(url.endswith('.pdf') or 'pdf' in url.lower() for url in urls) + if all_pdfs: + logger.info(f"All URLs are PDFs - creating pseudo-playlist") + self._handle_pdf_playlist(urls) + return + + # Single URL - proceed with normal metadata scraping + url = urls[0] if urls else text.strip() + logger.info(f"Scraping fresh metadata from: {url}") + + # Check if tags are already provided in textarea + existing_tags = self.tags_textarea.text.strip() + wipe_tags = not existing_tags # Only wipe if no tags exist + + # Run in background to prevent UI freezing + self._scrape_metadata_worker(url, wipe_tags_and_source=wipe_tags, skip_tag_scraping=not wipe_tags) + + except Exception as e: + logger.error(f"Error in _action_scrape_url_metadata: {e}", exc_info=True) + + def _action_scrape_tags(self) -> None: + """Process tags from tags textarea, expanding tag lists like {philosophy}. 
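+
+        Example (illustrative): a line containing "{psychology}" is expanded
+        into the tags defined for that list in helper/adjective.json.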
+ + This is triggered by Ctrl+T when tags textarea is focused. + Processes tag list references from adjective.json (e.g., {psychology}) + and expands them to the full list of tags. + """ + try: + current_tags = self.tags_textarea.text.strip() + if not current_tags: + logger.warning("No tags to process") + return + + if not expand_tag_lists or not process_tags_from_string: + logger.warning("tag_helpers not available") + self.app.notify( + "Tag processing unavailable", + title="Error", + severity="error", + timeout=2 + ) + return + + logger.info(f"Processing tags: {current_tags[:50]}...") + + # Parse tags from current text + tags_set = process_tags_from_string(current_tags, expand_lists=False) + if not tags_set: + logger.warning("No tags parsed from text") + return + + # Expand tag list references like {psychology} + expanded_tags = expand_tag_lists(tags_set) + + if len(expanded_tags) > len(tags_set): + # Tags were expanded + tags_count_added = len(expanded_tags) - len(tags_set) + logger.info(f"Expanded tags: added {tags_count_added} new tags") + self.app.notify( + f"Expanded: {tags_count_added} new tags added from tag lists", + title="Tags Expanded", + severity="information", + timeout=2 + ) + else: + logger.info("No tag list expansions found") + self.app.notify( + "No {list} references found to expand", + title="Info", + severity="information", + timeout=2 + ) + + # Update textarea with expanded tags (one per line) + self.tags_textarea.text = '\n'.join(sorted(expanded_tags)) + logger.info(f"Updated tags textarea with {len(expanded_tags)} tags") + + except Exception as e: + logger.error(f"Error in _action_scrape_tags: {e}", exc_info=True) + self.app.notify( + f"Error processing tags: {e}", + title="Error", + severity="error" + ) + + + def _handle_pdf_playlist(self, pdf_urls: list) -> None: + """Handle multiple PDF URLs as a pseudo-playlist. + + Creates a playlist-like structure with PDF metadata for merge workflow. + Extracts title from URL or uses default naming. 
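+
+        Example (illustrative, hypothetical URL):
+            "https://example.org/papers/intro-to-logic.pdf" → shown as "intro to logic"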
+ + Args: + pdf_urls: List of PDF URLs to process + """ + try: + logger.info(f"Creating PDF pseudo-playlist with {len(pdf_urls)} items") + + # Create playlist items from PDF URLs + playlist_items = [] + for idx, url in enumerate(pdf_urls, 1): + # Extract filename from URL for display + try: + # Get filename from URL path + from urllib.parse import urlparse + parsed = urlparse(url) + filename = parsed.path.split('/')[-1] + if not filename or filename.endswith('.pdf'): + filename = filename or f'pdf_{idx}.pdf' + # Remove .pdf extension for display + title = filename.replace('.pdf', '').replace('_', ' ').replace('-', ' ') + except Exception as e: + logger.debug(f"Could not extract filename: {e}") + title = f'PDF {idx}' + + item = { + 'id': str(idx - 1), # 0-based index + 'title': title, + 'duration': '', # PDFs don't have duration, leave empty + 'url': url # Store the URL for later download + } + playlist_items.append(item) + + # Build minimal metadata structure for UI population + metadata = { + 'title': f'{len(pdf_urls)} PDF Documents', + 'tags': [], + 'formats': [('pdf', 'pdf')], # Default format is PDF + 'playlist_items': playlist_items, + 'is_pdf_playlist': True # Mark as PDF pseudo-playlist + } + + # Store URLs for later use during merge + self.pdf_urls = pdf_urls + self.is_pdf_playlist = True + + # Populate the modal with metadata + logger.info(f"Populating modal with {len(playlist_items)} PDF items") + self._populate_from_metadata(metadata, wipe_tags_and_source=True) + + self.app.notify( + f"Loaded {len(pdf_urls)} PDFs as playlist", + title="PDF Playlist", + severity="information", + timeout=3 + ) + + except Exception as e: + logger.error(f"Error handling PDF playlist: {e}", exc_info=True) + self.app.notify( + f"Error loading PDF playlist: {e}", + title="Error", + severity="error", + timeout=3 + ) + + + def _handle_pdf_playlist_download(self, pdf_urls: list, tags: list, selection: str, merge_enabled: bool) -> None: + """Download and merge PDF playlist. 
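+
+        Selected URLs are fetched to a temporary folder and, when merge is
+        enabled, combined into a single PDF with PyPDF2 before tagging.
+        Example (illustrative): selection "1-3" takes the first three URLs,
+        "1,3" takes the first and third.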
+ + Args: + pdf_urls: List of PDF URLs to download + tags: Tags to apply to the merged PDF + selection: Selection string like "1-3" or "1,3,5" + merge_enabled: Whether to merge the PDFs + """ + # Check if PyPDF2 is available for merge (needed at function start) + try: + from PyPDF2 import PdfWriter, PdfReader + HAS_PYPDF2 = True + except ImportError: + HAS_PYPDF2 = False + PdfWriter = None + PdfReader = None + + try: + from pathlib import Path + import requests + from config import resolve_output_dir + + # Create temporary list of playlist items for selection parsing + # We need this because _parse_playlist_selection uses self.playlist_items + temp_items = [] + for url in pdf_urls: + temp_items.append({'title': url}) + self.playlist_items = temp_items + + # Parse selection to get which PDFs to download + selected_indices = self._parse_playlist_selection(selection) + if not selected_indices: + # No valid selection, use all + selected_indices = list(range(len(pdf_urls))) + + selected_urls = [pdf_urls[i] for i in selected_indices] + + logger.info(f"Downloading {len(selected_urls)} selected PDFs for merge") + + # Download PDFs to temporary directory + temp_dir = Path.home() / ".downlow_temp_pdfs" + temp_dir.mkdir(exist_ok=True) + + downloaded_files = [] + for idx, url in enumerate(selected_urls, 1): + try: + logger.info(f"Downloading PDF {idx}/{len(selected_urls)}: {url}") + + response = requests.get(url, timeout=30) + response.raise_for_status() + + # Generate filename from URL + from urllib.parse import urlparse + parsed = urlparse(url) + filename = parsed.path.split('/')[-1] + if not filename.endswith('.pdf'): + filename = f'pdf_{idx}.pdf' + + pdf_path = temp_dir / filename + with open(pdf_path, 'wb') as f: + f.write(response.content) + + downloaded_files.append(pdf_path) + logger.info(f"Downloaded to: {pdf_path}") + + except Exception as e: + logger.error(f"Failed to download PDF {idx}: {e}") + self.app.call_from_thread( + self.app.notify, + f"Failed to download PDF {idx}: {e}", + title="Download Error", + severity="error" + ) + return + + # Merge PDFs if requested + if merge_enabled and len(downloaded_files) > 1: + if not HAS_PYPDF2: + logger.error("PyPDF2 not available for PDF merge") + self.app.call_from_thread( + self.app.notify, + "PyPDF2 required for PDF merge. 
Install with: pip install PyPDF2", + title="Missing Dependency", + severity="error" + ) + return + + logger.info(f"Merging {len(downloaded_files)} PDFs") + + try: + writer = PdfWriter() + for pdf_file in downloaded_files: + reader = PdfReader(pdf_file) + for page in reader.pages: + writer.add_page(page) + logger.info(f"Added {len(reader.pages)} pages from {pdf_file.name}") + + # Save merged PDF to output directory + output_dir = Path(resolve_output_dir(self.config)) + output_dir.mkdir(parents=True, exist_ok=True) + + output_path = output_dir / "merged_pdfs.pdf" + # Make filename unique if it exists + counter = 1 + while output_path.exists(): + output_path = output_dir / f"merged_pdfs_{counter}.pdf" + counter += 1 + + with open(output_path, 'wb') as f: + writer.write(f) + + logger.info(f"Merged PDF saved to: {output_path}") + + # Tag the file if tags provided + if tags and get_cmdlet: + tag_cmdlet = get_cmdlet("add-tags") + if tag_cmdlet: + logger.info(f"Tagging merged PDF with {len(tags)} tags") + + # Create a result object for the PDF + class PDFResult: + def __init__(self, p): + self.path = str(p) + self.target = str(p) + self.hash_hex = None + + result_obj = PDFResult(output_path) + + import io + from contextlib import redirect_stdout, redirect_stderr + + stdout_buf = io.StringIO() + stderr_buf = io.StringIO() + + with redirect_stdout(stdout_buf), redirect_stderr(stderr_buf): + tag_returncode = tag_cmdlet(result_obj, tags, self.config) + + if tag_returncode != 0: + logger.warning(f"Tag stage returned code {tag_returncode}") + + self.app.call_from_thread( + self.app.notify, + f"Successfully merged {len(downloaded_files)} PDFs", + title="Merge Complete", + severity="information", + timeout=3 + ) + + except Exception as e: + logger.error(f"PDF merge error: {e}", exc_info=True) + self.app.call_from_thread( + self.app.notify, + f"PDF merge failed: {e}", + title="Merge Error", + severity="error" + ) + + else: + # Save individual PDFs to output + output_dir = Path(resolve_output_dir(self.config)) + output_dir.mkdir(parents=True, exist_ok=True) + + for pdf_file in downloaded_files: + output_path = output_dir / pdf_file.name + # Make filename unique if it exists + counter = 1 + base_name = pdf_file.stem + while output_path.exists(): + output_path = output_dir / f"{base_name}_{counter}.pdf" + counter += 1 + + import shutil + shutil.copy2(pdf_file, output_path) + logger.info(f"Saved PDF to: {output_path}") + + self.app.call_from_thread( + self.app.notify, + f"Downloaded {len(downloaded_files)} PDFs", + title="Download Complete", + severity="information", + timeout=3 + ) + + except Exception as e: + logger.error(f"Error in PDF playlist download: {e}", exc_info=True) + self.app.call_from_thread( + self.app.notify, + f"Error processing PDF playlist: {e}", + title="Error", + severity="error" + ) + + + @work(thread=True) + def _scrape_metadata_worker(self, url: str, wipe_tags_and_source: bool = False, skip_tag_scraping: bool = False) -> None: + """Background worker to scrape metadata using get-tag cmdlet. 
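+
+        Runs in a background thread (via @work) so the UI stays responsive.
+        Example (illustrative):
+            self._scrape_metadata_worker(url, wipe_tags_and_source=True)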
+ + Args: + url: URL to scrape metadata from + wipe_tags_and_source: If True, clear tags and source before populating + skip_tag_scraping: If True, don't scrape tags (only title/formats) + """ + try: + logger.info(f"Metadata worker started for: {url}") + + # Call get-tag cmdlet to scrape URL + if not get_cmdlet: + logger.error("cmdlets module not available") + self.app.call_from_thread( + self.app.notify, + "cmdlets module not available", + title="Error", + severity="error" + ) + return + + # Get the get-tag cmdlet + get_tag_cmdlet = get_cmdlet("get-tag") + if not get_tag_cmdlet: + logger.error("get-tag cmdlet not found") + self.app.call_from_thread( + self.app.notify, + "get-tag cmdlet not found", + title="Error", + severity="error" + ) + return + + # Create a simple result object for the cmdlet + class URLResult: + def __init__(self, u): + self.target = u + self.hash_hex = None + self.path = None + + result_obj = URLResult(url) + + # Call the cmdlet with -scrape flag (unless skipping tag scraping) + import io + from contextlib import redirect_stdout, redirect_stderr + + output_buffer = io.StringIO() + error_buffer = io.StringIO() + + # Only scrape if not skipping tag scraping + args = [] if skip_tag_scraping else ["-scrape", url] + + with redirect_stdout(output_buffer), redirect_stderr(error_buffer): + returncode = get_tag_cmdlet(result_obj, args, {}) + + if returncode != 0: + error_msg = error_buffer.getvalue() + logger.error(f"get-tag cmdlet failed: {error_msg}") + try: + self.app.call_from_thread( + self.app.notify, + f"Failed to scrape metadata: {error_msg}", + title="Error", + severity="error" + ) + except Exception as e: + logger.debug(f"Could not notify user: {e}") + return + + # Parse the JSON output + output = output_buffer.getvalue().strip() + if not output: + logger.warning("get-tag returned no output") + try: + self.app.call_from_thread( + self.app.notify, + "No metadata returned from get-tag", + title="Error", + severity="error" + ) + except Exception as e: + logger.debug(f"Could not notify user: {e}") + return + + # Extract the JSON line (skip debug messages that start with [get-tag]) + json_line = None + for line in output.split('\n'): + if line.strip().startswith('{'): + json_line = line.strip() + break + + if not json_line: + logger.error(f"No JSON found in get-tag output") + logger.debug(f"Raw output: {output}") + try: + self.app.call_from_thread( + self.app.notify, + "No metadata found in response", + title="Error", + severity="error" + ) + except Exception as e: + logger.debug(f"Could not notify user: {e}") + return + + try: + metadata_result = json.loads(json_line) + except json.JSONDecodeError as e: + logger.error(f"Failed to parse JSON: {e}") + logger.debug(f"JSON line: {json_line}") + try: + self.app.call_from_thread( + self.app.notify, + f"Failed to parse metadata: {e}", + title="Error", + severity="error" + ) + except Exception as ne: + logger.debug(f"Could not notify user: {ne}") + return + + # Build metadata dict in the format expected by _populate_from_metadata + # If skipping tag scraping, preserve existing tags + existing_tags = self.tags_textarea.text.strip().split('\n') if skip_tag_scraping else [] + existing_tags = [tag.strip() for tag in existing_tags if tag.strip()] + + # Extract playlist items if present + playlist_items = metadata_result.get('playlist_items', []) + + metadata = { + 'title': metadata_result.get('title', 'Unknown'), + 'url': url, + 'tags': metadata_result.get('tags', []) or existing_tags, # Use existing if new are empty + 'formats': 
metadata_result.get('formats', []), + 'playlist_items': playlist_items, + } + + logger.info(f"Retrieved metadata: title={metadata['title']}, tags={len(metadata['tags'])}, formats={len(metadata['formats'])}, playlist_items={len(playlist_items)}") + + # Update UI on main thread + self.app.call_from_thread( + self._populate_from_metadata, + metadata, + wipe_tags_and_source + ) + + except Exception as e: + logger.error(f"Metadata worker error: {e}", exc_info=True) + try: + self.app.call_from_thread( + self.app.notify, + f"Failed to scrape metadata: {e}", + title="Error", + severity="error" + ) + except Exception as ne: + logger.debug(f"Could not notify user of error: {ne}") + + def _convert_selection_to_ytdlp(self, selection_str: str) -> str: + """Convert playlist selection string to yt-dlp --playlist-items format. + + Args: + selection_str: Selection string like "1-3", "all", "merge", "1,3,5-8" + Can also include multiple keywords separated by spaces + + Returns: + yt-dlp format string like "1-3,5,8" or "1-10" for all + """ + if not selection_str: + return "" + + selection_str = selection_str.strip().upper() + max_idx = len(self.playlist_items) + + # Handle keywords (all, merge, a, m) - can be space or comma separated + # "ALL MERGE", "A M", "ALL,MERGE" etc all mean download all items + if any(kw in selection_str.replace(',', ' ').split() for kw in {'A', 'ALL', 'M', 'MERGE'}): + # User said to get all items (merge is same as all in this context) + return f"1-{max_idx}" + + # Parse ranges like "1,3,5-8" and convert to yt-dlp format + # The selection is already in 1-based format from user, keep it that way + # yt-dlp expects 1-based indices + try: + parts = [] + for part in selection_str.split(','): + part = part.strip() + if part: # Skip empty parts + parts.append(part) + + return ','.join(parts) + except (ValueError, AttributeError): + logger.error(f"Failed to convert playlist selection: {selection_str}") + return "" + + def _parse_playlist_selection(self, selection_str: str) -> list: + """Parse playlist selection string into list of track indices (0-based). + + Args: + selection_str: Selection string like "1-3", "all", "merge", "1,3,5-8" + + Returns: + List of 0-based indices, or empty list if invalid + """ + if not selection_str: + return [] + + selection_str = selection_str.strip().upper() + max_idx = len(self.playlist_items) + + # Handle keywords (all, merge, a, m) - can be space or comma separated + # "ALL MERGE", "A M", "ALL,MERGE" etc all mean download all items + if any(kw in selection_str.replace(',', ' ').split() for kw in {'A', 'ALL', 'M', 'MERGE'}): + # User said to get all items + return list(range(max_idx)) + + # Parse ranges like "1,3,5-8" + indices = set() + try: + for part in selection_str.split(','): + part = part.strip() + if '-' in part: + # Range like "5-8" + start_str, end_str = part.split('-', 1) + start = int(start_str.strip()) - 1 # Convert to 0-based + end = int(end_str.strip()) # end is inclusive in user terms + for i in range(start, end): + if 0 <= i < max_idx: + indices.add(i) + else: + # Single number + idx = int(part.strip()) - 1 # Convert to 0-based + if 0 <= idx < max_idx: + indices.add(idx) + + return sorted(list(indices)) + except (ValueError, AttributeError): + logger.error(f"Failed to parse playlist selection: {selection_str}") + return [] + + def _execute_download_pipeline(self, result_obj: Any, tags: list, source: str, download_enabled: bool, worker=None) -> None: + """Execute the download pipeline for a single item. 
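+
+        Stage 1 runs the download-data cmdlet when download_enabled is set;
+        stage 2 applies the given tags with the add-tags cmdlet when tags are
+        provided and the result carries a file path. Output from both stages is
+        mirrored to the optional worker log.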
+ + Args: + result_obj: URL result object + tags: List of tags to apply + source: Source for metadata + download_enabled: Whether to download the file + worker: Optional Worker instance for logging + """ + # Import cmdlet system + if not get_cmdlet: + error_msg = "cmdlets module not available" + logger.error(error_msg) + if worker: + worker.append_stdout(f"❌ ERROR: {error_msg}\n") + self.app.call_from_thread( + self.app.notify, + "Cmdlets system unavailable", + title="Error", + severity="error" + ) + return + + # Stage 1: Download data if enabled + if download_enabled: + download_cmdlet = get_cmdlet("download-data") + if download_cmdlet: + stage_msg = "📥 Executing download-data stage" + logger.info(stage_msg) + if worker: + worker.append_stdout(f"{stage_msg}\n") + try: + import io + from contextlib import redirect_stdout, redirect_stderr + + stdout_buf = io.StringIO() + stderr_buf = io.StringIO() + + with redirect_stdout(stdout_buf), redirect_stderr(stderr_buf): + returncode = download_cmdlet(result_obj, [], self.config) + + stdout_text = stdout_buf.getvalue() + stderr_text = stderr_buf.getvalue() + + if stdout_text: + logger.debug(f"download-data stdout: {stdout_text}") + if worker: + worker.append_stdout(stdout_text) + + if stderr_text: + logger.debug(f"download-data stderr: {stderr_text}") + if worker: + worker.append_stdout(f"⚠️ stderr: {stderr_text}\n") + + if returncode != 0: + error_msg = f"❌ download-data stage failed with code {returncode}\nstderr: {stderr_text}" + logger.error(error_msg) + if worker: + worker.append_stdout(f"{error_msg}\n") + self.app.call_from_thread( + self.app.notify, + f"Download failed: {stderr_text[:100]}", + title="Download Error", + severity="error" + ) + return + else: + success_msg = "✅ download-data completed successfully" + logger.info(success_msg) + if worker: + worker.append_stdout(f"{success_msg}\n") + except Exception as e: + error_msg = f"❌ download-data error: {e}" + logger.error(error_msg, exc_info=True) + if worker: + worker.append_stdout(f"{error_msg}\nTraceback:\n{__import__('traceback').format_exc()}\n") + self.app.call_from_thread( + self.app.notify, + str(e)[:100], + title="Download Error", + severity="error" + ) + return + + # Stage 2: Tag the file if tags provided + if tags: + tag_cmdlet = get_cmdlet("add-tags") + if tag_cmdlet and result_obj.get('path'): + stage_msg = f"🏷️ Tagging with {len(tags)} tags" + logger.info(stage_msg) + if worker: + worker.append_stdout(f"{stage_msg}\n") + try: + tag_args = tags + import io + from contextlib import redirect_stdout, redirect_stderr + + stdout_buf = io.StringIO() + stderr_buf = io.StringIO() + + with redirect_stdout(stdout_buf), redirect_stderr(stderr_buf): + tag_returncode = tag_cmdlet(result_obj, tag_args, {}) + + stdout_text = stdout_buf.getvalue() + stderr_text = stderr_buf.getvalue() + + if stdout_text: + logger.debug(f"tag stdout: {stdout_text}") + if worker: + worker.append_stdout(stdout_text) + + if tag_returncode != 0: + warning_msg = f"⚠️ Tag stage returned code {tag_returncode}: {stderr_text}" + logger.warning(warning_msg) + if worker: + worker.append_stdout(f"{warning_msg}\n") + else: + if worker: + worker.append_stdout("✅ Tags applied successfully\n") + except Exception as e: + error_msg = f"❌ Tagging error: {e}" + logger.error(error_msg, exc_info=True) + if worker: + worker.append_stdout(f"{error_msg}\n") + else: + if not result_obj.get('path'): + warning_msg = "⚠️ No file path in result - skipping tagging" + logger.warning(warning_msg) + if worker: + 
worker.append_stdout(f"{warning_msg}\n") + else: + if worker: + worker.append_stdout("✅ Download complete (no tags to apply)\n") + + def _show_format_select(self) -> None: + """Show format select (always visible for single files).""" + try: + files_container = self.query_one("#files_container", Container) + playlist_container = self.query_one("#playlist_container", Container) + # Format select always visible, playlist hidden by default + files_container.styles.height = "1fr" + playlist_container.styles.height = "0" + self.is_playlist = False + except Exception as e: + logger.error(f"Error showing format select: {e}") + + def _show_playlist_controls(self) -> None: + """Show playlist tree and input alongside format select (for playlists).""" + try: + playlist_container = self.query_one("#playlist_container", Container) + # Just make playlist visible - format select remains visible above it + playlist_container.styles.height = "auto" + self.is_playlist = True + except Exception as e: + logger.error(f"Error showing playlist controls: {e}") + + def _populate_playlist_tree(self, items: list) -> None: + """Populate the playlist tree with track items. + + Args: + items: List of track info dicts with 'id', 'title', 'duration', etc. + """ + try: + self.playlist_tree.clear() + self.playlist_items = items + + for idx, item in enumerate(items, 1): + title = item.get('title', f'Track {idx}') + duration = item.get('duration', '') + # Format: "1. Song Title (3:45)" + label = f"{idx}. {title}" + if duration: + label += f" ({duration})" + + self.playlist_tree.root.add_leaf(label) + + logger.info(f"Populated playlist tree with {len(items)} items") + except Exception as e: + logger.error(f"Error populating playlist tree: {e}") + + def _populate_from_metadata(self, metadata: dict, wipe_tags_and_source: bool = False) -> None: + """Populate modal fields from extracted metadata. + + Args: + metadata: Dictionary with title, tags, formats + wipe_tags_and_source: If True, clear tags and source before populating + """ + try: + # Wipe tags and source if requested (fresh scrape from URL) + if wipe_tags_and_source: + self.tags_textarea.text = "" + # Reset source to first available option + try: + # Get all options and select the first one + source_options = self._build_source_options() + if source_options: + self.source_select.value = source_options[0][1] + except Exception as e: + logger.warning(f"Could not reset source select: {e}") + + # Populate tags - using extracted tags (one per line format) + tags = metadata.get('tags', []) + existing_tags = self.tags_textarea.text.strip() + title = metadata.get('title', 'Unknown') + + # Extract meaningful tags: + # 1. Freeform tags (tag:value) + # 2. Creator/artist metadata (creator:, artist:, channel:) + # 3. Other meaningful namespaces (genre:, album:, track:, etc.) 
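+            #
+            # Illustrative example: scraped tags such as
+            #   ["creator:someband", "genre:jazz", "hash:abc123", "live recording"]
+            # end up in the textarea as
+            #   ["title:<scraped title>", "creator:someband", "genre:jazz", "live recording"]
+            # because hash:/url:/known_url:/relationship: namespaces are dropped
+            # below and the editable title: tag is prepended first.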
+ meaningful_tags = [] + + # Add title tag first (so user can edit it) + if title and title != 'Unknown': + meaningful_tags.append(f"title:{title}") + + # Namespaces to exclude (metadata-only, not user-facing) + excluded_namespaces = { + 'hash', # Hash values (internal) + 'known_url', # URLs (internal) + 'relationship', # Internal relationships + 'url', # URLs (internal) + } + + # Add all other tags + for tag in tags: + if ':' in tag: + namespace, value = tag.split(':', 1) + # Skip internal/metadata namespaces + if namespace.lower() not in excluded_namespaces: + meaningful_tags.append(tag) + else: + # Tags without namespace are freeform - always include + meaningful_tags.append(tag) + + # Build tags string (one per line) + tags_str = '\n'.join(meaningful_tags) + + if existing_tags: + self.tags_textarea.text = existing_tags + '\n' + tags_str + else: + self.tags_textarea.text = tags_str + + # Check if this is a playlist + playlist_items = metadata.get('playlist_items', []) + formats = metadata.get('formats', []) + + # Always show format select (single file or default for playlist) + self._show_format_select() + if formats: + # formats may be lists (from JSON) or tuples, convert to tuples + format_tuples = [] + for fmt in formats: + if isinstance(fmt, (list, tuple)) and len(fmt) == 2: + format_tuples.append(tuple(fmt)) + + if format_tuples: + self.files_select.set_options(format_tuples) + # Select the first format by default + self.files_select.value = format_tuples[0][1] + self.selected_files = {format_tuples[0][0]} + + # If playlist, also show the tree for track selection + if playlist_items and len(playlist_items) > 0: + logger.info(f"Detected playlist with {len(playlist_items)} items") + self._populate_playlist_tree(playlist_items) + # Show playlist tree alongside format select (height: auto to show) + playlist_container = self.query_one("#playlist_container", Container) + playlist_container.styles.height = "auto" + # SET FLAG SO action_submit() KNOWS THIS IS A PLAYLIST + self.is_playlist = True + + logger.info(f"Populated modal from metadata: {len(meaningful_tags)} tags, {len(playlist_items)} playlist items, {len(formats)} formats") + + # Notify user + self.app.notify( + f"Scraped metadata: {title}", + title="Metadata Loaded", + severity="information", + timeout=3 + ) + + except Exception as e: + logger.error(f"Error populating metadata: {e}", exc_info=True) + self.app.notify( + f"Failed to populate metadata: {e}", + title="Error", + severity="error" + ) + + def on_select_changed(self, event: Select.Changed) -> None: + """Handle Select widget changes (format selection).""" + if event.select.id == "files_select": + # Update selected_files to track the chosen format value + if event.value: + self.selected_files = {str(event.value)} + logger.debug(f"Selected format: {event.value}") + + def on_button_pressed(self, event) -> None: + """Handle button clicks.""" + if event.button.id == "submit_btn": + self.action_submit() + elif event.button.id == "cancel_btn": + self.action_cancel() + + def _show_progress(self) -> None: + """Show the progress bar and hide buttons.""" + try: + # Show progress bar by setting height + self.progress_bar.styles.height = 1 + self.progress_bar.update(total=100) + # Hide buttons during download + button_row = self.query_one("#button_row", Horizontal) + button_row.display = False + except Exception as e: + logger.error(f"Error showing progress bar: {e}") + + def _hide_progress(self) -> None: + """Hide the progress bar and show buttons again.""" + try: + # Hide progress 
bar by setting height to 0 + self.progress_bar.styles.height = 0 + # Show buttons again + button_row = self.query_one("#button_row", Horizontal) + button_row.display = True + except Exception as e: + logger.error(f"Error hiding progress bar: {e}") diff --git a/TUI/modalscreen/download.tcss b/TUI/modalscreen/download.tcss new file mode 100644 index 0000000..ec97187 --- /dev/null +++ b/TUI/modalscreen/download.tcss @@ -0,0 +1,183 @@ +/* Download Modal Screen Stylesheet */ + +Screen { + background: $surface; + overlay: screen; +} + +#download_modal { + width: 100%; + height: 100%; + border: heavy $primary; + background: $boost; +} + +#download_title { + dock: top; + height: 1; + content-align: center middle; + background: $primary; + color: $text; + text-style: bold; + padding: 0 1; +} + +/* Main horizontal layout: 2 columns left/right split */ +#main_layout { + width: 1fr; + height: 1fr; + layout: horizontal; + padding: 1; + border: none; +} + +/* Left column */ +#left_column { + width: 2fr; + height: 1fr; + layout: vertical; +} + +/* Right column */ +#right_column { + width: 1fr; + height: 1fr; + layout: vertical; +} + +/* All containers styling */ +.grid_container { + width: 1fr; + height: 1fr; + padding: 1; + layout: vertical; + margin: 0 0 1 0; +} + +#tags_container { + border: mediumpurple; +} + +#url_container { + border: solid $accent; +} + +#files_container { + border: solid $accent; +} + +#playlist_container { + border: solid $accent; + layout: vertical; + height: 0; +} + +#playlist_tree { + width: 1fr; + height: auto; + border: none; + padding: 0; +} + +#playlist_input { + width: 1fr; + height: 1; + border: none; + padding: 0 1; + margin: 1 0 0 0; +} + +#playlist_input_row { + width: 1fr; + height: auto; + layout: horizontal; + margin: 1 0 0 0; +} + +.section_title { + width: 1fr; + height: 1; + text-align: left; + color: $text-muted; + text-style: bold; + margin: 0 0 0 0; + padding: 0; +} + +/* TextArea widgets in containers */ +#tags_textarea { + width: 1fr; + height: 1fr; + border: none; + padding: 0; +} + +#paragraph_textarea { + width: 1fr; + height: 1fr; + border: none; + padding: 0; +} + +/* Select widgets in containers */ +#files_select { + width: 1fr; + height: 1fr; + border: none; +} + +/* Footer layout - horizontal: checkboxes left, source middle, buttons right */ +#footer_layout { + width: 1fr; + height: auto; + layout: horizontal; + padding: 1; + margin: 0; + background: $boost; +} + +#checkbox_row { + width: auto; + height: auto; + layout: horizontal; + align: left middle; +} + +#source_select { + width: 30; + height: 1; + border: none; + padding: 0 1; + margin: 0; +} + +#button_row { + width: auto; + height: auto; + layout: horizontal; + align: right middle; +} + +/* Progress bar - shown during download */ +#progress_bar { + width: 1fr; + height: 0; +} + +/* Checkbox and Button styling */ +Checkbox { + margin: 0 2 0 0; +} + +Button { + margin: 0 1 0 0; + width: 12; +} + +#cancel_btn { + width: 12; +} + +#submit_btn { + width: 12; +} diff --git a/TUI/modalscreen/export.py b/TUI/modalscreen/export.py new file mode 100644 index 0000000..3ab7e2c --- /dev/null +++ b/TUI/modalscreen/export.py @@ -0,0 +1,512 @@ +"""Export modal screen for exporting files with metadata.""" + +from textual.app import ComposeResult +from textual.screen import ModalScreen +from textual.containers import Container, Horizontal, Vertical +from textual.widgets import Static, Button, Input, TextArea, Tree, Select +from textual.binding import Binding +import logging +from typing import Optional, Any 
+from pathlib import Path +import json +import sys +import subprocess +from datetime import datetime + +# Add parent directory to path for imports +sys.path.insert(0, str(Path(__file__).parent.parent)) +from helper.utils import format_metadata_value +from config import load_config + +logger = logging.getLogger(__name__) + + +class ExportModal(ModalScreen): + """Modal screen for exporting files with metadata and tags.""" + + BINDINGS = [ + Binding("escape", "cancel", "Cancel"), + ] + + CSS_PATH = "export.tcss" + + def __init__(self, result_data: Optional[dict] = None, hydrus_available: bool = False, debrid_available: bool = False): + """Initialize the export modal with result data. + + Args: + result_data: Dictionary containing: + - title: str - Item title + - tags: str - Comma-separated tags + - metadata: dict - File metadata (source-specific from item.metadata or local DB) + - source: str - Source identifier ('local', 'hydrus', 'debrid', etc) + - current_result: object - The full search result object + hydrus_available: bool - Whether Hydrus API is available + debrid_available: bool - Whether Debrid API is available + """ + super().__init__() + self.result_data = result_data or {} + self.hydrus_available = hydrus_available + self.debrid_available = debrid_available + self.metadata_display: Optional[Static] = None + self.tags_textarea: Optional[TextArea] = None + self.export_to_select: Optional[Select] = None + self.custom_path_input: Optional[Input] = None + self.libraries_select: Optional[Select] = None + self.size_input: Optional[Input] = None + self.format_select: Optional[Select] = None + self.file_ext: Optional[str] = None # Store the file extension for format filtering + self.file_type: Optional[str] = None # Store the file type (audio, video, image, document) + self.default_format: Optional[str] = None # Store the default format to set after mount + + def _determine_file_type(self, ext: str) -> tuple[str, list]: + """Determine file type from extension and return type and format options. 
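+
+        For example, '.mp3' maps to ('audio', <audio format options>), '.mkv'
+        to ('video', <video format options>), and '.pdf' to ('document', [])
+        since documents get no conversion options; unrecognised extensions fall
+        back to ('unknown', <audio format options>).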
+ + Args: + ext: File extension (e.g., '.mp3', '.mp4', '.jpg') + + Returns: + Tuple of (file_type, format_options) where format_options is a list of (label, value) tuples + """ + ext_lower = ext.lower() if ext else '' + + # Audio formats + audio_exts = {'.mp3', '.flac', '.wav', '.aac', '.ogg', '.m4a', '.wma', '.opus', '.mka'} + audio_formats = [("MKA", "mka"), ("MP3", "mp3"), ("M4A", "m4a"), ("FLAC", "flac"), ("WAV", "wav"), ("AAC", "aac"), ("OGG", "ogg"), ("Opus", "opus")] + + # Video formats (can have audio too) + video_exts = {'.mp4', '.mkv', '.webm', '.avi', '.mov', '.flv', '.wmv', '.m4v', '.ts', '.mpg', '.mpeg'} + video_formats = [("MP4", "mp4"), ("MKV", "mkv"), ("WebM", "webm"), ("AVI", "avi"), ("MOV", "mov")] + + # Image formats + image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff', '.ico'} + image_formats = [("JPG", "jpg"), ("PNG", "png"), ("WebP", "webp"), ("GIF", "gif"), ("BMP", "bmp")] + + # Document formats - no conversion for now + document_exts = {'.pdf', '.epub', '.txt', '.docx', '.doc', '.rtf', '.md', '.html', '.mobi', '.cbz', '.cbr'} + document_formats = [] + + if ext_lower in audio_exts: + return ('audio', audio_formats) + elif ext_lower in video_exts: + return ('video', video_formats) + elif ext_lower in image_exts: + return ('image', image_formats) + elif ext_lower in document_exts: + return ('document', document_formats) + else: + # Default to audio if unknown + return ('unknown', audio_formats) + + def _get_library_options(self) -> list: + """Get available library options from config.json.""" + options = [("Local", "local")] + + try: + # Try to load config + config_path = Path(__file__).parent.parent / "config.json" + if not config_path.exists(): + return options + + with open(config_path, 'r') as f: + config = json.load(f) + + # Check if Hydrus is configured AND available (supports both new and old format) + from config import get_hydrus_instance + hydrus_instance = get_hydrus_instance(config, "home") + if self.hydrus_available and hydrus_instance and hydrus_instance.get("key") and hydrus_instance.get("url"): + options.append(("Hydrus Network", "hydrus")) + + # Check if Debrid is configured AND available (supports both new and old format) + from config import get_debrid_api_key + debrid_api_key = get_debrid_api_key(config) + if self.debrid_available and debrid_api_key: + options.append(("Debrid", "debrid")) + + except Exception as e: + logger.error(f"Error loading config for libraries: {e}") + + return options + + def _get_metadata_text(self) -> str: + """Format metadata from result data in a consistent display format.""" + metadata = self.result_data.get('metadata', {}) + source = self.result_data.get('source', 'unknown') + logger.info(f"_get_metadata_text called - source: {source}, metadata type: {type(metadata)}, keys: {list(metadata.keys()) if metadata else 'empty'}") + + if not metadata: + logger.info(f"_get_metadata_text - No metadata found, returning 'No metadata available'") + return "No metadata available" + + lines = [] + + # Only display these specific fields in this order + display_fields = [ + 'duration', 'size', 'ext', 'media_type', 'time_imported', 'time_modified', 'hash' + ] + + # Display fields in a consistent order + for field in display_fields: + if field in metadata: + value = metadata[field] + # Skip complex types and None values + if isinstance(value, (dict, list)) or value is None: + continue + # Use central formatting rule + formatted_value = format_metadata_value(field, value) + # Format: "Field Name: value" + field_label 
= field.replace('_', ' ').title() + lines.append(f"{field_label}: {formatted_value}") + + # If we found any fields, display them + if lines: + logger.info(f"_get_metadata_text - Returning {len(lines)} formatted metadata lines") + return "\n".join(lines) + else: + logger.info(f"_get_metadata_text - No matching fields found in metadata") + return "No metadata available" + + def compose(self) -> ComposeResult: + """Compose the export modal screen.""" + with Container(id="export-container"): + yield Static("Export File with Metadata", id="export-title") + + # Row 1: Three columns (Tags, Metadata, Export-To Options) + self.tags_textarea = TextArea( + text=self._format_tags(), + id="tags-area", + read_only=False, + ) + yield self.tags_textarea + self.tags_textarea.border_title = "Tags" + + # Metadata display instead of files tree + self.metadata_display = Static( + self._get_metadata_text(), + id="metadata-display", + ) + yield self.metadata_display + self.metadata_display.border = ("solid", "dodgerblue") + + # Right column: Export options + with Vertical(id="export-options"): + # Export To selector + self.export_to_select = Select( + [("0x0", "0x0"), ("Libraries", "libraries"), ("Custom Path", "path")], + id="export-to-select" + ) + yield self.export_to_select + + # Libraries selector (initially hidden) + library_options = self._get_library_options() + self.libraries_select = Select( + library_options, + id="libraries-select" + ) + yield self.libraries_select + + # Custom path input (initially hidden) + self.custom_path_input = Input( + placeholder="Enter custom export path", + id="custom-path-input" + ) + yield self.custom_path_input + + # Get metadata for size and format options + metadata = self.result_data.get('metadata', {}) + original_size = metadata.get('size', '') + ext = metadata.get('ext', '') + + # Store the extension and determine file type + self.file_ext = ext + self.file_type, format_options = self._determine_file_type(ext) + + # Format size in MB for display + if original_size: + size_mb = int(original_size / (1024 * 1024)) if isinstance(original_size, (int, float)) else original_size + size_display = f"{size_mb}Mb" + else: + size_display = "" + + # Size input + self.size_input = Input( + value=size_display, + placeholder="Size (can reduce)", + id="size-input", + disabled=(self.file_type == 'document') # Disable for documents - no resizing needed + ) + yield self.size_input + + # Determine the default format value (match current extension to format options) + default_format = None + if ext and format_options: + # Map extension to format value (e.g., .flac -> "flac", .mp3 -> "mp3", .m4a -> "m4a") + ext_lower = ext.lower().lstrip('.') # Remove leading dot if present + # Try to find matching format option + for _, value in format_options: + if value and (ext_lower == value or f".{ext_lower}" == ext or ext.endswith(f".{value}")): + default_format = value + logger.debug(f"Matched extension {ext} to format {value}") + break + # If no exact match, use first option + if not default_format and format_options: + default_format = format_options[0][1] + logger.debug(f"No format match for {ext}, using first option: {default_format}") + + # Store the default format to apply after mount + self.default_format = default_format + + # Format selector based on file type + self.format_select = Select( + format_options if format_options else [("No conversion", "")], + id="format-select", + disabled=not format_options # Disable if no format options (e.g., documents) + ) + yield self.format_select + + 
# Row 2: Buttons + with Horizontal(id="export-buttons"): + yield Button("Cancel", id="cancel-btn", variant="default") + yield Button("Export", id="export-btn", variant="primary") + + def _format_tags(self) -> str: + """Format tags from result data.""" + tags = self.result_data.get('tags', '') + if isinstance(tags, str): + # Split by comma and rejoin with newlines + tags_list = [tag.strip() for tag in tags.split(',') if tag.strip()] + return '\n'.join(tags_list) + elif isinstance(tags, list): + return '\n'.join(tags) + return '' + + def on_button_pressed(self, event: Button.Pressed) -> None: + """Handle button press events.""" + button_id = event.button.id + + if button_id == "export-btn": + self._handle_export() + elif button_id == "cancel-btn": + self.action_cancel() + + def on_select_changed(self, event: Select.Changed) -> None: + """Handle select widget changes.""" + if event.control.id == "export-to-select": + # Show/hide custom path and libraries based on selection + if self.custom_path_input: + self.custom_path_input.display = (event.value == "path") + if self.libraries_select: + self.libraries_select.display = (event.value == "libraries") + elif event.control.id == "libraries-select": + # Handle library selection (no special action needed currently) + logger.debug(f"Library selected: {event.value}") + + def on_mount(self) -> None: + """Handle mount event.""" + # Initially hide custom path and libraries inputs (default is "0x0") + if self.custom_path_input: + self.custom_path_input.display = False + if self.libraries_select: + self.libraries_select.display = False + + # Set the default format value to show it selected instead of "Select" + if self.default_format and self.format_select: + self.format_select.value = self.default_format + logger.debug(f"Set format selector to default value: {self.default_format}") + + # Refresh metadata display after mount to ensure data is loaded + if self.metadata_display: + metadata_text = self._get_metadata_text() + self.metadata_display.update(metadata_text) + logger.debug(f"Updated metadata display on mount: {bool(self.result_data.get('metadata'))}") + def _handle_export(self) -> None: + """Handle the export action.""" + try: + tags_text = self.tags_textarea.text.strip() + export_to = self.export_to_select.value if self.export_to_select else "0x0" + custom_path = self.custom_path_input.value.strip() if self.custom_path_input else "" + + # Get library value - handle Select.BLANK case + library = "local" # default + if self.libraries_select and str(self.libraries_select.value) != "Select.BLANK": + library = str(self.libraries_select.value) + elif self.libraries_select and self.libraries_select: + # If value is Select.BLANK, try to get from the options + try: + # Get first available library option as fallback + options = self._get_library_options() + if options: + library = options[0][1] # Get the value part of first option tuple + except Exception: + library = "local" + + size = self.size_input.value.strip() if self.size_input else "" + file_format = self.format_select.value if self.format_select else "mp4" + + # Parse tags from textarea (one per line) + export_tags = set() + for line in tags_text.split('\n'): + tag = line.strip() + if tag: + export_tags.add(tag) + + # For Hydrus export, filter out metadata-only tags (hash:, known_url:, relationship:) + if export_to == "libraries" and library == "hydrus": + metadata_prefixes = {'hash:', 'known_url:', 'relationship:'} + export_tags = {tag for tag in export_tags if not 
any(tag.lower().startswith(prefix) for prefix in metadata_prefixes)} + logger.info(f"Filtered tags for Hydrus - removed metadata tags, {len(export_tags)} tags remaining") + + # Extract title and add as searchable tags if not already present + title = self.result_data.get('title', '').strip() + if title: + # Add the full title as a tag if not already present + title_tag = f"title:{title}" + if title_tag not in export_tags and not any(t.startswith('title:') for t in export_tags): + export_tags.add(title_tag) + + # Extract individual words from title as searchable tags (if reasonable length) + # Skip very short words and common stop words + if len(title) < 100: # Only for reasonably short titles + stop_words = {'the', 'a', 'an', 'and', 'or', 'of', 'in', 'to', 'for', 'is', 'it', 'at', 'by', 'from', 'with', 'as', 'be', 'on', 'that', 'this', 'this'} + words = title.lower().split() + for word in words: + # Clean up word (remove punctuation) + clean_word = ''.join(c for c in word if c.isalnum()) + # Only add if not a stop word and has some length + if clean_word and len(clean_word) > 2 and clean_word not in stop_words: + if clean_word not in export_tags: + export_tags.add(clean_word) + logger.info(f"Extracted {len(words)} words from title, added searchable title tags") + + # Validate required fields - allow export to continue for Hydrus even with 0 actual tags + # (metadata tags will still be in the sidecar, and tags can be added later) + if not export_tags and export_to != "libraries": + logger.warning("No tags provided for export") + return + + if export_to == "libraries" and not export_tags: + logger.warning("No actual tags for Hydrus export (only metadata was present)") + # Don't return - allow export to continue, file will be added to Hydrus even without tags + + # Determine export path + export_path = None + if export_to == "path": + if not custom_path: + logger.warning("Custom path required but not provided") + return + export_path = custom_path + elif export_to == "libraries": + export_path = library # "local", "hydrus", "debrid" + else: + export_path = export_to # "0x0" + + # Get metadata from result_data + metadata = self.result_data.get('metadata', {}) + + # Extract file source info from result_data (passed by hub-ui) + file_hash = self.result_data.get('file_hash') + file_url = self.result_data.get('file_url') + file_path = self.result_data.get('file_path') # For local files + source = self.result_data.get('source', 'unknown') + + # Prepare export data + export_data = { + 'export_to': export_to, + 'export_path': export_path, + 'library': library if export_to == "libraries" else None, + 'tags': export_tags, + 'size': size if size else None, + 'format': file_format, + 'metadata': metadata, + 'original_data': self.result_data, + 'file_hash': file_hash, + 'file_url': file_url, + 'file_path': file_path, # Pass file path for local files + 'source': source, + } + + logger.info(f"Export initiated: destination={export_path}, format={file_format}, size={size}, tags={export_tags}, source={source}, hash={file_hash}, path={file_path}") + + # Dismiss the modal and return the export data + self.dismiss(export_data) + + except Exception as e: + logger.error(f"Error during export: {e}", exc_info=True) + + def action_cancel(self) -> None: + """Handle cancel action.""" + self.dismiss(None) + + + +def create_notes_sidecar(file_path: Path, notes: str) -> None: + """Create a .notes sidecar file with notes text. + + Only creates file if notes are not empty. 
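+    The sidecar keeps the full original filename, so 'video.mp4' gets a
+    companion 'video.mp4.notes' written as UTF-8.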
+ + Args: + file_path: Path to the exported file + notes: Notes text + """ + if not notes or not notes.strip(): + return + + notes_path = file_path.with_suffix(file_path.suffix + '.notes') + try: + with open(notes_path, 'w', encoding='utf-8') as f: + f.write(notes.strip()) + logger.info(f"Created notes sidecar: {notes_path}") + except Exception as e: + logger.error(f"Failed to create notes sidecar: {e}", exc_info=True) + + +def determine_needs_conversion(current_ext: str, target_format: str) -> bool: + """Determine if conversion is needed between two formats. + + Args: + current_ext: Current file extension (e.g., '.flac') + target_format: Target format name (e.g., 'mp3') or NoSelection object + + Returns: + True if conversion is needed, False if it's already the target format + """ + # Handle NoSelection or None + if not target_format or target_format == "" or str(target_format.__class__.__name__) == 'NoSelection': + return False # No conversion requested + + # Normalize the current extension + current_ext_lower = current_ext.lower().lstrip('.') + target_format_lower = str(target_format).lower() + + # Check if they match + return current_ext_lower != target_format_lower + + +def calculate_size_tolerance(metadata: dict, user_size_mb: Optional[str]) -> tuple[Optional[int], Optional[int]]: + """Calculate target size with 1MB grace period. + + Args: + metadata: File metadata containing 'size' in bytes + user_size_mb: User-entered size like "756Mb" or empty string + + Returns: + Tuple of (target_bytes, grace_bytes) where grace_bytes is 1MB (1048576), + or (None, None) if no size specified + """ + grace_bytes = 1 * 1024 * 1024 # 1MB grace period + + if not user_size_mb or not user_size_mb.strip(): + return None, grace_bytes + + try: + # Parse the size string (format like "756Mb") + size_str = user_size_mb.strip().lower() + if size_str.endswith('mb'): + size_str = size_str[:-2] + elif size_str.endswith('m'): + size_str = size_str[:-1] + + size_mb = float(size_str) + target_bytes = int(size_mb * 1024 * 1024) + return target_bytes, grace_bytes + except (ValueError, AttributeError): + return None, grace_bytes diff --git a/TUI/modalscreen/export.tcss b/TUI/modalscreen/export.tcss new file mode 100644 index 0000000..d45280f --- /dev/null +++ b/TUI/modalscreen/export.tcss @@ -0,0 +1,85 @@ +/* Export Modal Screen Styling */ + +ExportModal { + align: center middle; +} + +#export-container { + width: 140; + height: 55; + background: $panel; + border: solid $primary; + layout: grid; + grid-columns: 1fr 1fr 1fr; + grid-rows: auto 1fr auto; +} + +#export-title { + height: 1; + text-align: center; + text-style: bold; + color: $accent; + background: $boost; + padding: 1 2; + column-span: 3; +} + +/* Row 1: Three columns */ +#tags-area { + height: 1fr; + column-span: 1; + border: solid mediumvioletred; +} + +#metadata-display { + height: 1fr; + column-span: 1; + border: solid dodgerblue; + overflow: auto; + padding: 1; +} + +#export-options { + height: 1fr; + column-span: 1; + border: solid mediumpurple; + layout: vertical; + padding: 1; +} + +#export-options Select, +#export-options Input { + height: 3; + margin: 0 0 1 0; +} + +#custom-path-input { + height: 3; + margin: 0 0 1 0; +} + +#libraries-select { + height: 3; + margin: 0 0 1 0; +} + +#size-input { + height: 3; + margin: 0 0 1 0; +} + +#format-select { + height: 3; +} + +/* Row 2: Buttons */ +#export-buttons { + height: auto; + column-span: 3; + layout: horizontal; +} + +#export-buttons Button { + width: 1fr; + margin: 0 1; +} diff --git 
a/TUI/modalscreen/search.py b/TUI/modalscreen/search.py new file mode 100644 index 0000000..7ba37bf --- /dev/null +++ b/TUI/modalscreen/search.py @@ -0,0 +1,505 @@ +"""Search modal screen for OpenLibrary and Soulseek.""" + +from textual.app import ComposeResult +from textual.screen import ModalScreen +from textual.containers import Container, Horizontal, Vertical +from textual.widgets import Static, Button, Input, Select, DataTable, TextArea +from textual.binding import Binding +from textual.message import Message +import logging +from typing import Optional, Any, List +from pathlib import Path +import sys +import asyncio + +# Add parent directory to path for imports +sys.path.insert(0, str(Path(__file__).parent.parent)) +from config import load_config + +logger = logging.getLogger(__name__) + + +class SearchModal(ModalScreen): + """Modal screen for searching OpenLibrary and Soulseek.""" + + BINDINGS = [ + Binding("escape", "cancel", "Cancel"), + Binding("enter", "search_focused", "Search"), + Binding("ctrl+t", "scrape_tags", "Scrape Tags"), + ] + + CSS_PATH = "search.tcss" + + class SearchSelected(Message): + """Posted when user selects a search result.""" + def __init__(self, result: dict) -> None: + self.result = result + super().__init__() + + def __init__(self, app_instance=None): + """Initialize the search modal. + + Args: + app_instance: Reference to the main App instance for worker creation + """ + super().__init__() + self.app_instance = app_instance + self.source_select: Optional[Select] = None + self.search_input: Optional[Input] = None + self.results_table: Optional[DataTable] = None + self.tags_textarea: Optional[TextArea] = None + self.library_source_select: Optional[Select] = None + self.current_results: List[dict] = [] + self.is_searching = False + self.current_worker = None # Track worker for search operations + + def compose(self) -> ComposeResult: + """Create child widgets for the search modal.""" + with Vertical(id="search-container"): + yield Static("Search Books & Music", id="search-title") + + with Horizontal(id="search-controls"): + # Source selector + self.source_select = Select( + [("OpenLibrary", "openlibrary"), ("Soulseek", "soulseek")], + value="openlibrary", + id="source-select" + ) + yield self.source_select + + # Search input + self.search_input = Input( + placeholder="Enter search query...", + id="search-input" + ) + yield self.search_input + + # Search button + yield Button("Search", id="search-button", variant="primary") + + # Results table + self.results_table = DataTable(id="results-table") + yield self.results_table + + # Two-column layout: tags on left, source/submit on right + with Horizontal(id="bottom-controls"): + # Left column: Tags textarea + with Vertical(id="tags-column"): + self.tags_textarea = TextArea( + text="", + id="result-tags-textarea", + read_only=False + ) + self.tags_textarea.border_title = "Tags [Ctrl+T: Scrape]" + yield self.tags_textarea + + # Right column: Library source and submit button + with Vertical(id="source-submit-column"): + # Library source selector (for OpenLibrary results) + self.library_source_select = Select( + [("Local", "local"), ("Download", "download")], + value="local", + id="library-source-select" + ) + yield self.library_source_select + + # Submit button + yield Button("Submit", id="submit-button", variant="primary") + + # Buttons at bottom + with Horizontal(id="search-buttons"): + yield Button("Select", id="select-button", variant="primary") + yield Button("Download", id="download-button", 
variant="primary") + yield Button("Cancel", id="cancel-button", variant="default") + + def on_mount(self) -> None: + """Set up the table columns and focus.""" + # Set up results table columns + self.results_table.add_columns( + "Title", + "Author/Artist", + "Year/Album", + "Details" + ) + + # Focus on search input + self.search_input.focus() + + async def _search_openlibrary(self, query: str) -> List[dict]: + """Search OpenLibrary for books.""" + try: + from helper.search_provider import get_provider + + logger.info(f"[search-modal] Searching OpenLibrary for: {query}") + + # Get the OpenLibrary provider (now has smart search built-in) + provider = get_provider("openlibrary") + if not provider: + logger.error("[search-modal] OpenLibrary provider not available") + return [] + + # Search using the provider (smart search is now default) + search_results = provider.search(query, limit=20) + + formatted_results = [] + for result in search_results: + # Extract metadata from SearchResult.full_metadata + metadata = result.full_metadata or {} + + formatted_results.append({ + "title": result.title, + "author": ", ".join(metadata.get("authors", [])) if metadata.get("authors") else "Unknown", + "year": metadata.get("year", ""), + "publisher": metadata.get("publisher", ""), + "isbn": metadata.get("isbn", ""), + "oclc": metadata.get("oclc", ""), + "lccn": metadata.get("lccn", ""), + "openlibrary_id": metadata.get("olid", ""), + "pages": metadata.get("pages", ""), + "language": metadata.get("language", ""), + "source": "openlibrary", + "columns": result.columns, + "raw_data": metadata + }) + + logger.info(f"[search-modal] Found {len(formatted_results)} OpenLibrary results") + return formatted_results + + except Exception as e: + logger.error(f"[search-modal] OpenLibrary search error: {e}", exc_info=True) + import traceback + traceback.print_exc() + return [] + + async def _search_soulseek(self, query: str) -> List[dict]: + """Search Soulseek for music with automatic worker tracking.""" + try: + from helper.search_provider import get_provider + + # Create worker for tracking + worker = None + if self.app_instance and hasattr(self.app_instance, 'create_worker'): + worker = self.app_instance.create_worker( + 'soulseek', + title=f"Soulseek Search: {query[:40]}", + description=f"Searching P2P network for music" + ) + self.current_worker = worker + + if worker: + worker.log_step("Connecting to Soulseek peer network...") + + logger.info(f"[search-modal] Searching Soulseek for: {query}") + provider = get_provider("soulseek") + search_results = provider.search(query, limit=20) + + if worker: + worker.log_step(f"Search returned {len(search_results)} results") + + logger.info(f"[search-modal] Found {len(search_results)} Soulseek results") + + # Format results for display + formatted_results = [] + for idx, result in enumerate(search_results): + metadata = result.full_metadata or {} + artist = metadata.get('artist', '') + album = metadata.get('album', '') + title = result.title + track_num = metadata.get('track_num', '') + size_bytes = result.size_bytes or 0 + + # Format size as human-readable + if size_bytes > 1024 * 1024: + size_str = f"{size_bytes / (1024 * 1024):.1f} MB" + elif size_bytes > 1024: + size_str = f"{size_bytes / 1024:.1f} KB" + else: + size_str = f"{size_bytes} B" + + # Build columns for display + columns = [ + ("#", str(idx + 1)), + ("Title", title[:50] if title else "Unknown"), + ("Artist", artist[:30] if artist else "(no artist)"), + ("Album", album[:30] if album else ""), + ] + + 
formatted_results.append({ + "title": title if title else "Unknown", + "artist": artist if artist else "(no artist)", + "album": album, + "track": track_num, + "filesize": size_str, + "bitrate": "", # Not available in Soulseek results + "source": "soulseek", + "columns": columns, + "raw_data": result.to_dict() + }) + + return formatted_results + except Exception as e: + logger.error(f"[search-modal] Soulseek search error: {e}") + import traceback + traceback.print_exc() + return [] + + async def _perform_search(self) -> None: + """Perform the actual search based on selected source.""" + if not self.search_input or not self.source_select or not self.results_table: + logger.error("[search-modal] Widgets not initialized") + return + + query = self.search_input.value.strip() + if not query: + logger.warning("[search-modal] Empty search query") + return + + source = self.source_select.value + + # Clear existing results + self.results_table.clear() + self.current_results = [] + + self.is_searching = True + + try: + if source == "openlibrary": + results = await self._search_openlibrary(query) + elif source == "soulseek": + results = await self._search_soulseek(query) + else: + logger.warning(f"[search-modal] Unknown source: {source}") + if self.current_worker: + self.current_worker.finish("error", "Unknown search source") + return + + self.current_results = results + + # Populate table with results + if results: + # Check if first result has columns field + first_result = results[0] + if "columns" in first_result and first_result["columns"]: + # Use dynamic columns from result + # Clear existing columns and rebuild based on result columns + self.results_table.clear() + + # Extract column headers from first result's columns field + column_headers = [col[0] for col in first_result["columns"]] + + # Remove existing columns (we'll readd them with the right headers) + # Note: This is a workaround since Textual's DataTable doesn't support dynamic column management well + # For now, we just use the dynamic column headers from the result + logger.info(f"[search-modal] Using dynamic columns: {column_headers}") + + # Populate rows using the column order from results + for result in results: + if "columns" in result and result["columns"]: + # Extract values in column order + row_data = [col[1] for col in result["columns"]] + self.results_table.add_row(*row_data) + else: + # Fallback for results without columns + logger.warning(f"[search-modal] Result missing columns field: {result.get('title', 'Unknown')}") + else: + # Fallback to original hardcoded behavior if columns not available + logger.info("[search-modal] No dynamic columns found, using default formatting") + + for result in results: + if source == "openlibrary": + # Format OpenLibrary results (original hardcoded) + year = str(result.get("year", ""))[:4] if result.get("year") else "" + details = f"ISBN: {result.get('isbn', '')}" if result.get('isbn') else "" + if result.get('openlibrary_id'): + details += f" | OL: {result.get('openlibrary_id')}" + + row_data = [ + result["title"][:60], + result["author"][:35], + year, + details[:40] + ] + else: # soulseek + row_data = [ + result["title"][:50], + result["artist"][:30], + result["album"][:30], + result['filesize'] + ] + + self.results_table.add_row(*row_data) + else: + # Add a "no results" message + self.results_table.add_row("No results found", "", "", "") + + # Finish worker if tracking + if self.current_worker: + self.current_worker.finish("completed", f"Found {len(results)} results") + + except 
Exception as e: + logger.error(f"[search-modal] Search error: {e}") + if self.current_worker: + self.current_worker.finish("error", f"Search failed: {str(e)}") + + finally: + self.is_searching = False + + def on_button_pressed(self, event: Button.Pressed) -> None: + """Handle button presses.""" + button_id = event.button.id + + if button_id == "search-button": + # Run search asynchronously + asyncio.create_task(self._perform_search()) + + elif button_id == "select-button": + # Get selected row and populate tags textarea + if self.results_table and self.results_table.row_count > 0: + selected_row = self.results_table.cursor_row + if 0 <= selected_row < len(self.current_results): + result = self.current_results[selected_row] + # Populate tags textarea with result metadata + self._populate_tags_from_result(result) + else: + logger.warning("[search-modal] No results to select") + + elif button_id == "download-button": + # Download the selected result + if self.current_results and self.results_table.row_count > 0: + selected_row = self.results_table.cursor_row + if 0 <= selected_row < len(self.current_results): + result = self.current_results[selected_row] + if result.get("source") == "openlibrary": + asyncio.create_task(self._download_book(result)) + else: + logger.warning("[search-modal] Download only supported for OpenLibrary results") + else: + logger.warning("[search-modal] No result selected for download") + + elif button_id == "submit-button": + # Submit the current result with tags and source + if self.current_results and self.results_table.row_count > 0: + selected_row = self.results_table.cursor_row + if 0 <= selected_row < len(self.current_results): + result = self.current_results[selected_row] + # Get tags from textarea + tags_text = self.tags_textarea.text if self.tags_textarea else "" + # Get library source (if OpenLibrary) + library_source = self.library_source_select.value if self.library_source_select else "local" + + # Add tags and source to result + result["tags_text"] = tags_text + result["library_source"] = library_source + + # Post message and dismiss + self.post_message(self.SearchSelected(result)) + self.dismiss(result) + else: + logger.warning("[search-modal] No result selected for submission") + + elif button_id == "cancel-button": + self.dismiss(None) + + def _populate_tags_from_result(self, result: dict) -> None: + """Populate the tags textarea from a selected result.""" + if not self.tags_textarea: + return + + # Format tags based on result source + if result.get("source") == "openlibrary": + # For OpenLibrary: title, author, year + title = result.get("title", "") + author = result.get("author", "") + year = result.get("year", "") + tags = [] + if title: + tags.append(title) + if author: + tags.append(author) + if year: + tags.append(year) + tags_text = "\n".join(tags) + else: # soulseek + # For Soulseek: artist, album, title, track + tags = [] + if result.get("artist"): + tags.append(result["artist"]) + if result.get("album"): + tags.append(result["album"]) + if result.get("track"): + tags.append(f"Track {result['track']}") + if result.get("title"): + tags.append(result["title"]) + tags_text = "\n".join(tags) + + self.tags_textarea.text = tags_text + logger.info(f"[search-modal] Populated tags textarea from result") + + async def _download_book(self, result: dict) -> None: + """Download a book from OpenLibrary using unified downloader.""" + try: + from helper.unified_book_downloader import UnifiedBookDownloader + from config import load_config + + 
logger.info(f"[search-modal] Starting download for: {result.get('title')}") + + config = load_config() + downloader = UnifiedBookDownloader(config=config) + + # Get download options for this book + options = downloader.get_download_options(result) + + if not options['methods']: + logger.warning(f"[search-modal] No download methods available for: {result.get('title')}") + # Could show a modal dialog here + return + + # For now, use the first available method (we could show a dialog to choose) + method = options['methods'][0] + logger.info(f"[search-modal] Using download method: {method.get('label')}") + + # Perform the download + success, message = await downloader.download_book(method) + + if success: + logger.info(f"[search-modal] Download successful: {message}") + # Could show success dialog + else: + logger.warning(f"[search-modal] Download failed: {message}") + # Could show error dialog + + downloader.close() + + except Exception as e: + logger.error(f"[search-modal] Download error: {e}", exc_info=True) + + def action_search_focused(self) -> None: + """Action for Enter key - only search if search input is focused.""" + if self.search_input and self.search_input.has_focus and not self.is_searching: + asyncio.create_task(self._perform_search()) + + def action_scrape_tags(self) -> None: + """Action for Ctrl+T - populate tags from selected result.""" + if self.current_results and self.results_table and self.results_table.row_count > 0: + try: + selected_row = self.results_table.cursor_row + if 0 <= selected_row < len(self.current_results): + result = self.current_results[selected_row] + self._populate_tags_from_result(result) + logger.info(f"[search-modal] Ctrl+T: Populated tags from result at row {selected_row}") + else: + logger.warning(f"[search-modal] Ctrl+T: Invalid row index {selected_row}") + except Exception as e: + logger.error(f"[search-modal] Ctrl+T error: {e}") + else: + logger.warning("[search-modal] Ctrl+T: No results selected") + + def action_cancel(self) -> None: + """Action for Escape key - close modal.""" + self.dismiss(None) + + def on_input_submitted(self, event: Input.Submitted) -> None: + """Handle Enter key in search input - only trigger search here.""" + if event.input.id == "search-input": + if not self.is_searching: + asyncio.create_task(self._perform_search()) diff --git a/TUI/modalscreen/search.tcss b/TUI/modalscreen/search.tcss new file mode 100644 index 0000000..4a67ca8 --- /dev/null +++ b/TUI/modalscreen/search.tcss @@ -0,0 +1,121 @@ +/* Search Modal Screen Styling */ + +SearchModal { + align: center middle; +} + +Screen { + layout: vertical; +} + +#search-container { + width: 140; + height: 40; + background: $panel; + border: solid $primary; + layout: vertical; +} + +Static#search-title { + height: 3; + dock: top; + text-align: center; + text-style: bold; + color: $accent; + background: $boost; + padding: 1 2; +} + +#search-controls { + height: auto; + layout: horizontal; + padding: 1; + border: solid $primary; +} + +#source-select { + width: 20; + margin-right: 1; +} + +#search-input { + width: 1fr; + margin-right: 1; +} + +#search-button { + width: 12; +} + +#results-table { + height: 1fr; + border: solid $primary; +} + +DataTable { + border: solid $accent; +} + +DataTable > .datatable--header { + background: $boost; + color: $accent; + text-style: bold; +} + +DataTable > .datatable--cursor-row { + background: $accent; +} + +#bottom-controls { + height: auto; + layout: horizontal; + padding: 1; + border: solid $primary; +} + +#tags-column { + width: 1fr; + 
layout: vertical; + padding-right: 1; + height: auto; +} + +#result-tags-textarea { + height: 10; + width: 1fr; + border: solid $accent; +} + +#source-submit-column { + width: 20; + layout: vertical; + padding-left: 1; + height: auto; +} + +#library-source-select { + width: 1fr; + margin-bottom: 1; +} + +#submit-button { + width: 1fr; +} + +#search-buttons { + height: 3; + dock: bottom; + layout: horizontal; + padding: 1; + border: solid $primary; + align: center middle; +} + +#select-button { + width: 12; + margin-right: 2; +} + +#cancel-button { + width: 12; +} diff --git a/TUI/modalscreen/workers.py b/TUI/modalscreen/workers.py new file mode 100644 index 0000000..a48de42 --- /dev/null +++ b/TUI/modalscreen/workers.py @@ -0,0 +1,585 @@ +"""Workers modal screen for monitoring and managing background tasks.""" + +from textual.app import ComposeResult +from textual.screen import ModalScreen +from textual.containers import Horizontal, Vertical +from textual.widgets import Static, Button, DataTable, TextArea +from textual.binding import Binding +from textual.message import Message +import logging +from typing import Optional, Dict, List, Any +from pathlib import Path +import sys + +# Add parent directory to path for imports +sys.path.insert(0, str(Path(__file__).parent.parent)) + +logger = logging.getLogger(__name__) + + +class WorkersModal(ModalScreen): + """Modal screen for monitoring running and finished workers.""" + + BINDINGS = [ + Binding("escape", "cancel", "Cancel"), + ] + + CSS_PATH = "workers.tcss" + + class WorkerUpdated(Message): + """Posted when worker list is updated.""" + def __init__(self, workers: List[Dict[str, Any]]) -> None: + self.workers = workers + super().__init__() + + class WorkerCancelled(Message): + """Posted when user cancels a worker.""" + def __init__(self, worker_id: str) -> None: + self.worker_id = worker_id + super().__init__() + + def __init__(self, app_instance=None): + """Initialize the workers modal. 
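+
+        Worker data is pulled from the hub app's get_running_workers() /
+        get_finished_workers() methods when they exist; if the app does not
+        expose them, the tables fall back to an empty state.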
+ + Args: + app_instance: Reference to the hub app for accessing worker info + """ + super().__init__() + self.app_instance = app_instance + self.running_table: Optional[DataTable] = None + self.finished_table: Optional[DataTable] = None + self.stdout_display: Optional[TextArea] = None + self.running_workers: List[Dict[str, Any]] = [] + self.finished_workers: List[Dict[str, Any]] = [] + self.selected_worker_id: Optional[str] = None + self.show_running = False # Start with finished tab + + def compose(self) -> ComposeResult: + """Create child widgets for the workers modal.""" + with Vertical(id="workers-container"): + # Title with toggle buttons + with Horizontal(id="workers-title-bar"): + yield Static("Workers Monitor", id="workers-title") + yield Button("Running", id="toggle-running-btn", variant="primary") + yield Button("Finished", id="toggle-finished-btn", variant="default") + + # Running tab content (initially hidden) + with Vertical(id="running-section"): + self.running_table = DataTable(id="running-table") + yield self.running_table + + with Horizontal(id="running-controls"): + yield Button("Refresh", id="running-refresh-btn", variant="primary") + yield Button("Stop Selected", id="running-stop-btn", variant="warning") + yield Button("Stop All", id="running-stop-all-btn", variant="error") + + # Finished tab content (initially visible) + with Vertical(id="finished-section"): + self.finished_table = DataTable(id="finished-table") + yield self.finished_table + + with Horizontal(id="finished-controls"): + yield Button("Refresh", id="finished-refresh-btn", variant="primary") + yield Button("Clear Selected", id="finished-clear-btn", variant="warning") + yield Button("Clear All", id="finished-clear-all-btn", variant="error") + + # Shared textarea for displaying worker logs + with Vertical(id="logs-section"): + yield Static("Worker Logs:", id="logs-label") + self.stdout_display = TextArea(id="stdout-display", read_only=True) + yield self.stdout_display + + with Horizontal(id="workers-buttons"): + yield Button("Close", id="close-btn", variant="primary") + + def on_mount(self) -> None: + """Set up the tables and load worker data.""" + # Set up running workers table + if self.running_table: + self.running_table.add_columns( + "ID", + "Type", + "Status", + "Pipe", + "Progress", + "Started", + "Details" + ) + self.running_table.zebra_stripes = True + + # Set up finished workers table + if self.finished_table: + self.finished_table.add_columns( + "ID", + "Type", + "Result", + "Pipe", + "Started", + "Completed", + "Duration", + "Details" + ) + self.finished_table.zebra_stripes = True + + # Set initial view (show finished by default) + self._update_view_visibility() + + # Load initial data + self.refresh_workers() + + # Don't set up periodic refresh - it was causing issues with stdout display + # Users can click the Refresh button to update manually + + def refresh_workers(self) -> None: + """Refresh the workers data from app instance.""" + try: + if not self.app_instance: + logger.warning("[workers-modal] No app instance provided") + return + + # Get running workers from app instance + # This assumes the app has a get_running_workers() method + if hasattr(self.app_instance, 'get_running_workers'): + self.running_workers = self.app_instance.get_running_workers() + else: + self.running_workers = [] + + # Get finished workers from app instance + if hasattr(self.app_instance, 'get_finished_workers'): + self.finished_workers = self.app_instance.get_finished_workers() + if self.finished_workers: + 
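+                    # Each entry is expected to be a dict with at least the keys read
+                    # by _update_finished_table below ('id', 'type', 'result', 'pipe',
+                    # 'started', 'completed', 'duration', 'details').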
logger.info(f"[workers-modal-refresh] Got {len(self.finished_workers)} finished workers from app") + # Log the keys in the first worker to verify structure + if isinstance(self.finished_workers[0], dict): + logger.info(f"[workers-modal-refresh] First worker keys: {list(self.finished_workers[0].keys())}") + logger.info(f"[workers-modal-refresh] First worker: {self.finished_workers[0]}") + else: + logger.warning(f"[workers-modal-refresh] First worker is not a dict: {type(self.finished_workers[0])}") + else: + self.finished_workers = [] + + # Update tables + self._update_running_table() + self._update_finished_table() + + logger.info(f"[workers-modal] Refreshed: {len(self.running_workers)} running, {len(self.finished_workers)} finished") + except Exception as e: + logger.error(f"[workers-modal] Error refreshing workers: {e}") + + def _update_view_visibility(self) -> None: + """Toggle visibility between running and finished views.""" + try: + running_section = self.query_one("#running-section", Vertical) + finished_section = self.query_one("#finished-section", Vertical) + toggle_running_btn = self.query_one("#toggle-running-btn", Button) + toggle_finished_btn = self.query_one("#toggle-finished-btn", Button) + + if self.show_running: + running_section.display = True + finished_section.display = False + toggle_running_btn.variant = "primary" + toggle_finished_btn.variant = "default" + logger.debug("[workers-modal] Switched to Running view") + else: + running_section.display = False + finished_section.display = True + toggle_running_btn.variant = "default" + toggle_finished_btn.variant = "primary" + logger.debug("[workers-modal] Switched to Finished view") + except Exception as e: + logger.error(f"[workers-modal] Error updating view visibility: {e}") + + def _update_running_table(self) -> None: + """Update the running workers table.""" + try: + if not self.running_table: + logger.error("[workers-modal] Running table not initialized") + return + + self.running_table.clear() + + if not self.running_workers: + self.running_table.add_row("---", "---", "---", "---", "---", "---", "No workers running") + logger.debug(f"[workers-modal] No running workers to display") + return + + logger.debug(f"[workers-modal] Updating running table with {len(self.running_workers)} workers") + + for idx, worker_info in enumerate(self.running_workers): + try: + worker_id = worker_info.get('id', 'unknown') + worker_type = worker_info.get('type', 'unknown') + status = worker_info.get('status', 'running') + progress = worker_info.get('progress', '') + started = worker_info.get('started', '') + details = worker_info.get('details', '') + pipe = worker_info.get('pipe', '') + + # Ensure values are strings + worker_id = str(worker_id) if worker_id else 'unknown' + worker_type = str(worker_type) if worker_type else 'unknown' + status = str(status) if status else 'running' + progress = str(progress) if progress else '---' + started = str(started) if started else '---' + details = str(details) if details else '---' + pipe_display = self._summarize_pipe(pipe) + + # Truncate long strings + progress = progress[:20] + started = started[:19] + details = details[:30] + pipe_display = pipe_display[:40] + + self.running_table.add_row( + worker_id[:8], + worker_type[:15], + status[:10], + pipe_display, + progress, + started, + details + ) + + if idx == 0: # Log first entry + logger.debug(f"[workers-modal] Added running row {idx}: {worker_id[:8]} {worker_type[:15]} {status}") + except Exception as row_error: + logger.error(f"[workers-modal] 
Error adding running row {idx}: {row_error}", exc_info=True) + + logger.debug(f"[workers-modal] Updated running table with {len(self.running_workers)} workers") + except Exception as e: + logger.error(f"[workers-modal] Error updating running table: {e}", exc_info=True) + + def _update_finished_table(self) -> None: + """Update the finished workers table.""" + try: + if not self.finished_table: + logger.error("[workers-modal] Finished table not initialized") + return + + self.finished_table.clear() + + if not self.finished_workers: + self.finished_table.add_row("---", "---", "---", "---", "---", "---", "---", "No finished workers") + logger.debug(f"[workers-modal] No finished workers to display") + return + + logger.info(f"[workers-modal-update] STARTING to update finished table with {len(self.finished_workers)} workers") + added_count = 0 + error_count = 0 + + for idx, worker_info in enumerate(self.finished_workers): + try: + worker_id = worker_info.get('id', 'unknown') + worker_type = worker_info.get('type', 'unknown') + result = worker_info.get('result', 'unknown') + completed = worker_info.get('completed', '') + duration = worker_info.get('duration', '') + details = worker_info.get('details', '') + pipe = worker_info.get('pipe', '') + started = worker_info.get('started', '') + + # Ensure values are strings + worker_id = str(worker_id) if worker_id else 'unknown' + worker_type = str(worker_type) if worker_type else 'unknown' + result = str(result) if result else 'unknown' + completed = str(completed) if completed else '---' + duration = str(duration) if duration else '---' + details = str(details) if details else '---' + started = str(started) if started else '---' + pipe_display = self._summarize_pipe(pipe) + + # Truncate long strings + result = result[:15] + completed = completed[:19] + started = started[:19] + duration = duration[:10] + details = details[:30] + pipe_display = pipe_display[:40] + + self.finished_table.add_row( + worker_id[:8], + worker_type[:15], + result, + pipe_display, + started, + completed, + duration, + details + ) + added_count += 1 + + except Exception as row_error: + error_count += 1 + logger.error(f"[workers-modal-update] Error adding finished row {idx}: {row_error}", exc_info=True) + + logger.info(f"[workers-modal-update] COMPLETED: Added {added_count}/{len(self.finished_workers)} finished workers (errors: {error_count})") + logger.debug(f"[workers-modal-update] Finished table row_count after update: {self.finished_table.row_count}") + except Exception as e: + logger.error(f"[workers-modal] Error updating finished table: {e}", exc_info=True) + + def on_data_table_row_highlighted(self, event: DataTable.RowHighlighted) -> None: + """Handle row highlight in tables - display stdout.""" + try: + logger.info(f"[workers-modal] Row highlighted, cursor_row: {event.cursor_row}") + + # Get the selected worker from the correct table + workers_list = None + if event.control == self.running_table: + workers_list = self.running_workers + logger.debug(f"[workers-modal] Highlighted in running table") + elif event.control == self.finished_table: + workers_list = self.finished_workers + logger.debug(f"[workers-modal] Highlighted in finished table, list size: {len(workers_list)}") + else: + logger.warning(f"[workers-modal] Unknown table: {event.control}") + return + + # Get the worker at this row + if workers_list and 0 <= event.cursor_row < len(workers_list): + worker = workers_list[event.cursor_row] + worker_id = worker.get('id', '') + logger.info(f"[workers-modal] Highlighted 
worker: {worker_id}") + + if worker_id: + self.selected_worker_id = worker_id + # Display the stdout + self._update_stdout_display(worker_id, worker) + else: + logger.warning(f"[workers-modal] Row {event.cursor_row} out of bounds for list of size {len(workers_list) if workers_list else 0}") + except Exception as e: + logger.error(f"[workers-modal] Error handling row highlight: {e}", exc_info=True) + + def on_data_table_cell_highlighted(self, event: DataTable.CellHighlighted) -> None: + """Handle cell highlight in tables - display stdout (backup for row selection).""" + try: + # CellHighlighted has coordinate (row, column) not cursor_row + cursor_row = event.coordinate.row + logger.debug(f"[workers-modal] Cell highlighted, row: {cursor_row}, column: {event.coordinate.column}") + + # Get the selected worker from the correct table + workers_list = None + if event.data_table == self.running_table: + workers_list = self.running_workers + logger.debug(f"[workers-modal] Cell highlighted in running table") + elif event.data_table == self.finished_table: + workers_list = self.finished_workers + logger.debug(f"[workers-modal] Cell highlighted in finished table, list size: {len(workers_list)}") + else: + return + + # Get the worker at this row + if workers_list and 0 <= cursor_row < len(workers_list): + worker = workers_list[cursor_row] + worker_id = worker.get('id', '') + + if worker_id and worker_id != self.selected_worker_id: + logger.info(f"[workers-modal] Cell-highlighted worker: {worker_id}") + self.selected_worker_id = worker_id + # Display the stdout + self._update_stdout_display(worker_id, worker) + except Exception as e: + logger.debug(f"[workers-modal] Error handling cell highlight: {e}") + + def _update_stdout_display(self, worker_id: str, worker: Optional[Dict[str, Any]] = None) -> None: + """Update the stdout textarea with logs from the selected worker.""" + try: + if not self.stdout_display: + logger.error("[workers-modal] stdout_display not initialized") + return + logger.debug(f"[workers-modal] Updating stdout display for worker: {worker_id}") + worker_data = worker or self._locate_worker(worker_id) + stdout_text = self._resolve_worker_stdout(worker_id, worker_data) + pipe_text = self._resolve_worker_pipe(worker_id, worker_data) + events = self._get_worker_events(worker_id) + timeline_text = self._format_worker_timeline(events) + sections = [] + if pipe_text: + sections.append(f"Pipe:\n{pipe_text}") + if timeline_text: + sections.append("Timeline:\n" + timeline_text) + logs_body = (stdout_text or "").strip() + sections.append("Logs:\n" + (logs_body if logs_body else "(no logs recorded)")) + combined_text = "\n\n".join(sections) + logger.debug(f"[workers-modal] Setting textarea to {len(combined_text)} chars (stdout_len={len(stdout_text or '')})") + self.stdout_display.text = combined_text + if len(combined_text) > 10: + try: + self.stdout_display.cursor_location = (len(combined_text) - 1, 0) + except Exception: + pass + logger.info(f"[workers-modal] Updated stdout display successfully") + except Exception as e: + logger.error(f"[workers-modal] Error updating stdout display: {e}", exc_info=True) + + def _locate_worker(self, worker_id: str) -> Optional[Dict[str, Any]]: + for worker in self.running_workers or []: + if isinstance(worker, dict) and worker.get('id') == worker_id: + return worker + for worker in self.finished_workers or []: + if isinstance(worker, dict) and worker.get('id') == worker_id: + return worker + return None + + def _resolve_worker_stdout(self, worker_id: str, 
worker: Optional[Dict[str, Any]]) -> str: + if worker and worker.get('stdout'): + return worker.get('stdout', '') or '' + manager = getattr(self.app_instance, 'worker_manager', None) + if manager: + try: + return manager.get_stdout(worker_id) or '' + except Exception as exc: + logger.debug(f"[workers-modal] Could not fetch stdout for {worker_id}: {exc}") + return '' + + def _resolve_worker_pipe(self, worker_id: str, worker: Optional[Dict[str, Any]]) -> str: + if worker and worker.get('pipe'): + return str(worker.get('pipe')) + record = self._fetch_worker_record(worker_id) + if record and record.get('pipe'): + return str(record.get('pipe')) + return '' + + def _fetch_worker_record(self, worker_id: str) -> Optional[Dict[str, Any]]: + manager = getattr(self.app_instance, 'worker_manager', None) + if not manager: + return None + try: + return manager.get_worker(worker_id) + except Exception as exc: + logger.debug(f"[workers-modal] Could not fetch worker record {worker_id}: {exc}") + return None + + def _get_worker_events(self, worker_id: str, limit: int = 250) -> List[Dict[str, Any]]: + manager = getattr(self.app_instance, 'worker_manager', None) + if not manager: + return [] + try: + return manager.get_worker_events(worker_id, limit=limit) + except Exception as exc: + logger.debug(f"[workers-modal] Could not fetch worker events {worker_id}: {exc}") + return [] + + def _format_worker_timeline(self, events: List[Dict[str, Any]]) -> str: + if not events: + return "" + lines: List[str] = [] + for event in events: + timestamp = self._format_event_timestamp(event.get('created_at')) + label = (event.get('event_type') or '').upper() or 'EVENT' + channel = (event.get('channel') or '').upper() + if channel and channel not in label: + label = f"{label}/{channel}" + step = event.get('step') or '' + message = event.get('message') or '' + prefix = '' + if event.get('event_type') == 'step' and step: + prefix = f"{step} :: " + elif step and step not in message: + prefix = f"{step} :: " + formatted_message = self._format_message_block(message) + lines.append(f"[{timestamp}] {label}: {prefix}{formatted_message}") + return "\n".join(lines) + + def _format_event_timestamp(self, raw_timestamp: Any) -> str: + if not raw_timestamp: + return "--:--:--" + text = str(raw_timestamp) + if "T" in text: + time_part = text.split("T", 1)[1] + elif " " in text: + time_part = text.split(" ", 1)[1] + else: + time_part = text + return time_part[:8] if len(time_part) >= 8 else time_part + + def _format_message_block(self, message: str) -> str: + clean = (message or '').strip() + if not clean: + return "(empty)" + lines = clean.splitlines() + if len(lines) == 1: + return lines[0] + head, *rest = lines + indented = "\n".join(f" {line}" for line in rest) + return f"{head}\n{indented}" + + def _summarize_pipe(self, pipe_value: Any, limit: int = 40) -> str: + text = str(pipe_value or '').strip() + if not text: + return "(none)" + return text if len(text) <= limit else text[: limit - 3] + '...' 
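+
+    # Illustrative sketch of the text assembled by _update_stdout_display
+    # (the pipe string and event values below are hypothetical):
+    #
+    #   Pipe:
+    #   download-data "<url>" | add-file -storage local
+    #
+    #   Timeline:
+    #   [20:09:33] STEP: download :: starting
+    #   [20:09:35] LOG/STDERR: download :: retrying after timeout
+    #
+    #   Logs:
+    #   (no logs recorded)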
+ + def on_button_pressed(self, event: Button.Pressed) -> None: + """Handle button presses.""" + button_id = event.button.id + + try: + if button_id == "toggle-running-btn": + self.show_running = True + self._update_view_visibility() + return + + elif button_id == "toggle-finished-btn": + self.show_running = False + self._update_view_visibility() + return + + if button_id == "running-refresh-btn": + self.refresh_workers() + + elif button_id == "running-stop-btn": + # Stop selected running worker + if self.running_table and self.running_table.row_count > 0: + try: + selected_row = self.running_table.cursor_row + if 0 <= selected_row < len(self.running_workers): + worker = self.running_workers[selected_row] + worker_id = worker.get('id') + if self.app_instance and hasattr(self.app_instance, 'stop_worker'): + self.app_instance.stop_worker(worker_id) + logger.info(f"[workers-modal] Stopped worker: {worker_id}") + self.refresh_workers() + except Exception as e: + logger.error(f"[workers-modal] Error stopping worker: {e}") + + elif button_id == "running-stop-all-btn": + # Stop all running workers + if self.app_instance and hasattr(self.app_instance, 'stop_all_workers'): + self.app_instance.stop_all_workers() + logger.info("[workers-modal] Stopped all workers") + self.refresh_workers() + + elif button_id == "finished-refresh-btn": + self.refresh_workers() + + elif button_id == "finished-clear-btn": + # Clear selected finished worker + if self.finished_table and self.finished_table.row_count > 0: + try: + selected_row = self.finished_table.cursor_row + if 0 <= selected_row < len(self.finished_workers): + worker = self.finished_workers[selected_row] + worker_id = worker.get('id') + if self.app_instance and hasattr(self.app_instance, 'clear_finished_worker'): + self.app_instance.clear_finished_worker(worker_id) + logger.info(f"[workers-modal] Cleared worker: {worker_id}") + self.refresh_workers() + except Exception as e: + logger.error(f"[workers-modal] Error clearing worker: {e}") + + elif button_id == "finished-clear-all-btn": + # Clear all finished workers + if self.app_instance and hasattr(self.app_instance, 'clear_all_finished_workers'): + self.app_instance.clear_all_finished_workers() + logger.info("[workers-modal] Cleared all finished workers") + self.refresh_workers() + + elif button_id == "close-btn": + self.dismiss(None) + + except Exception as e: + logger.error(f"[workers-modal] Error in on_button_pressed: {e}") + + def action_cancel(self) -> None: + """Action for Escape key - close modal.""" + self.dismiss(None) diff --git a/TUI/modalscreen/workers.tcss b/TUI/modalscreen/workers.tcss new file mode 100644 index 0000000..fa528a5 --- /dev/null +++ b/TUI/modalscreen/workers.tcss @@ -0,0 +1,119 @@ +/* Workers Modal Stylesheet */ + +Screen { + background: $surface; + color: $text; +} + +#workers-container { + width: 100%; + height: 100%; + layout: vertical; + background: $panel; +} + +#workers-title-bar { + dock: top; + height: 3; + layout: horizontal; + background: $boost; + border: solid $accent; + padding: 0 1; +} + +#workers-title { + width: 1fr; + height: 100%; + content-align-vertical: middle; + color: $text; + text-style: bold; +} + +#toggle-running-btn, +#toggle-finished-btn { + width: auto; + height: 100%; + margin: 0; +} + +#running-section, +#finished-section { + width: 100%; + height: 40%; + layout: vertical; + border: solid $accent; +} + +#running-table, +#finished-table { + width: 100%; + height: 1fr; + border: solid $accent; +} + +#running-controls, +#finished-controls { + 
width: 100%; + height: auto; + min-height: 3; + layout: horizontal; + background: $boost; + padding: 1; + border-top: solid $accent; +} + +#running-controls Button, +#finished-controls Button { + margin-right: 1; + min-width: 15; +} + +#logs-label { + height: 1; + margin: 0 1; + text-style: bold; +} + +#logs-section { + width: 100%; + height: 1fr; + layout: vertical; + border: solid $accent; + background: $panel; +} + +#stdout-display { + width: 100%; + height: 1fr; + border: solid $accent; + margin: 1; +} + +#workers-buttons { + dock: bottom; + height: auto; + min-height: 3; + layout: horizontal; + border: solid $accent; + padding: 1; +} + +#workers-buttons Button { + margin-right: 1; + min-width: 15; +} + +DataTable { + border: solid $accent; +} + +DataTable > .datatable--header { + background: $boost; + color: $text; + text-style: bold; +} + +DataTable > .datatable--cursor { + background: $accent; + color: $panel; +} diff --git a/TUI/pipeline_runner.py b/TUI/pipeline_runner.py new file mode 100644 index 0000000..fb6d264 --- /dev/null +++ b/TUI/pipeline_runner.py @@ -0,0 +1,356 @@ +"""Pipeline execution utilities for the Textual UI. + +This module mirrors the CLI pipeline behaviour while exposing a class-based +interface that the TUI can call. It keeps all pipeline/cmdlet integration in +one place so the interface layer stays focused on presentation. +""" +from __future__ import annotations + +import contextlib +import io +import shlex +import uuid +from dataclasses import dataclass, field +import sys +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Sequence + +BASE_DIR = Path(__file__).resolve().parent +ROOT_DIR = BASE_DIR.parent +for path in (ROOT_DIR, BASE_DIR): + str_path = str(path) + if str_path not in sys.path: + sys.path.insert(0, str_path) + +import pipeline as ctx +from cmdlets import REGISTRY +from config import get_local_storage_path, load_config +from helper.worker_manager import WorkerManager + +try: # Reuse the CLI selection parser instead of reimplementing it. 
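+    # _parse_selection_syntax understands the CLI's @-selection tokens; if this
+    # import fails, _parse_selection() below returns None and @-stages fall back
+    # to passing every upstream row through (see _apply_selection_stage).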
+ from CLI import _parse_selection_syntax +except ImportError: # pragma: no cover - fallback for atypical environments + _parse_selection_syntax = None # type: ignore + + +@dataclass(slots=True) +class PipelineStageResult: + """Summary for a single pipeline stage.""" + + name: str + args: Sequence[str] + emitted: List[Any] = field(default_factory=list) + status: str = "pending" + error: Optional[str] = None + + +@dataclass(slots=True) +class PipelineRunResult: + """Aggregate result for a pipeline run.""" + + pipeline: str + success: bool + stages: List[PipelineStageResult] = field(default_factory=list) + emitted: List[Any] = field(default_factory=list) + stdout: str = "" + stderr: str = "" + error: Optional[str] = None + + def to_summary(self) -> Dict[str, Any]: + """Provide a JSON-friendly representation for logging or UI.""" + return { + "pipeline": self.pipeline, + "success": self.success, + "error": self.error, + "stages": [ + { + "name": stage.name, + "status": stage.status, + "error": stage.error, + "emitted": len(stage.emitted), + } + for stage in self.stages + ], + } + + +class PipelineExecutor: + """Thin wrapper over the cmdlet registry + pipeline context.""" + + def __init__( + self, + *, + config: Optional[Dict[str, Any]] = None, + worker_manager: Optional[WorkerManager] = None, + ) -> None: + self._config = config or load_config() + self._worker_manager = worker_manager + if self._worker_manager is None: + self._worker_manager = self._ensure_worker_manager() + if self._worker_manager: + self._config["_worker_manager"] = self._worker_manager + + @property + def worker_manager(self) -> Optional[WorkerManager]: + return self._worker_manager + + def run_pipeline( + self, + pipeline_text: str, + *, + on_log: Optional[Callable[[str], None]] = None, + ) -> PipelineRunResult: + """Execute a pipeline string and return structured results. + + Args: + pipeline_text: Raw pipeline text entered by the user. + on_log: Optional callback that receives human-readable log lines. 
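+
+        Returns:
+            PipelineRunResult describing each stage's status, the items emitted
+            by the final stage, and any captured stdout/stderr.
+
+        Example (illustrative pipeline text)::
+
+            executor = PipelineExecutor()
+            run = executor.run_pipeline('search-file "dune" | add-tag sci-fi')
+            if not run.success:
+                print(run.error)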
+ """ + normalized = pipeline_text.strip() + result = PipelineRunResult(pipeline=normalized, success=False) + if not normalized: + result.error = "Pipeline is empty" + return result + + tokens = self._tokenize(normalized) + stages = self._split_stages(tokens) + if not stages: + result.error = "Pipeline contains no stages" + return result + + ctx.reset() + ctx.set_current_command_text(normalized) + + stdout_buffer = io.StringIO() + stderr_buffer = io.StringIO() + piped_result: Any = None + worker_session = self._start_worker_session(normalized) + + try: + with contextlib.redirect_stdout(stdout_buffer), contextlib.redirect_stderr( + stderr_buffer + ): + for index, stage_tokens in enumerate(stages): + stage = self._execute_stage( + index=index, + total=len(stages), + stage_tokens=stage_tokens, + piped_input=piped_result, + on_log=on_log, + ) + result.stages.append(stage) + + if stage.status != "completed": + result.error = stage.error or f"Stage {stage.name} failed" + return result + + if index == len(stages) - 1: + result.emitted = stage.emitted + else: + piped_result = stage.emitted + + result.success = True + return result + finally: + result.stdout = stdout_buffer.getvalue() + result.stderr = stderr_buffer.getvalue() + ctx.clear_current_command_text() + if worker_session is not None: + status = "completed" if result.success else "error" + worker_session.finish(status=status, message=result.error or "") + + # ------------------------------------------------------------------ + # Stage execution helpers + # ------------------------------------------------------------------ + def _execute_stage( + self, + *, + index: int, + total: int, + stage_tokens: Sequence[str], + piped_input: Any, + on_log: Optional[Callable[[str], None]], + ) -> PipelineStageResult: + if not stage_tokens: + return PipelineStageResult(name="(empty)", args=[], status="skipped") + + cmd_name = stage_tokens[0].replace("_", "-").lower() + stage_args = stage_tokens[1:] + stage = PipelineStageResult(name=cmd_name, args=stage_args) + + if cmd_name.startswith("@"): + return self._apply_selection_stage( + token=cmd_name, + stage=stage, + piped_input=piped_input, + on_log=on_log, + ) + + cmd_fn = REGISTRY.get(cmd_name) + if not cmd_fn: + stage.status = "failed" + stage.error = f"Unknown command: {cmd_name}" + return stage + + pipeline_ctx = ctx.PipelineStageContext(stage_index=index, total_stages=total) + ctx.set_stage_context(pipeline_ctx) + ctx.set_active(True) + ctx.set_last_stage(index == total - 1) + + try: + return_code = cmd_fn(piped_input, list(stage_args), self._config) + except Exception as exc: # pragma: no cover - surfaced in UI + stage.status = "failed" + stage.error = f"{type(exc).__name__}: {exc}" + if on_log: + on_log(stage.error) + return stage + finally: + ctx.set_stage_context(None) + ctx.set_active(False) + + emitted = list(getattr(pipeline_ctx, "emits", []) or []) + stage.emitted = emitted + + if return_code != 0: + stage.status = "failed" + stage.error = f"Exit code {return_code}" + else: + stage.status = "completed" + stage.error = None + + worker_id = self._current_worker_id() + if self._worker_manager and worker_id: + label = f"[Stage {index + 1}/{total}] {cmd_name} {stage.status}" + self._worker_manager.log_step(worker_id, label) + + ctx.set_last_result_table(None, emitted) + ctx.set_last_items(emitted) + return stage + + def _apply_selection_stage( + self, + *, + token: str, + stage: PipelineStageResult, + piped_input: Any, + on_log: Optional[Callable[[str], None]], + ) -> PipelineStageResult: + 
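+        """Filter the previous stage's output using an @-selection token.
+
+        Selection indices are treated as 1-based; a token whose selection
+        cannot be parsed passes every upstream item through unchanged.
+        """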
selection = self._parse_selection(token) + items = piped_input or [] + if not isinstance(items, list): + items = list(items if isinstance(items, Sequence) else [items]) + + if not items: + stage.status = "failed" + stage.error = "Selection requested but there is no upstream data" + return stage + + if selection is None: + stage.emitted = list(items) + else: + zero_based = sorted(i - 1 for i in selection if i > 0) + stage.emitted = [items[i] for i in zero_based if 0 <= i < len(items)] + + if not stage.emitted: + stage.status = "failed" + stage.error = "Selection matched no rows" + return stage + + ctx.set_last_items(stage.emitted) + ctx.set_last_result_table(None, stage.emitted) + stage.status = "completed" + if on_log: + on_log(f"Selected {len(stage.emitted)} item(s) via {token}") + return stage + + # ------------------------------------------------------------------ + # Worker/session helpers + # ------------------------------------------------------------------ + def _start_worker_session(self, pipeline_text: str) -> Optional[_WorkerSession]: + manager = self._ensure_worker_manager() + if manager is None: + return None + + worker_id = f"tui_pipeline_{uuid.uuid4().hex[:8]}" + tracked = manager.track_worker( + worker_id, + worker_type="pipeline", + title="Pipeline run", + description=pipeline_text, + pipe=pipeline_text, + ) + if not tracked: + return None + + manager.log_step(worker_id, "Pipeline started") + self._config["_current_worker_id"] = worker_id + return _WorkerSession(manager=manager, worker_id=worker_id, config=self._config) + + def _ensure_worker_manager(self) -> Optional[WorkerManager]: + if self._worker_manager: + return self._worker_manager + library_root = get_local_storage_path(self._config) + if not library_root: + return None + try: + self._worker_manager = WorkerManager(Path(library_root), auto_refresh_interval=0) + self._config["_worker_manager"] = self._worker_manager + except Exception: + self._worker_manager = None + return self._worker_manager + + def _current_worker_id(self) -> Optional[str]: + worker_id = self._config.get("_current_worker_id") + return str(worker_id) if worker_id else None + + # ------------------------------------------------------------------ + # Parsing helpers + # ------------------------------------------------------------------ + @staticmethod + def _tokenize(pipeline_text: str) -> List[str]: + try: + return shlex.split(pipeline_text) + except ValueError: + return pipeline_text.split() + + @staticmethod + def _split_stages(tokens: Sequence[str]) -> List[List[str]]: + stages: List[List[str]] = [] + current: List[str] = [] + for token in tokens: + if token == "|": + if current: + stages.append(current) + current = [] + else: + current.append(token) + if current: + stages.append(current) + return stages + + @staticmethod + def _parse_selection(token: str) -> Optional[Sequence[int]]: + if _parse_selection_syntax: + parsed = _parse_selection_syntax(token) + if parsed: + return sorted(parsed) + return None + + +class _WorkerSession: + """Minimal worker session wrapper for the TUI executor.""" + + def __init__(self, *, manager: WorkerManager, worker_id: str, config: Optional[Dict[str, Any]] = None) -> None: + self._manager = manager + self.worker_id = worker_id + self._config = config + + def finish(self, *, status: str, message: str) -> None: + try: + self._manager.finish_worker(self.worker_id, result=status, error_msg=message) + self._manager.log_step(self.worker_id, f"Pipeline {status}") + except Exception: + pass + if self._config and 
self._config.get("_current_worker_id") == self.worker_id: + self._config.pop("_current_worker_id", None) diff --git a/TUI/tui.py b/TUI/tui.py new file mode 100644 index 0000000..0f634bd --- /dev/null +++ b/TUI/tui.py @@ -0,0 +1,332 @@ +"""Modern Textual UI for driving Medeia-Macina pipelines.""" +from __future__ import annotations + +import sys +from pathlib import Path +from typing import Any, Dict, List, Optional, Sequence + +from textual import work +from textual.app import App, ComposeResult +from textual.binding import Binding +from textual.containers import Container, Horizontal, Vertical, VerticalScroll +from textual.widgets import ( + Button, + DataTable, + Footer, + Header, + Input, + ListItem, + ListView, + Static, + TextArea, + Tree, +) + +BASE_DIR = Path(__file__).resolve().parent +ROOT_DIR = BASE_DIR.parent +for path in (BASE_DIR, ROOT_DIR): + str_path = str(path) + if str_path not in sys.path: + sys.path.insert(0, str_path) + +from menu_actions import ( # type: ignore # noqa: E402 + PIPELINE_PRESETS, + PipelinePreset, + build_metadata_snapshot, + summarize_result, +) +from pipeline_runner import PipelineExecutor, PipelineRunResult # type: ignore # noqa: E402 + + +class PresetListItem(ListItem): + """List entry that stores its pipeline preset.""" + + def __init__(self, preset: PipelinePreset) -> None: + super().__init__( + Static( + f"[b]{preset.label}[/b]\n[pale_green4]{preset.description}[/pale_green4]", + classes="preset-entry", + ) + ) + self.preset = preset + + +class PipelineHubApp(App): + """Textual front-end that executes cmdlet pipelines inline.""" + + CSS_PATH = "tui.tcss" + BINDINGS = [ + Binding("ctrl+enter", "run_pipeline", "Run Pipeline"), + Binding("f5", "refresh_workers", "Refresh Workers"), + Binding("ctrl+l", "focus_command", "Focus Input", show=False), + ] + + def __init__(self) -> None: + super().__init__() + self.executor = PipelineExecutor() + self.result_items: List[Any] = [] + self.log_lines: List[str] = [] + self.command_input: Optional[Input] = None + self.log_output: Optional[TextArea] = None + self.results_table: Optional[DataTable] = None + self.metadata_tree: Optional[Tree] = None + self.worker_table: Optional[DataTable] = None + self.preset_list: Optional[ListView] = None + self.status_panel: Optional[Static] = None + self._pipeline_running = False + + # ------------------------------------------------------------------ + # Layout + # ------------------------------------------------------------------ + def compose(self) -> ComposeResult: # noqa: D401 - Textual compose hook + yield Header(show_clock=True) + with Container(id="app-shell"): + with Horizontal(id="command-row"): + self.command_input = Input( + placeholder='download-data "" | merge-file | add-tag | add-file -storage local', + id="pipeline-input", + ) + yield self.command_input + yield Button("Run", id="run-button", variant="primary") + self.status_panel = Static("Idle", id="status-panel") + yield self.status_panel + with Horizontal(id="content-row"): + with VerticalScroll(id="left-pane"): + yield Static("Pipeline Presets", classes="section-title") + self.preset_list = ListView( + *(PresetListItem(preset) for preset in PIPELINE_PRESETS), + id="preset-list", + ) + yield self.preset_list + yield Static("Logs", classes="section-title") + self.log_output = TextArea(id="log-output", read_only=True) + yield self.log_output + yield Static("Workers", classes="section-title") + self.worker_table = DataTable(id="workers-table") + yield self.worker_table + with Vertical(id="right-pane"): + yield 
Static("Results", classes="section-title") + self.results_table = DataTable(id="results-table") + yield self.results_table + yield Static("Metadata", classes="section-title") + self.metadata_tree = Tree("Run a pipeline", id="metadata-tree") + yield self.metadata_tree + yield Footer() + + def on_mount(self) -> None: + if self.results_table: + self.results_table.add_columns("Row", "Title", "Source", "File") + if self.worker_table: + self.worker_table.add_columns("ID", "Type", "Status", "Details") + if self.executor.worker_manager: + self.set_interval(2.0, self.refresh_workers) + self.refresh_workers() + if self.command_input: + self.command_input.focus() + + # ------------------------------------------------------------------ + # Actions + # ------------------------------------------------------------------ + def action_focus_command(self) -> None: + if self.command_input: + self.command_input.focus() + + def action_run_pipeline(self) -> None: + if self._pipeline_running: + self.notify("Pipeline already running", severity="warning", timeout=3) + return + if not self.command_input: + return + pipeline_text = self.command_input.value.strip() + if not pipeline_text: + self.notify("Enter a pipeline to run", severity="warning", timeout=3) + return + + self._pipeline_running = True + self._set_status("Running…", level="info") + self._clear_log() + self._append_log_line(f"$ {pipeline_text}") + self._clear_results() + self._run_pipeline_background(pipeline_text) + + def action_refresh_workers(self) -> None: + self.refresh_workers() + + # ------------------------------------------------------------------ + # Event handlers + # ------------------------------------------------------------------ + def on_button_pressed(self, event: Button.Pressed) -> None: + if event.button.id == "run-button": + self.action_run_pipeline() + + def on_input_submitted(self, event: Input.Submitted) -> None: + if event.input.id == "pipeline-input": + self.action_run_pipeline() + + def on_list_view_selected(self, event: ListView.Selected) -> None: + if isinstance(event.item, PresetListItem) and self.command_input: + self.command_input.value = event.item.preset.pipeline + self.notify(f"Loaded preset: {event.item.preset.label}", timeout=2) + event.stop() + + def on_data_table_row_highlighted(self, event: DataTable.RowHighlighted) -> None: + if not self.results_table or event.control is not self.results_table: + return + index = event.cursor_row + if 0 <= index < len(self.result_items): + self._display_metadata(self.result_items[index]) + + # ------------------------------------------------------------------ + # Pipeline execution helpers + # ------------------------------------------------------------------ + @work(exclusive=True, thread=True) + def _run_pipeline_background(self, pipeline_text: str) -> None: + run_result = self.executor.run_pipeline(pipeline_text, on_log=self._log_from_worker) + self.call_from_thread(self._on_pipeline_finished, run_result) + + def _on_pipeline_finished(self, run_result: PipelineRunResult) -> None: + self._pipeline_running = False + status_level = "success" if run_result.success else "error" + status_text = "Completed" if run_result.success else "Failed" + self._set_status(status_text, level=status_level) + + if not run_result.success: + self.notify(run_result.error or "Pipeline failed", severity="error", timeout=6) + else: + self.notify("Pipeline completed", timeout=3) + + if run_result.stdout.strip(): + self._append_log_line("stdout:") + self._append_block(run_result.stdout) + if 
run_result.stderr.strip(): + self._append_log_line("stderr:") + self._append_block(run_result.stderr) + + for stage in run_result.stages: + summary = f"[{stage.status}] {stage.name} -> {len(stage.emitted)} item(s)" + if stage.error: + summary += f" ({stage.error})" + self._append_log_line(summary) + + emitted = run_result.emitted + if isinstance(emitted, list): + self.result_items = emitted + elif emitted: + self.result_items = [emitted] + else: + self.result_items = [] + + self._populate_results_table() + self.refresh_workers() + + def _log_from_worker(self, message: str) -> None: + self.call_from_thread(self._append_log_line, message) + + # ------------------------------------------------------------------ + # UI helpers + # ------------------------------------------------------------------ + def _populate_results_table(self) -> None: + if not self.results_table: + return + self.results_table.clear() + if not self.result_items: + self.results_table.add_row("—", "No results", "", "") + return + for idx, item in enumerate(self.result_items, start=1): + if isinstance(item, dict): + title = summarize_result(item) + source = item.get("source") or item.get("cmdlet_name") or item.get("cmdlet") or "—" + file_path = item.get("file_path") or item.get("path") or "—" + else: + title = str(item) + source = "—" + file_path = "—" + self.results_table.add_row(str(idx), title, source, file_path, key=str(idx - 1)) + + def _display_metadata(self, item: Any) -> None: + if not self.metadata_tree: + return + root = self.metadata_tree.root + root.label = "Metadata" + root.remove_children() + + payload: Dict[str, Any] + if isinstance(item, dict): + file_path = item.get("file_path") or item.get("path") + if file_path: + payload = build_metadata_snapshot(Path(file_path)) + else: + payload = item + else: + payload = {"value": str(item)} + + self._populate_tree_node(root, payload) + root.expand_all() + + def _populate_tree_node(self, node, data: Any) -> None: + if isinstance(data, dict): + for key, value in data.items(): + child = node.add(f"[b]{key}[/b]") + self._populate_tree_node(child, value) + elif isinstance(data, Sequence) and not isinstance(data, (str, bytes)): + for idx, value in enumerate(data): + child = node.add(f"[{idx}]") + self._populate_tree_node(child, value) + else: + node.add(str(data)) + + def _clear_log(self) -> None: + self.log_lines = [] + if self.log_output: + self.log_output.value = "" + + def _append_log_line(self, line: str) -> None: + self.log_lines.append(line) + if len(self.log_lines) > 500: + self.log_lines = self.log_lines[-500:] + if self.log_output: + self.log_output.value = "\n".join(self.log_lines) + + def _append_block(self, text: str) -> None: + for line in text.strip().splitlines(): + self._append_log_line(f" {line}") + + def _clear_results(self) -> None: + self.result_items = [] + if self.results_table: + self.results_table.clear() + if self.metadata_tree: + self.metadata_tree.root.label = "Awaiting results" + self.metadata_tree.root.remove_children() + + def _set_status(self, message: str, *, level: str = "info") -> None: + if not self.status_panel: + return + for css in ("status-info", "status-success", "status-error"): + self.status_panel.remove_class(css) + css_class = f"status-{level if level in {'success', 'error'} else 'info'}" + self.status_panel.add_class(css_class) + self.status_panel.update(message) + + def refresh_workers(self) -> None: + if not self.worker_table: + return + manager = self.executor.worker_manager + self.worker_table.clear() + if manager is None: + 
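+            # manager is None when PipelineExecutor could not create a WorkerManager
+            # (typically because no local library root is configured).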
self.worker_table.add_row("—", "—", "—", "Worker manager unavailable") + return + workers = manager.get_active_workers() + if not workers: + self.worker_table.add_row("—", "—", "—", "No active workers") + return + for worker in workers: + worker_id = str(worker.get("worker_id") or worker.get("id") or "?")[:8] + worker_type = str(worker.get("worker_type") or worker.get("type") or "?") + status = str(worker.get("status") or worker.get("result") or "running") + details = worker.get("current_step") or worker.get("description") or worker.get("pipe") or "" + self.worker_table.add_row(worker_id, worker_type, status, str(details)[:80]) + + +if __name__ == "__main__": + PipelineHubApp().run() diff --git a/TUI/tui.tcss b/TUI/tui.tcss new file mode 100644 index 0000000..3f2fd19 --- /dev/null +++ b/TUI/tui.tcss @@ -0,0 +1,100 @@ +#app-shell { + width: 100%; + height: 100%; + padding: 1 2; + background: $surface; + layout: vertical; +} + +#command-row { + width: 100%; + height: auto; + background: $boost; + padding: 1; + border: round $primary; +} + +#pipeline-input { + width: 1fr; + min-height: 3; + padding: 0 1; + margin-right: 1; + background: $surface; + color: $text; + border: round $primary; +} + +#pipeline-input:focus { + border: double $primary; + background: $surface; +} + +#status-panel { + min-width: 20; + text-style: bold; + content-align: center middle; + padding: 0 1; + border: solid $panel-darken-1; +} + +#content-row { + width: 100%; + height: 1fr; +} + +#left-pane, +#right-pane { + width: 1fr; + height: 100%; + padding: 1; + background: $panel; + border: round $panel-darken-2; +} + +#left-pane { + max-width: 48; +} + +.section-title { + text-style: bold; + color: $text-muted; + margin-top: 1; +} + +.preset-entry { + padding: 1; + border: tall $panel-darken-1; + margin-bottom: 1; +} + +#log-output { + height: 16; +} + +#workers-table { + height: auto; +} + +#results-table { + height: 1fr; +} + +#metadata-tree { + height: 1fr; + border: round $panel-darken-1; +} + +.status-info { + background: $boost; + color: $text; +} + +.status-success { + background: $success 20%; + color: $success; +} + +.status-error { + background: $error 20%; + color: $error; +} \ No newline at end of file diff --git a/cmdlets/__init__.py b/cmdlets/__init__.py new file mode 100644 index 0000000..5912b6a --- /dev/null +++ b/cmdlets/__init__.py @@ -0,0 +1,139 @@ +from __future__ import annotations + +from typing import Any, Callable, Dict, Iterable, Sequence +from importlib import import_module as _import_module + +# A cmdlet is a callable taking (result, args, config) -> int +Cmdlet = Callable[[Any, Sequence[str], Dict[str, Any]], int] + +# Registry of command-name -> cmdlet function +REGISTRY: Dict[str, Cmdlet] = {} + + +def register(names: Iterable[str]): + """Decorator to register a function under one or more command names. + + Usage: + @register(["add-tag", "add-tags"]) + def _run(result, args, config) -> int: ... + """ + def _wrap(fn: Cmdlet) -> Cmdlet: + for name in names: + REGISTRY[name.replace('_', '-').lower()] = fn + return fn + return _wrap + + +class AutoRegister: + """Decorator that automatically registers a cmdlet function using CMDLET.aliases. + + Usage: + CMDLET = Cmdlet( + name="delete-file", + aliases=["del", "del-file"], + ... + ) + + @AutoRegister(CMDLET) + def _run(result, args, config) -> int: + ... 
+ + Registers the cmdlet under: + - Its main name from CMDLET.name + - All aliases from CMDLET.aliases + + This allows the help display to show: "cmd: delete-file | alias: del, del-file" + """ + def __init__(self, cmdlet): + self.cmdlet = cmdlet + + def __call__(self, fn: Cmdlet) -> Cmdlet: + """Register fn for the main name and all aliases in cmdlet.""" + normalized_name = None + + # Register for main name first + if hasattr(self.cmdlet, 'name') and self.cmdlet.name: + normalized_name = self.cmdlet.name.replace('_', '-').lower() + REGISTRY[normalized_name] = fn + + # Register for all aliases + if hasattr(self.cmdlet, 'aliases') and self.cmdlet.aliases: + for alias in self.cmdlet.aliases: + normalized_alias = alias.replace('_', '-').lower() + # Always register (aliases are separate from main name) + REGISTRY[normalized_alias] = fn + + return fn + + +def get(cmd_name: str) -> Cmdlet | None: + return REGISTRY.get(cmd_name.replace('_', '-').lower()) + + +def format_cmd_help(cmdlet) -> str: + """Format a cmdlet for help display showing cmd:name and aliases. + + Example output: "delete-file | aliases: del, del-file" + """ + if not hasattr(cmdlet, 'name'): + return str(cmdlet) + + cmd_str = f"cmd: {cmdlet.name}" + + if hasattr(cmdlet, 'aliases') and cmdlet.aliases: + aliases_str = ", ".join(cmdlet.aliases) + cmd_str += f" | aliases: {aliases_str}" + + return cmd_str + + +# Dynamically import all cmdlet modules in this directory (ignore files starting with _ and __init__.py) +import os +cmdlet_dir = os.path.dirname(__file__) +for filename in os.listdir(cmdlet_dir): + if ( + filename.endswith(".py") + and not filename.startswith("_") + and filename != "__init__.py" + ): + mod_name = filename[:-3] + try: + module = _import_module(f".{mod_name}", __name__) + + # Auto-register based on CMDLET object with exec function + # This allows cmdlets to be fully self-contained in the CMDLET object + if hasattr(module, 'CMDLET'): + cmdlet_obj = module.CMDLET + + # Get the execution function from the CMDLET object + run_fn = getattr(cmdlet_obj, 'exec', None) if hasattr(cmdlet_obj, 'exec') else None + + if callable(run_fn): + # Register main name + if hasattr(cmdlet_obj, 'name') and cmdlet_obj.name: + normalized_name = cmdlet_obj.name.replace('_', '-').lower() + REGISTRY[normalized_name] = run_fn + + # Register all aliases + if hasattr(cmdlet_obj, 'aliases') and cmdlet_obj.aliases: + for alias in cmdlet_obj.aliases: + normalized_alias = alias.replace('_', '-').lower() + REGISTRY[normalized_alias] = run_fn + except Exception: + continue + +# Import root-level modules that also register cmdlets +# Note: search_libgen, search_soulseek, and search_debrid are now consolidated into search_provider.py +# Use search-file -provider libgen, -provider soulseek, or -provider debrid instead +for _root_mod in ("select_cmdlet",): + try: + _import_module(_root_mod) + except Exception: + # Allow missing optional modules + continue + +# Also import helper modules that register cmdlets +try: + import helper.alldebrid as _alldebrid +except Exception: + pass diff --git a/cmdlets/_shared.py b/cmdlets/_shared.py new file mode 100644 index 0000000..4e6fa83 --- /dev/null +++ b/cmdlets/_shared.py @@ -0,0 +1,1229 @@ +"""Shared utilities for cmdlets and funacts. + +This module provides common utility functions for working with hashes, tags, +relationship data, and other frequently-needed operations. 
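+
+It also defines the CmdletArg, SharedArgs, and Cmdlet building blocks that
+individual cmdlet modules use to describe their arguments and metadata.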
+""" + +from __future__ import annotations + +import json +import sys +import inspect +from collections.abc import Iterable as IterableABC + +from helper.logger import log +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional, Sequence, Set +from dataclasses import dataclass, field +import models + + +@dataclass +class CmdletArg: + """Represents a single cmdlet argument with optional enum choices.""" + name: str + """Argument name, e.g., '-path' or 'location'""" + type: str = "string" + """Argument type: 'string', 'int', 'flag', 'enum', etc.""" + required: bool = False + """Whether this argument is required""" + description: str = "" + """Human-readable description of the argument""" + choices: List[str] = field(default_factory=list) + """Optional list of valid choices for enum/autocomplete, e.g., ['hydrus', 'local', '0x0.st']""" + alias: str = "" + """Optional alias for the argument name, e.g., 'loc' for 'location'""" + handler: Optional[Any] = None + """Optional handler function/callable for processing this argument's value""" + variadic: bool = False + """Whether this argument accepts multiple values (consumes remaining positional args)""" + + def to_dict(self) -> Dict[str, Any]: + """Convert to dict for backward compatibility.""" + d = { + "name": self.name, + "type": self.type, + "required": self.required, + "description": self.description, + "variadic": self.variadic, + } + if self.choices: + d["choices"] = self.choices + if self.alias: + d["alias"] = self.alias + return d + + def resolve(self, value: Any) -> Any: + """Resolve/process the argument value using the handler if available. + + Args: + value: The raw argument value to process + + Returns: + Processed value from handler, or original value if no handler + + Example: + # For STORAGE arg with a handler + storage_path = SharedArgs.STORAGE.resolve('local') # Returns Path.home() / "Videos" + """ + if self.handler is not None and callable(self.handler): + return self.handler(value) + return value + + def to_flags(self) -> tuple[str, ...]: + """Generate all flag variants (short and long form) for this argument. 
+ + Returns a tuple of all valid flag forms for this argument, including: + - Long form with double dash: --name + - Single dash multi-char form: -name (for convenience) + - Short form with single dash: -alias (if alias exists) + + For flags, also generates negation forms: + - --no-name, -name (negation of multi-char form) + - --no-name, -nalias (negation with alias) + + Returns: + Tuple of flag strings, e.g., ('--archive', '-archive', '-arch') + or for flags: ('--archive', '-archive', '-arch', '--no-archive', '-narch') + + Example: + archive_flags = SharedArgs.ARCHIVE.to_flags() + # Returns: ('--archive', '-archive', '-arch', '--no-archive', '-narch') + + storage_flags = SharedArgs.STORAGE.to_flags() + # Returns: ('--storage', '-storage', '-s') + """ + flags = [f'--{self.name}', f'-{self.name}'] # Both double-dash and single-dash variants + + # Add short form if alias exists + if self.alias: + flags.append(f'-{self.alias}') + + # Add negation forms for flag type + if self.type == 'flag': + flags.append(f'--no-{self.name}') + flags.append(f'-no{self.name}') # Single-dash negation variant + if self.alias: + flags.append(f'-n{self.alias}') + + return tuple(flags) + + +# ============================================================================ +# SHARED ARGUMENTS - Reusable argument definitions across cmdlets +# ============================================================================ + +class SharedArgs: + """Registry of shared CmdletArg definitions used across multiple cmdlets. + + This class provides a centralized location for common arguments so they're + defined once and used consistently everywhere. Reduces duplication and ensures + all cmdlets handle the same arguments identically. + + Example: + CMDLET = Cmdlet( + name="my-cmdlet", + summary="Does something", + usage="my-cmdlet", + args=[ + SharedArgs.HASH, # Use predefined shared arg + SharedArgs.LOCATION, # Use another shared arg + CmdletArg(...), # Mix with custom args + ] + ) + """ + + # File/Hash arguments + HASH = CmdletArg( + "hash", + type="string", + description="Override the Hydrus file hash (SHA256) to target instead of the selected result." + ) + + LOCATION = CmdletArg( + "location", + type="enum", + choices=["hydrus", "0x0", "local"], + required=True, + description="Destination location" + ) + + DELETE_FLAG = CmdletArg( + "delete", + type="flag", + description="Delete the file and its .tags after successful operation." + ) + + # Metadata arguments + ARTIST = CmdletArg( + "artist", + type="string", + description="Filter by artist name (case-insensitive, partial match)." + ) + + ALBUM = CmdletArg( + "album", + type="string", + description="Filter by album name (case-insensitive, partial match)." + ) + + TRACK = CmdletArg( + "track", + type="string", + description="Filter by track title (case-insensitive, partial match)." + ) + + # Library/Search arguments + LIBRARY = CmdletArg( + "library", + type="string", + choices=["hydrus", "local", "soulseek", "libgen", "debrid", "ftp"], + description="Search library or source location." + ) + + TIMEOUT = CmdletArg( + "timeout", + type="integer", + description="Search or operation timeout in seconds." + ) + + LIMIT = CmdletArg( + "limit", + type="integer", + description="Maximum number of results to return." + ) + + # Path/File arguments + PATH = CmdletArg( + "path", + type="string", + description="File or directory path." + ) + + OUTPUT = CmdletArg( + "output", + type="string", + description="Output file path." 
+ ) + + STORAGE = CmdletArg( + "storage", + type="enum", + choices=["hydrus", "local", "debrid", "ftp"], + required=False, + description="Storage location or destination for saving/uploading files.", + alias="s", + handler=lambda val: SharedArgs.resolve_storage(val) if val else None + ) + + # Generic arguments + QUERY = CmdletArg( + "query", + type="string", + description="Search query string." + ) + + REASON = CmdletArg( + "reason", + type="string", + description="Reason or explanation for the operation." + ) + + ARCHIVE = CmdletArg( + "archive", + type="flag", + description="Archive the URL to Wayback Machine, Archive.today, and Archive.ph (requires URL argument in cmdlet).", + alias="arch" + ) + + @staticmethod + def resolve_storage(storage_value: Optional[str], default: Optional[Path] = None) -> Path: + """Resolve a storage location name to a filesystem Path. + + Maps storage identifiers (hydrus, local, debrid, ftp) to their actual + filesystem paths. This is the single source of truth for storage location resolution. + Note: 0x0.st is now accessed via file providers (-provider 0x0), not storage. + + Args: + storage_value: One of 'hydrus', 'local', 'debrid', 'ftp', or None + default: Path to return if storage_value is None (defaults to Videos) + + Returns: + Resolved Path object for the storage location + + Raises: + ValueError: If storage_value is not a recognized storage type + + Example: + # In a cmdlet: + storage_path = SharedArgs.resolve_storage(parsed.storage) + + # With defaults: + path = SharedArgs.resolve_storage(None) # Returns home/Videos + path = SharedArgs.resolve_storage('local') # Returns home/Videos + path = SharedArgs.resolve_storage('hydrus') # Returns home/.hydrus/client_files + """ + storage_map = { + 'local': Path.home() / "Videos", + 'hydrus': Path.home() / ".hydrus" / "client_files", + 'debrid': Path.home() / "Debrid", + 'ftp': Path.home() / "FTP", + } + + if storage_value is None: + return default or (Path.home() / "Videos") + + storage_lower = storage_value.lower() + if storage_lower not in storage_map: + raise ValueError( + f"Unknown storage location '{storage_value}'. " + f"Must be one of: {', '.join(storage_map.keys())}" + ) + + return storage_map[storage_lower] + + @classmethod + def get(cls, name: str) -> Optional[CmdletArg]: + """Get a shared argument by name. + + Args: + name: Uppercase name like 'HASH', 'LOCATION', etc. + + Returns: + CmdletArg if found, None otherwise + + Example: + arg = SharedArgs.get('HASH') # Returns SharedArgs.HASH + """ + try: + return getattr(cls, name.upper()) + except AttributeError: + return None + + +@dataclass +class Cmdlet: + """Represents a cmdlet with metadata and arguments. 
+ + Example: + cmd = Cmdlet( + name="add-file", + summary="Upload a media file", + usage="add-file ", + aliases=["add-file-alias"], + args=[ + CmdletArg("location", required=True, description="Destination location"), + CmdletArg("-delete", type="flag", description="Delete after upload"), + ], + details=[ + "- This is a detail line", + "- Another detail", + ] + ) + + # Access properties + log(cmd.name) # "add-file" + log(cmd.summary) # "Upload a media file" + log(cmd.args[0].name) # "location" + + # Convert to dict for JSON serialization + log(json.dumps(cmd.to_dict())) + """ + name: str + """Cmdlet name, e.g., 'add-file'""" + summary: str + """One-line summary of the cmdlet""" + usage: str + """Usage string, e.g., 'add-file [-delete]'""" + aliases: List[str] = field(default_factory=list) + """List of aliases for this cmdlet, e.g., ['add', 'add-f']""" + args: List[CmdletArg] = field(default_factory=list) + """List of arguments accepted by this cmdlet""" + details: List[str] = field(default_factory=list) + """Detailed explanation lines (for help text)""" + exec: Optional[Any] = field(default=None) + """The execution function: func(result, args, config) -> int""" + + def __post_init__(self) -> None: + """Auto-discover _run function if exec not explicitly provided. + + If exec is None, looks for a _run function in the module where + this Cmdlet was instantiated and uses it automatically. + """ + if self.exec is None: + # Walk up the call stack to find _run in the calling module + frame = inspect.currentframe() + try: + # Walk up frames until we find one with _run in globals + while frame: + if '_run' in frame.f_globals: + self.exec = frame.f_globals['_run'] + break + frame = frame.f_back + finally: + del frame # Avoid reference cycles + + def to_dict(self) -> Dict[str, Any]: + """Convert to dict for backward compatibility with existing code. + + Returns a dict matching the old CMDLET format so existing code + that expects a dict will still work. + """ + # Format command for display: "cmd: name alias: alias1, alias2" + cmd_display = f"cmd: {self.name}" + if self.aliases: + aliases_str = ", ".join(self.aliases) + cmd_display += f" alias: {aliases_str}" + + return { + "name": self.name, + "summary": self.summary, + "usage": self.usage, + "cmd": cmd_display, # Display-friendly command name with aliases on one line + "aliases": self.aliases, + "args": [arg.to_dict() for arg in self.args], + "details": self.details, + } + + def __getitem__(self, key: str) -> Any: + """Dict-like access for backward compatibility. + + Allows code like: cmdlet["name"] or cmdlet["args"] + """ + d = self.to_dict() + return d.get(key) + + def get(self, key: str, default: Any = None) -> Any: + """Dict-like get() method for backward compatibility.""" + d = self.to_dict() + return d.get(key, default) + + def get_flags(self, arg_name: str) -> set[str]: + """Generate -name and --name flag variants for an argument. + + Args: + arg_name: The argument name (e.g., 'library', 'tag', 'size') + + Returns: + Set containing both single-dash and double-dash variants + (e.g., {'-library', '--library'}) + + Example: + if low in cmdlet.get_flags('library'): + # handle library flag + """ + return {f"-{arg_name}", f"--{arg_name}"} + + def build_flag_registry(self) -> Dict[str, set[str]]: + """Build a registry of all flag variants for this cmdlet's arguments. + + Automatically generates all -name and --name variants for each argument. + Useful for parsing command-line arguments without hardcoding flags. 
+ + Returns: + Dict mapping argument names to their flag sets + (e.g., {'library': {'-library', '--library'}, 'tag': {'-tag', '--tag'}}) + + Example: + flags = cmdlet.build_flag_registry() + + if low in flags.get('library', set()): + # handle library + elif low in flags.get('tag', set()): + # handle tag + """ + return {arg.name: self.get_flags(arg.name) for arg in self.args} + + +# Tag groups cache (loaded from JSON config file) +_TAG_GROUPS_CACHE: Optional[Dict[str, List[str]]] = None +_TAG_GROUPS_MTIME: Optional[float] = None + +# Path to tag groups configuration (set by caller or lazily discovered) +TAG_GROUPS_PATH: Optional[Path] = None + + +def set_tag_groups_path(path: Path) -> None: + """Set the path to the tag groups JSON file.""" + global TAG_GROUPS_PATH + TAG_GROUPS_PATH = path + + +def parse_cmdlet_args(args: Sequence[str], cmdlet_spec: Dict[str, Any] | Cmdlet) -> Dict[str, Any]: + """Parse command-line arguments based on cmdlet specification. + + Extracts argument values from command-line tokens using the argument names + and types defined in the cmdlet metadata. Automatically supports single-dash + and double-dash variants of flag names. Arguments without dashes in definition + are treated as positional arguments. + + Args: + args: Command-line arguments (e.g., ["-path", "/home/file.txt", "-foo", "bar"]) + cmdlet_spec: Cmdlet metadata dict with "args" key containing list of arg specs, + or a Cmdlet object. Each arg spec should have at least "name" key. + Argument names can be defined with or without prefixes. + + Returns: + Dict mapping canonical arg names to their parsed values. If an arg is not + provided, it will not be in the dict. Lookup will normalize prefixes. + + Example: + cmdlet = { + "args": [ + {"name": "path", "type": "string"}, # Positional - matches bare value or -path/--path + {"name": "count", "type": "int"} # Positional - matches bare value or -count/--count + ] + } + result = parse_cmdlet_args(["value1", "-count", "5"], cmdlet) + # result = {"path": "value1", "count": "5"} + """ + result: Dict[str, Any] = {} + + # Handle both dict and Cmdlet objects + if isinstance(cmdlet_spec, Cmdlet): + cmdlet_spec = cmdlet_spec.to_dict() + + # Build arg specs tracking which are positional vs flagged + arg_specs: List[Dict[str, Any]] = cmdlet_spec.get("args", []) + positional_args: List[Dict[str, Any]] = [] # args without prefix in definition + flagged_args: List[Dict[str, Any]] = [] # args with prefix in definition + + arg_spec_map: Dict[str, str] = {} # prefix variant -> canonical name (without prefix) + + for spec in arg_specs: + name = spec.get("name") + if not name: + continue + + name_str = str(name) + canonical_name = name_str.lstrip("-") + + # Determine if this is positional (no dashes in original definition) + if "-" not in name_str: + positional_args.append(spec) + else: + flagged_args.append(spec) + + # Register all prefix variants for flagged lookup + arg_spec_map[canonical_name.lower()] = canonical_name # bare name + arg_spec_map[f"-{canonical_name}".lower()] = canonical_name # single dash + arg_spec_map[f"--{canonical_name}".lower()] = canonical_name # double dash + + # Parse arguments + i = 0 + positional_index = 0 # Track which positional arg we're on + + while i < len(args): + token = str(args[i]) + token_lower = token.lower() + + # Check if this token is a known flagged argument + if token_lower in arg_spec_map: + canonical_name = arg_spec_map[token_lower] + spec = next((s for s in arg_specs if str(s.get("name", "")).lstrip("-").lower() == 
canonical_name.lower()), None) + + # Check if it's a flag type (which doesn't consume next value, just marks presence) + is_flag = spec and spec.get("type") == "flag" + + if is_flag: + # For flags, just mark presence without consuming next token + result[canonical_name] = True + i += 1 + else: + # For non-flags, consume next token as the value + if i + 1 < len(args) and not str(args[i + 1]).startswith("-"): + value = args[i + 1] + + # Check if variadic + is_variadic = spec and spec.get("variadic", False) + if is_variadic: + if canonical_name not in result: + result[canonical_name] = [] + elif not isinstance(result[canonical_name], list): + result[canonical_name] = [result[canonical_name]] + result[canonical_name].append(value) + else: + result[canonical_name] = value + i += 2 + else: + i += 1 + # Otherwise treat as positional if we have positional args remaining + elif positional_index < len(positional_args): + positional_spec = positional_args[positional_index] + canonical_name = str(positional_spec.get("name", "")).lstrip("-") + is_variadic = positional_spec.get("variadic", False) + + if is_variadic: + # For variadic args, append to a list + if canonical_name not in result: + result[canonical_name] = [] + elif not isinstance(result[canonical_name], list): + # Should not happen if logic is correct, but safety check + result[canonical_name] = [result[canonical_name]] + + result[canonical_name].append(token) + # Do not increment positional_index so subsequent tokens also match this arg + # Note: Variadic args should typically be the last positional argument + i += 1 + else: + result[canonical_name] = token + positional_index += 1 + i += 1 + else: + # Unknown token, skip it + i += 1 + + return result + + +def normalize_hash(hash_hex: Optional[str]) -> Optional[str]: + """Normalize a hash string to lowercase, or return None if invalid. + + Args: + hash_hex: String that should be a hex hash + + Returns: + Lowercase hash string, or None if input is not a string or is empty + """ + if not isinstance(hash_hex, str): + return None + text = hash_hex.strip() + return text.lower() if text else None + + +def looks_like_hash(candidate: Optional[str]) -> bool: + """Check if a string looks like a SHA256 hash (64 hex chars). + + Args: + candidate: String to test + + Returns: + True if the string is 64 lowercase hex characters + """ + if not isinstance(candidate, str): + return False + text = candidate.strip().lower() + return len(text) == 64 and all(ch in "0123456789abcdef" for ch in text) + + +def pipeline_item_local_path(item: Any) -> Optional[str]: + """Extract local file path from a pipeline item. + + Supports both dataclass objects with .target attribute and dicts. + Returns None for HTTP/HTTPS URLs. + + Args: + item: Pipeline item (PipelineItem dataclass, dict, or other) + + Returns: + Local file path string, or None if item is not a local file + """ + target: Optional[str] = None + if hasattr(item, "target"): + target = getattr(item, "target", None) + elif isinstance(item, dict): + raw = item.get("target") or item.get("path") or item.get("url") + target = str(raw) if raw is not None else None + if not isinstance(target, str): + return None + text = target.strip() + if not text: + return None + if text.lower().startswith(("http://", "https://")): + return None + return text + + +def collect_relationship_labels(payload: Any, label_stack: List[str] | None = None, mapping: Dict[str, str] | None = None) -> Dict[str, str]: + """Recursively extract hash-to-label mappings from nested relationship data. 
+ + Walks through nested dicts/lists looking for sha256-like strings (64 hex chars) + and builds a mapping from hash to its path in the structure. + + Example: + data = { + "duplicates": [ + "abc123...", # Will be mapped to "duplicates" + {"type": "related", "items": ["def456..."]} # Will be mapped to "duplicates / type / items" + ] + } + result = collect_relationship_labels(data) + # result = {"abc123...": "duplicates", "def456...": "duplicates / type / items"} + + Args: + payload: Nested data structure (dict, list, string, etc.) + label_stack: Internal use - tracks path during recursion + mapping: Internal use - accumulates hash->label mappings + + Returns: + Dict mapping hash strings to their path labels + """ + if label_stack is None: + label_stack = [] + if mapping is None: + mapping = {} + + if isinstance(payload, dict): + for key, value in payload.items(): + next_stack = label_stack + if isinstance(key, str) and key: + formatted = key.replace('_', ' ').strip() + next_stack = label_stack + [formatted] + collect_relationship_labels(value, next_stack, mapping) + elif isinstance(payload, (list, tuple, set)): + for value in payload: + collect_relationship_labels(value, label_stack, mapping) + elif isinstance(payload, str) and looks_like_hash(payload): + hash_value = payload.lower() + if label_stack: + label = " / ".join(item for item in label_stack if item) + else: + label = "related" + mapping.setdefault(hash_value, label) + + return mapping + + +def parse_tag_arguments(arguments: Sequence[str]) -> List[str]: + """Parse tag arguments from command line tokens. + + Handles both space-separated and comma-separated tags. + Example: parse_tag_arguments(["tag1,tag2", "tag3"]) -> ["tag1", "tag2", "tag3"] + + Args: + arguments: Sequence of argument strings + + Returns: + List of normalized tag strings (empty strings filtered out) + """ + tags: List[str] = [] + for argument in arguments: + for token in argument.split(','): + text = token.strip() + if text: + tags.append(text) + return tags + + +def fmt_bytes(n: Optional[int]) -> str: + """Format bytes as human-readable with 1 decimal place (MB/GB). 
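+
+    Example:
+        fmt_bytes(1572864)     # -> "1.5 MB"
+        fmt_bytes(3221225472)  # -> "3.0 GB"
+        fmt_bytes(None)        # -> "unknown"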
+ + Args: + n: Number of bytes, or None + + Returns: + Formatted string like "1.5 MB" or "2.0 GB", or "unknown" + """ + if n is None or n < 0: + return "unknown" + gb = n / (1024.0 * 1024.0 * 1024.0) + if gb >= 1.0: + return f"{gb:.1f} GB" + mb = n / (1024.0 * 1024.0) + return f"{mb:.1f} MB" + + +def _normalise_tag_group_entry(value: Any) -> Optional[str]: + """Internal: Normalize a single tag group entry.""" + if not isinstance(value, str): + value = str(value) + text = value.strip() + return text or None + + +def _load_tag_groups() -> Dict[str, List[str]]: + """Load tag group definitions from JSON file with caching.""" + global _TAG_GROUPS_CACHE, _TAG_GROUPS_MTIME, TAG_GROUPS_PATH + + # Auto-discover adjective.json if not set + if TAG_GROUPS_PATH is None: + # Try to find adjective.json in the script directory or helper subdirectory + try: + script_dir = Path(__file__).parent.parent + + # Check root directory + candidate = script_dir / "adjective.json" + if candidate.exists(): + TAG_GROUPS_PATH = candidate + else: + # Check helper directory + candidate = script_dir / "helper" / "adjective.json" + if candidate.exists(): + TAG_GROUPS_PATH = candidate + except Exception: + pass + + if TAG_GROUPS_PATH is None: + return {} + + path = TAG_GROUPS_PATH + try: + stat_result = path.stat() + except FileNotFoundError: + _TAG_GROUPS_CACHE = {} + _TAG_GROUPS_MTIME = None + return {} + except OSError as exc: + log(f"Failed to read tag groups: {exc}", file=sys.stderr) + _TAG_GROUPS_CACHE = {} + _TAG_GROUPS_MTIME = None + return {} + + mtime = stat_result.st_mtime + if _TAG_GROUPS_CACHE is not None and _TAG_GROUPS_MTIME == mtime: + return _TAG_GROUPS_CACHE + + try: + payload = json.loads(path.read_text(encoding="utf-8")) + except (OSError, json.JSONDecodeError) as exc: + log(f"Invalid tag group JSON ({path}): {exc}", file=sys.stderr) + _TAG_GROUPS_CACHE = {} + _TAG_GROUPS_MTIME = mtime + return {} + + groups: Dict[str, List[str]] = {} + if isinstance(payload, dict): + for key, value in payload.items(): + if not isinstance(key, str): + continue + name = key.strip().lower() + if not name: + continue + members: List[str] = [] + if isinstance(value, list): + for entry in value: + normalised = _normalise_tag_group_entry(entry) + if normalised: + members.append(normalised) + elif isinstance(value, str): + normalised = _normalise_tag_group_entry(value) + if normalised: + members.extend(token.strip() for token in normalised.split(",") if token.strip()) + if members: + groups[name] = members + + _TAG_GROUPS_CACHE = groups + _TAG_GROUPS_MTIME = mtime + return groups + + +def expand_tag_groups(raw_tags: Iterable[str]) -> List[str]: + """Expand tag group references (e.g., {my_group}) into member tags. + + Tag groups are defined in JSON and can be nested. Groups are referenced + with curly braces: {group_name}. 
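+
+    Example (illustrative; assumes a "music" group is defined in the tag groups JSON):
+        # JSON: {"music": ["genre:rock", "genre:jazz"]}
+        expand_tag_groups(["{music}", "artist:foo"])
+        # -> ["genre:rock", "genre:jazz", "artist:foo"]
+        # Unknown group references are kept verbatim and a warning is logged.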
+ + Args: + raw_tags: Sequence of tag strings, some may reference groups like "{group_name}" + + Returns: + List of expanded tags with group references replaced + """ + groups = _load_tag_groups() + if not groups: + return [tag for tag in raw_tags if isinstance(tag, str) and tag.strip()] + + def _expand(tokens: Iterable[str], seen: Set[str]) -> List[str]: + result: List[str] = [] + for token in tokens: + if not isinstance(token, str): + continue + candidate = token.strip() + if not candidate: + continue + if candidate.startswith("{") and candidate.endswith("}") and len(candidate) > 2: + name = candidate[1:-1].strip().lower() + if not name: + continue + if name in seen: + log(f"Tag group recursion detected for {{{name}}}; skipping", file=sys.stderr) + continue + members = groups.get(name) + if not members: + log(f"Unknown tag group {{{name}}}", file=sys.stderr) + result.append(candidate) + continue + result.extend(_expand(members, seen | {name})) + else: + result.append(candidate) + return result + + return _expand(raw_tags, set()) + + +def first_title_tag(source: Optional[Iterable[str]]) -> Optional[str]: + """Find the first tag starting with "title:" in a collection. + + Args: + source: Iterable of tag strings + + Returns: + First title: tag found, or None + """ + if not source: + return None + for item in source: + if not isinstance(item, str): + continue + candidate = item.strip() + if candidate and candidate.lower().startswith("title:"): + return candidate + return None + + +def apply_preferred_title(tags: List[str], preferred: Optional[str]) -> List[str]: + """Replace any title: tags with a preferred title tag. + + Args: + tags: List of tags (may contain multiple "title:" entries) + preferred: Preferred title tag to use (full "title: ..." format) + + Returns: + List with old title tags removed and preferred title added (at most once) + """ + if not preferred: + return tags + preferred_clean = preferred.strip() + if not preferred_clean: + return tags + preferred_lower = preferred_clean.lower() + filtered: List[str] = [] + has_preferred = False + for tag in tags: + candidate = tag.strip() + if not candidate: + continue + if candidate.lower().startswith("title:"): + if candidate.lower() == preferred_lower: + if not has_preferred: + filtered.append(candidate) + has_preferred = True + continue + filtered.append(candidate) + if not has_preferred: + filtered.append(preferred_clean) + return filtered + + +# ============================================================================ +# PIPEOBJECT UTILITIES (for chainable cmdlets and multi-action pipelines) +# ============================================================================ + +def create_pipe_object_result( + source: str, + identifier: str, + file_path: str, + cmdlet_name: str, + title: Optional[str] = None, + file_hash: Optional[str] = None, + is_temp: bool = False, + parent_hash: Optional[str] = None, + tags: Optional[List[str]] = None, + **extra: Any +) -> Dict[str, Any]: + """Create a PipeObject-compatible result dict for pipeline chaining. + + This is a helper to emit results in the standard format that downstream + cmdlets can process (filter, tag, cleanup, etc.). 
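+
+    Example (illustrative values):
+        create_pipe_object_result(
+            source='download',
+            identifier='abc123',
+            file_path='/tmp/clip.mp4',
+            cmdlet_name='download-data',
+            title='Clip',
+            is_temp=True,
+        )
+        # -> {'source': 'download', 'id': 'abc123', 'file_path': '/tmp/clip.mp4',
+        #     'action': 'cmdlet:download-data', 'title': 'Clip', 'is_temp': True}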
+ + Args: + source: Source system (e.g., 'local', 'hydrus', 'download') + identifier: Unique ID from source + file_path: Path to the file + cmdlet_name: Name of the cmdlet that created this (e.g., 'download-data', 'screen-shot') + title: Human-readable title + file_hash: SHA-256 hash of file (for integrity) + is_temp: If True, this is a temporary/intermediate artifact + parent_hash: Hash of the parent file in the chain (for provenance) + tags: List of tags to apply + **extra: Additional fields + + Returns: + Dict with all PipeObject fields for emission + """ + result = { + 'source': source, + 'id': identifier, + 'file_path': file_path, + 'action': f'cmdlet:{cmdlet_name}', # Format: cmdlet:cmdlet_name + } + + if title: + result['title'] = title + if file_hash: + result['file_hash'] = file_hash + if is_temp: + result['is_temp'] = True + if parent_hash: + result['parent_id'] = parent_hash # parent_id is the parent's file_hash + if tags: + result['tags'] = tags + + # Add any extra fields + result.update(extra) + + return result + + +def mark_as_temp(pipe_object: Dict[str, Any]) -> Dict[str, Any]: + """Mark a PipeObject dict as temporary (intermediate artifact). + + Args: + pipe_object: Result dict from cmdlet emission + + Returns: + Modified dict with is_temp=True + """ + pipe_object['is_temp'] = True + return pipe_object + + +def set_parent_id(pipe_object: Dict[str, Any], parent_hash: str) -> Dict[str, Any]: + """Set the parent_id for provenance tracking. + + Args: + pipe_object: Result dict + parent_hash: Parent file's hash + + Returns: + Modified dict with parent_id set to the hash + """ + pipe_object['parent_id'] = parent_hash + return pipe_object + + +def get_pipe_object_path(pipe_object: Any) -> Optional[str]: + """Extract file path from PipeObject, dict, or pipeline-friendly object.""" + if pipe_object is None: + return None + for attr in ('file_path', 'path', 'target'): + if hasattr(pipe_object, attr): + value = getattr(pipe_object, attr) + if value: + return value + if isinstance(pipe_object, dict): + for key in ('file_path', 'path', 'target'): + value = pipe_object.get(key) + if value: + return value + return None + + +def get_pipe_object_hash(pipe_object: Any) -> Optional[str]: + """Extract file hash from PipeObject, dict, or pipeline-friendly object.""" + if pipe_object is None: + return None + for attr in ('file_hash', 'hash_hex', 'hash'): + if hasattr(pipe_object, attr): + value = getattr(pipe_object, attr) + if value: + return value + if isinstance(pipe_object, dict): + for key in ('file_hash', 'hash_hex', 'hash'): + value = pipe_object.get(key) + if value: + return value + return None + + +def normalize_result_input(result: Any) -> List[Dict[str, Any]]: + """Normalize input result to a list of dicts. 
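+
+    Example (p1/p2 are illustrative objects exposing to_dict()):
+        normalize_result_input(None)                # -> []
+        normalize_result_input({"path": "a.mp3"})   # -> [{"path": "a.mp3"}]
+        normalize_result_input([p1, p2])            # -> [p1.to_dict(), p2.to_dict()]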
+ + Handles: + - None -> [] + - Dict -> [dict] + - List of dicts -> list as-is + - PipeObject -> [dict] + - List of PipeObjects -> list of dicts + + Args: + result: Result from piped input + + Returns: + List of result dicts (may be empty) + """ + if result is None: + return [] + + # Single dict + if isinstance(result, dict): + return [result] + + # List - convert each item to dict if needed + if isinstance(result, list): + output = [] + for item in result: + if isinstance(item, dict): + output.append(item) + elif hasattr(item, 'to_dict'): + output.append(item.to_dict()) + else: + # Try as-is + output.append(item) + return output + + # PipeObject or other object with to_dict + if hasattr(result, 'to_dict'): + return [result.to_dict()] + + # Fallback: wrap it + if isinstance(result, dict): + return [result] + + return [] + + +def filter_results_by_temp(results: List[Any], include_temp: bool = False) -> List[Any]: + """Filter results by temporary status. + + Args: + results: List of result dicts or PipeObjects + include_temp: If True, keep temp files; if False, exclude them + + Returns: + Filtered list + """ + if include_temp: + return results + + filtered = [] + for result in results: + is_temp = False + + # Check PipeObject + if hasattr(result, 'is_temp'): + is_temp = result.is_temp + # Check dict + elif isinstance(result, dict): + is_temp = result.get('is_temp', False) + + if not is_temp: + filtered.append(result) + + return filtered + + +def merge_sequences(*sources: Optional[Iterable[Any]], case_sensitive: bool = True) -> list[str]: + """Merge iterable sources while preserving order and removing duplicates.""" + seen: set[str] = set() + merged: list[str] = [] + for source in sources: + if not source: + continue + if isinstance(source, str) or not isinstance(source, IterableABC): + iterable = [source] + else: + iterable = source + for value in iterable: + if value is None: + continue + text = str(value).strip() + if not text: + continue + key = text if case_sensitive else text.lower() + if key in seen: + continue + seen.add(key) + merged.append(text) + return merged + + +def extract_tags_from_result(result: Any) -> list[str]: + tags: list[str] = [] + if isinstance(result, models.PipeObject): + tags.extend(result.tags or []) + tags.extend(result.extra.get('tags', [])) + elif hasattr(result, 'tags'): + # Handle objects with tags attribute (e.g. 
SearchResult) + val = getattr(result, 'tags') + if isinstance(val, (list, set, tuple)): + tags.extend(val) + elif isinstance(val, str): + tags.append(val) + + if isinstance(result, dict): + raw_tags = result.get('tags') + if isinstance(raw_tags, list): + tags.extend(raw_tags) + elif isinstance(raw_tags, str): + tags.append(raw_tags) + extra = result.get('extra') + if isinstance(extra, dict): + extra_tags = extra.get('tags') + if isinstance(extra_tags, list): + tags.extend(extra_tags) + elif isinstance(extra_tags, str): + tags.append(extra_tags) + return merge_sequences(tags, case_sensitive=True) + + +def extract_title_from_result(result: Any) -> Optional[str]: + """Extract the title from a result dict or PipeObject.""" + if isinstance(result, models.PipeObject): + return result.title + elif hasattr(result, 'title'): + return getattr(result, 'title') + elif isinstance(result, dict): + return result.get('title') + return None + + +def extract_known_urls_from_result(result: Any) -> list[str]: + urls: list[str] = [] + + def _extend(candidate: Any) -> None: + if not candidate: + return + if isinstance(candidate, list): + urls.extend(candidate) + elif isinstance(candidate, str): + urls.append(candidate) + + if isinstance(result, models.PipeObject): + _extend(result.extra.get('known_urls')) + if isinstance(result.metadata, dict): + _extend(result.metadata.get('known_urls')) + _extend(result.metadata.get('urls')) + elif hasattr(result, 'known_urls') or hasattr(result, 'urls'): + # Handle objects with known_urls/urls attribute + _extend(getattr(result, 'known_urls', None)) + _extend(getattr(result, 'urls', None)) + + if isinstance(result, dict): + _extend(result.get('known_urls')) + _extend(result.get('urls')) + extra = result.get('extra') + if isinstance(extra, dict): + _extend(extra.get('known_urls')) + _extend(extra.get('urls')) + + return merge_sequences(urls, case_sensitive=True) + + +def extract_relationships(result: Any) -> Optional[Dict[str, Any]]: + if isinstance(result, models.PipeObject): + relationships = result.get_relationships() + return relationships or None + if isinstance(result, dict): + relationships = result.get('relationships') + if isinstance(relationships, dict) and relationships: + return relationships + return None + + +def extract_duration(result: Any) -> Optional[float]: + duration = None + if isinstance(result, models.PipeObject): + duration = result.duration + elif isinstance(result, dict): + duration = result.get('duration') + if duration is None: + metadata = result.get('metadata') + if isinstance(metadata, dict): + duration = metadata.get('duration') + if duration is None: + return None + try: + return float(duration) + except (TypeError, ValueError): + return None diff --git a/cmdlets/add_file.py b/cmdlets/add_file.py new file mode 100644 index 0000000..235d421 --- /dev/null +++ b/cmdlets/add_file.py @@ -0,0 +1,910 @@ +from __future__ import annotations + +from typing import Any, Dict, Optional, Sequence, Iterable, Tuple +from collections.abc import Iterable as IterableABC +import json +from pathlib import Path +import sys + +import models +import pipeline as ctx +from helper import hydrus as hydrus_wrapper +from helper.logger import log, debug +from helper.file_storage import FileStorage +from ._shared import ( + Cmdlet, CmdletArg, parse_cmdlet_args, SharedArgs, create_pipe_object_result, + extract_tags_from_result, extract_title_from_result, extract_known_urls_from_result, + merge_sequences, extract_relationships, extract_duration +) +from helper.local_library 
import read_sidecar, find_sidecar, write_sidecar, LocalLibraryDB +from helper.utils import sha256_file +from metadata import embed_metadata_in_file + +# Use official Hydrus supported filetypes from hydrus_wrapper +SUPPORTED_MEDIA_EXTENSIONS = hydrus_wrapper.ALL_SUPPORTED_EXTENSIONS + +# Initialize file storage system +storage = FileStorage() + + +def _guess_media_kind_from_suffix(media_path: Path) -> str: + suffix = media_path.suffix.lower() + if suffix in {'.mp3', '.flac', '.wav', '.m4a', '.aac', '.ogg', '.opus', '.wma', '.mka'}: + return 'audio' + if suffix in {'.mp4', '.mkv', '.webm', '.mov', '.avi', '.flv', '.mpg', '.mpeg', '.ts', '.m4v', '.wmv'}: + return 'video' + if suffix in {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff'}: + return 'image' + if suffix in {'.pdf', '.epub', '.txt', '.mobi', '.azw3', '.cbz', '.cbr', '.doc', '.docx'}: + return 'document' + return 'other' + + +def _resolve_media_kind(result: Any, media_path: Path) -> str: + if isinstance(result, models.PipeObject): + if getattr(result, 'media_kind', None): + return str(result.media_kind) + elif isinstance(result, dict): + media_kind = result.get('media_kind') + if media_kind: + return str(media_kind) + metadata = result.get('metadata') + if isinstance(metadata, dict) and metadata.get('media_kind'): + return str(metadata['media_kind']) + return _guess_media_kind_from_suffix(media_path) + + +def _load_sidecar_bundle(media_path: Path, origin: Optional[str] = None, config: Optional[dict] = None) -> tuple[Optional[Path], Optional[str], list[str], list[str]]: + # For local origin, try to read from local database first + if origin and origin.lower() == "local" and config: + try: + from helper.local_library import LocalLibraryDB + from config import get_local_storage_path + + try: + db_root = get_local_storage_path(config) + except Exception: + db_root = None + + if db_root: + try: + db = LocalLibraryDB(Path(db_root)) + try: + # Get tags and metadata from database + tags = db.get_tags(media_path) or [] + metadata = db.get_metadata(media_path) or {} + known_urls = metadata.get("known_urls") or [] + file_hash = metadata.get("hash") + + if tags or known_urls or file_hash: + debug(f"Found metadata in local database: {len(tags)} tag(s), {len(known_urls)} URL(s)") + return None, file_hash, tags, known_urls + finally: + db.close() + except Exception as exc: + log(f"⚠️ Could not query local database: {exc}", file=sys.stderr) + except Exception: + pass + + # Fall back to sidecar file lookup + try: + sidecar_path = find_sidecar(media_path) + except Exception: + sidecar_path = None + if not sidecar_path or not sidecar_path.exists(): + return None, None, [], [] + try: + hash_value, tags, known_urls = read_sidecar(sidecar_path) + return sidecar_path, hash_value, tags or [], known_urls or [] + except Exception as exc: + log(f"⚠️ Failed to read sidecar for {media_path.name}: {exc}", file=sys.stderr) + return sidecar_path, None, [], [] + + +def _resolve_file_hash(result: Any, fallback_hash: Optional[str], file_path: Path) -> Optional[str]: + candidate = None + if isinstance(result, models.PipeObject): + candidate = result.file_hash + elif isinstance(result, dict): + candidate = result.get('file_hash') or result.get('hash') + candidate = candidate or fallback_hash + if candidate: + return str(candidate) + try: + return sha256_file(file_path) + except Exception as exc: + log(f"⚠️ Could not compute SHA-256 for {file_path.name}: {exc}", file=sys.stderr) + return None + + +def _cleanup_sidecar_files(media_path: Path, *extra_paths: 
Optional[Path]) -> None: + targets = [ + media_path.parent / (media_path.name + '.metadata'), + media_path.parent / (media_path.name + '.notes'), + media_path.parent / (media_path.name + '.tags'), + media_path.parent / (media_path.name + '.tags.txt'), + ] + targets.extend(extra_paths) + for target in targets: + if not target: + continue + try: + path_obj = Path(target) + if path_obj.exists(): + path_obj.unlink() + except Exception: + continue + + +def _persist_local_metadata( + library_root: Path, + dest_path: Path, + tags: list[str], + known_urls: list[str], + file_hash: Optional[str], + relationships: Optional[Dict[str, Any]], + duration: Optional[float], + media_kind: str, +) -> None: + payload = { + 'hash': file_hash, + 'known_urls': known_urls, + 'relationships': relationships or [], + 'duration': duration, + 'size': None, + 'ext': dest_path.suffix.lower(), + 'media_type': media_kind, + 'media_kind': media_kind, + } + try: + payload['size'] = dest_path.stat().st_size + except OSError: + payload['size'] = None + + try: + debug(f"[_persist_local_metadata] Saving metadata to DB at: {library_root}") + db_path = Path(library_root) / ".downlow_library.db" + debug(f"[_persist_local_metadata] Database file: {db_path}, exists: {db_path.exists()}") + debug(f"[_persist_local_metadata] File: {dest_path}, exists: {dest_path.exists()}, Tags: {len(tags)}, Hash: {file_hash}") + debug(f"[_persist_local_metadata] Absolute dest_path: {dest_path.resolve()}") + + with LocalLibraryDB(library_root) as db: + # Save metadata FIRST to ensure file entry is created in DB + if any(payload.values()): + debug(f"[_persist_local_metadata] Saving metadata payload first") + try: + db.save_metadata(dest_path, payload) + debug(f"[_persist_local_metadata] ✅ Metadata saved") + except Exception as meta_exc: + log(f"[_persist_local_metadata] ❌ Failed to save metadata: {meta_exc}", file=sys.stderr) + raise + + # Save tags to DB synchronously in same transaction + # For local storage, DB is the primary source of truth + if tags: + try: + debug(f"[_persist_local_metadata] Saving {len(tags)} tags to DB") + db.save_tags(dest_path, tags) + debug(f"[_persist_local_metadata] ✅ Tags saved to DB") + except Exception as tag_exc: + log(f"[_persist_local_metadata] ⚠️ Failed to save tags to DB: {tag_exc}", file=sys.stderr) + raise + + # NOTE: Sidecar files are intentionally NOT created for local storage + # Local storage uses database as primary source, not sidecar files + + debug(f"[_persist_local_metadata] ✅ Metadata persisted successfully") + except Exception as exc: + log(f"⚠️ Failed to persist metadata to local database: {exc}", file=sys.stderr) + import traceback + log(traceback.format_exc(), file=sys.stderr) + + +def _handle_local_transfer(media_path: Path, destination_root: Path, result: Any, config: Optional[Dict[str, Any]] = None) -> Tuple[int, Optional[Path]]: + """Transfer a file to local storage and return (exit_code, destination_path). 
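+
+    Example (sketch; the paths, result, and config values are illustrative):
+        code, dest = _handle_local_transfer(Path("/tmp/song.mp3"), Path("~/Music"), result, config)
+        if code == 0:
+            log(f"Moved to {dest}")  # dest is the final path inside the destination root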
+ + Args: + media_path: Path to source file + destination_root: Destination directory + result: Result object with metadata + config: Configuration dictionary + + Returns: + Tuple of (exit_code, destination_path) + - exit_code: 0 on success, 1 on failure + - destination_path: Path to moved file on success, None on failure + """ + destination_root = destination_root.expanduser() + try: + destination_root.mkdir(parents=True, exist_ok=True) + except Exception as exc: + log(f"❌ Cannot prepare destination directory {destination_root}: {exc}", file=sys.stderr) + return 1, None + + + tags_from_result = extract_tags_from_result(result) + urls_from_result = extract_known_urls_from_result(result) + # Get origin from result if available + result_origin = None + if hasattr(result, "origin"): + result_origin = result.origin + elif isinstance(result, dict): + result_origin = result.get("origin") or result.get("source") + sidecar_path, sidecar_hash, sidecar_tags, sidecar_urls = _load_sidecar_bundle(media_path, origin=result_origin, config=config) + + # Normalize all title tags to use spaces instead of underscores BEFORE merging + # This ensures that "Radiohead - Creep" and "Radiohead_-_Creep" are treated as the same title + def normalize_title_tag(tag: str) -> str: + """Normalize a title tag by replacing underscores with spaces.""" + if str(tag).strip().lower().startswith("title:"): + parts = tag.split(":", 1) + if len(parts) == 2: + value = parts[1].replace("_", " ").strip() + return f"title:{value}" + return tag + + tags_from_result = [normalize_title_tag(t) for t in tags_from_result] + sidecar_tags = [normalize_title_tag(t) for t in sidecar_tags] + + # Merge tags carefully: if URL has title tag, don't include sidecar title tags + # This prevents duplicate title: tags when URL provides a title + has_url_title = any(str(t).strip().lower().startswith("title:") for t in tags_from_result) + if has_url_title: + # URL has a title, filter out any sidecar title tags to avoid duplication + sidecar_tags_filtered = [t for t in sidecar_tags if not str(t).strip().lower().startswith("title:")] + merged_tags = merge_sequences(tags_from_result, sidecar_tags_filtered, case_sensitive=True) + else: + # No URL title, use all sidecar tags + merged_tags = merge_sequences(tags_from_result, sidecar_tags, case_sensitive=True) + + merged_urls = merge_sequences(urls_from_result, sidecar_urls, case_sensitive=False) + relationships = extract_relationships(result) + duration = extract_duration(result) + + try: + dest_file = storage["local"].upload(media_path, location=str(destination_root), move=True) + except Exception as exc: + log(f"❌ Failed to move file into {destination_root}: {exc}", file=sys.stderr) + return 1, None + + dest_path = Path(dest_file) + file_hash = _resolve_file_hash(result, sidecar_hash, dest_path) + media_kind = _resolve_media_kind(result, dest_path) + + # Ensure only ONE title tag that matches the actual filename + # Remove all existing title tags and add one based on the saved filename + merged_tags_no_titles = [t for t in merged_tags if not str(t).strip().lower().startswith("title:")] + filename_title = dest_path.stem.replace("_", " ").strip() + if filename_title: + merged_tags_no_titles.insert(0, f"title:{filename_title}") + + _persist_local_metadata(destination_root, dest_path, merged_tags_no_titles, merged_urls, file_hash, relationships, duration, media_kind) + _cleanup_sidecar_files(media_path, sidecar_path) + debug(f"✅ Moved to local library: {dest_path}") + return 0, dest_path + + + + + +def 
_run(result: Any, _args: Sequence[str], config: Dict[str, Any]) -> int: + """Upload/copy a file to specified location. + + Returns 0 on success, non-zero on failure. + """ + import sys # For stderr output + + # Help + try: + if any(str(a).lower() in {"-?", "/?", "--help", "-h", "help", "--cmdlet"} for a in _args): + log(json.dumps(CMDLET, ensure_ascii=False, indent=2)) + return 0 + except Exception: + pass + + debug("Starting add-file cmdlet") + + # Handle list of results (from piped commands that emit multiple items) + if isinstance(result, list): + debug(f"Processing {len(result)} piped files") + success_count = 0 + for item in result: + exit_code = _run(item, _args, config) + if exit_code == 0: + success_count += 1 + return 0 if success_count > 0 else 1 + + # Parse arguments using CMDLET spec + parsed = parse_cmdlet_args(_args, CMDLET) + location: Optional[str] = None + provider_name: Optional[str] = None + delete_after_upload = False + + # Check if -path argument was provided to use direct file path instead of piped result + path_arg = parsed.get("path") + if path_arg: + # Create a pseudo-result object from the file path + media_path = Path(str(path_arg).strip()) + if not media_path.exists(): + log(f"❌ File not found: {media_path}") + return 1 + # Create result dict with the file path and origin 'wild' for direct path inputs + result = {"target": str(media_path), "origin": "wild"} + log(f"Using direct file path: {media_path}") + + # Get location from parsed args - now uses SharedArgs.STORAGE so key is "storage" + location = parsed.get("storage") + if location: + location = str(location).lower().strip() + + # Get file provider from parsed args + provider_name = parsed.get("provider") + if provider_name: + provider_name = str(provider_name).lower().strip() + + # Check for delete flag (presence in parsed dict means it was provided) + delete_after_upload = "delete" in parsed + + # Either storage or provider must be specified, but not both + if location is None and provider_name is None: + log("Either -storage or -provider must be specified") + log(" -storage options: 'hydrus', 'local', or a directory path") + log(" -provider options: '0x0'") + return 1 + + if location is not None and provider_name is not None: + log("❌ Cannot specify both -storage and -provider") + return 1 + + # Validate location (storage backends) + is_valid_location = False + if location is not None: + valid_locations = {'hydrus', 'local'} + is_valid_location = location in valid_locations + is_local_path = not is_valid_location and location is not None and ('/' in location or '\\' in location or ':' in location) + + if location is not None and not (is_valid_location or is_local_path): + log(f"❌ Invalid location: {location}") + log(f"Valid options: 'hydrus', '0x0', 'local', or a directory path (e.g., C:\\Music or /home/user/music)") + return 1 + + # Extract tags/known URLs from pipeline objects if available + pipe_object_tags = extract_tags_from_result(result) + if pipe_object_tags: + log(f"Extracted {len(pipe_object_tags)} tag(s) from pipeline result: {', '.join(pipe_object_tags[:5])}", file=sys.stderr) + pipe_known_urls = extract_known_urls_from_result(result) + + # Resolve media path: get from piped result + # Support both object attributes (getattr) and dict keys (get) + target = None + origin = None + + # Try object attributes first + if hasattr(result, "target"): + target = result.target + elif hasattr(result, "path"): + target = result.path + elif hasattr(result, "file_path"): + target = result.file_path + # Try 
dict keys if object attributes failed + elif isinstance(result, dict): + target = (result.get("target") or result.get("path") or result.get("file_path") or + result.get("__file_path") or result.get("__path") or result.get("__target")) + + # Get origin to detect Hydrus files + if hasattr(result, "origin"): + origin = result.origin + elif hasattr(result, "source"): + origin = result.source + elif isinstance(result, dict): + origin = result.get("origin") or result.get("source") or result.get("__source") + + # Convert target to string and preserve URLs (don't let Path() mangle them) + target_str = str(target) if target else None + + # Check if this is a playlist item that needs to be downloaded first + is_playlist_item = isinstance(result, dict) and result.get("__source") == "playlist-probe" + if is_playlist_item and target_str and target_str.lower().startswith(("http://", "https://")): + # This is a playlist item URL - we need to download it first + log(f"Detected playlist item, downloading: {target_str}", file=sys.stderr) + + # Extract item number if available + item_num = None + if "__action" in result and result["__action"].startswith("playlist-item:"): + item_num = result["__action"].split(":")[1] + elif "index" in result: + item_num = result["index"] + + # Call download-data to download this specific item + # Pass the item number so it knows which track to download + from cmdlets import download_data as dl_module + + # Capture emissions from download-data to process them + captured_results = [] + original_emit = ctx.emit + + def capture_emit(obj): + captured_results.append(obj) + # Also emit to original so user sees progress/output if needed + # But since add-file is usually terminal, we might not need to + # original_emit(obj) + + # Temporarily hook the pipeline emit function + ctx.emit = capture_emit + + try: + if item_num: + # Pass a marker dict to tell download-data which item to get + download_result = dl_module._run( + { + "__playlist_url": str(target_str), + "__playlist_item": int(item_num) + }, + [], + config + ) + else: + # Fallback: just download the URL (will show all items) + download_result = dl_module._run(None, [str(target_str)], config) + finally: + # Restore original emit function + ctx.emit = original_emit + + if download_result != 0: + log(f"❌ Failed to download playlist item", file=sys.stderr) + return 1 + + log(f"✓ Playlist item downloaded, processing {len(captured_results)} file(s)...", file=sys.stderr) + + # Process the downloaded files recursively + success_count = 0 + for res in captured_results: + # Recursively call add-file with the downloaded result + # This ensures tags and metadata from download-data are applied + if _run(res, _args, config) == 0: + success_count += 1 + + return 0 if success_count > 0 else 1 + # Determine media_path from result + media_path: Optional[Path] = None + is_hydrus_file = origin and origin.lower() == "hydrus" + + if target_str: + # Check if it's a URL or Hydrus hash + if target_str.lower().startswith(("http://", "https://")): + media_path = None # Will handle as Hydrus file below + elif not is_hydrus_file: + # Only treat as local path if not a Hydrus file + media_path = Path(target_str) + + if media_path is None and not is_hydrus_file and (target_str is None or not target_str.lower().startswith(("http://", "https://"))): + # Check if this is a format object from download-data + if isinstance(result, dict) and result.get('format_id') is not None: + log("❌ Format object received, but add-file expects a downloaded file") + log(f" Tip: 
Use @N to automatically select and download the format") + log(f" Streamlined workflow:") + log(f" download-data \"URL\" | @{result.get('index', 'N')} | add-file -storage local") + log(f" (The @N automatically expands to download-data \"URL\" -item N)") + return 1 + log("❌ File not found: provide a piped file result or local file path") + return 1 + + # Check if this is a Hydrus file - fetch the actual file path from Hydrus + if is_hydrus_file and target_str: + log(f"Detected Hydrus file (hash: {target_str}), fetching local path from Hydrus...", file=sys.stderr) + try: + from helper import hydrus + + # Get the Hydrus client + client = hydrus.get_client(config) + if not client: + log(f"❌ Hydrus client unavailable", file=sys.stderr) + return 1 + + # target_str is the hash - need to get the actual file path from Hydrus + file_hash = target_str + + # Call the /get_files/file_path endpoint to get the actual file path + response = client.get_file_path(file_hash) + if not response or not isinstance(response, dict): + log(f"❌ Hydrus file_path endpoint returned invalid response", file=sys.stderr) + return 1 + + file_path_str = response.get("path") + if not file_path_str: + log(f"❌ Hydrus file_path endpoint did not return a path", file=sys.stderr) + return 1 + + media_path = Path(file_path_str) + if not media_path.exists(): + log(f"❌ Hydrus file path does not exist: {media_path}", file=sys.stderr) + return 1 + + log(f"✓ Retrieved Hydrus file path: {media_path}", file=sys.stderr) + + except Exception as exc: + log(f"❌ Failed to get Hydrus file path: {exc}", file=sys.stderr) + import traceback + log(f"Traceback: {traceback.format_exc()}", file=sys.stderr) + return 1 + + # Generic URL handler: if target is a URL and we haven't resolved a local path yet + # This handles cases like "search-file -provider openlibrary ... 
| add-file -storage local" + if target_str and target_str.lower().startswith(("http://", "https://")) and not is_hydrus_file and not is_playlist_item and media_path is None: + log(f"Target is a URL, delegating to download-data: {target_str}", file=sys.stderr) + from cmdlets import download_data as dl_module + + dl_args = [] + if location: + dl_args.extend(["-storage", location]) + + # Map provider 0x0 to storage 0x0 for download-data + if provider_name == "0x0": + dl_args.extend(["-storage", "0x0"]) + + return dl_module._run(result, dl_args, config) + + if media_path is None: + log("File path could not be resolved") + return 1 + + if not media_path.exists() or not media_path.is_file(): + log(f"File not found: {media_path}") + return 1 + + # Validate file type - only accept Hydrus-supported files + file_extension = media_path.suffix.lower() + if file_extension not in SUPPORTED_MEDIA_EXTENSIONS: + log(f"❌ Unsupported file type: {file_extension}", file=sys.stderr) + log(f"Hydrus supports the following file types:", file=sys.stderr) + # Display by category from hydrus_wrapper + for category, extensions in sorted(hydrus_wrapper.SUPPORTED_FILETYPES.items()): + ext_list = ', '.join(sorted(e.lstrip('.') for e in extensions.keys())) + log(f"{category.capitalize()}: {ext_list}", file=sys.stderr) + log(f"Skipping this file: {media_path.name}", file=sys.stderr) + return 1 + + # Handle based on provider or storage + if provider_name is not None: + # Use file provider (e.g., 0x0.st) + from helper.search_provider import get_file_provider + + log(f"Uploading via {provider_name} file provider: {media_path.name}", file=sys.stderr) + + try: + file_provider = get_file_provider(provider_name, config) + if file_provider is None: + log(f"❌ File provider '{provider_name}' not available", file=sys.stderr) + return 1 + + hoster_url = file_provider.upload(media_path) + log(f"✅ File uploaded to {provider_name}: {hoster_url}", file=sys.stderr) + + # Associate the URL with the file in Hydrus if possible + current_hash = locals().get('file_hash') + if not current_hash: + current_hash = _resolve_file_hash(result, None, media_path) + + if current_hash: + try: + client = hydrus_wrapper.get_client(config) + if client: + client.associate_url(current_hash, hoster_url) + log(f"✅ Associated URL with file hash {current_hash}", file=sys.stderr) + except Exception as exc: + log(f"⚠️ Could not associate URL with Hydrus file: {exc}", file=sys.stderr) + + except Exception as exc: + log(f"❌ {provider_name} upload failed: {exc}", file=sys.stderr) + return 1 + + if delete_after_upload: + try: + media_path.unlink() + _cleanup_sidecar_files(media_path) + log(f"✅ Deleted file and sidecar", file=sys.stderr) + except Exception as exc: + log(f"⚠️ Could not delete file: {exc}", file=sys.stderr) + + return 0 + + # Handle storage-based operations (location is not None here) + valid_locations = {'hydrus', 'local'} + is_valid_location = location in valid_locations + is_local_path = not is_valid_location and ('/' in location or '\\' in location or ':' in location) + + if not (is_valid_location or is_local_path): + log(f"❌ Invalid location: {location}") + log(f"Valid options: 'hydrus', 'local', or a directory path (e.g., C:\\Music or /home/user/music)") + return 1 + + if location == 'local': + try: + from config import get_local_storage_path + resolved_dir = get_local_storage_path(config) + except Exception: + resolved_dir = None + + if not resolved_dir: + resolved_dir = config.get("LocalDir") or config.get("OutputDir") + + if not resolved_dir: + 
log("❌ No local storage path configured. Set 'storage.local.path' in config.json", file=sys.stderr) + return 1 + + log(f"Moving into configured local library: {resolved_dir}", file=sys.stderr) + exit_code, dest_path = _handle_local_transfer(media_path, Path(resolved_dir), result, config) + + # After successful local transfer, emit result for pipeline continuation + # This allows downstream commands like add-tags to chain automatically + if exit_code == 0 and dest_path: + # Extract tags from result for emission + emit_tags = extract_tags_from_result(result) + file_hash = _resolve_file_hash(result, None, dest_path) + + # Extract title from original result, fallback to filename if not available + result_title = extract_title_from_result(result) or dest_path.name + + # Always emit result for local files, even if no tags + # This allows @N selection and piping to downstream commands + result_dict = create_pipe_object_result( + source='local', + identifier=str(dest_path), + file_path=str(dest_path), + cmdlet_name='add-file', + title=result_title, + file_hash=file_hash, + tags=emit_tags if emit_tags else [], + target=str(dest_path) # Explicit target for get-file + ) + ctx.emit(result_dict) + + # Clear the stage table so downstream @N doesn't try to re-run download-data + # Next stage will use these local file results, not format objects + ctx.set_current_stage_table(None) + + return exit_code + + elif is_local_path: + try: + destination_root = Path(location) + except Exception as exc: + log(f"❌ Invalid destination path '{location}': {exc}", file=sys.stderr) + return 1 + + log(f"Moving to local path: {destination_root}", file=sys.stderr) + exit_code, dest_path = _handle_local_transfer(media_path, destination_root, result, config) + + # After successful local transfer, emit result for pipeline continuation + if exit_code == 0 and dest_path: + # Extract tags from result for emission + emit_tags = extract_tags_from_result(result) + file_hash = _resolve_file_hash(result, None, dest_path) + + # Extract title from original result, fallback to filename if not available + result_title = extract_title_from_result(result) or dest_path.name + + # Always emit result for local files, even if no tags + # This allows @N selection and piping to downstream commands + result_dict = create_pipe_object_result( + source='local', + identifier=str(dest_path), + file_path=str(dest_path), + cmdlet_name='add-file', + title=result_title, + file_hash=file_hash, + tags=emit_tags if emit_tags else [], + target=str(dest_path) # Explicit target for get-file + ) + ctx.emit(result_dict) + + # Clear the stage table so downstream @N doesn't try to re-run download-data + # Next stage will use these local file results, not format objects + ctx.set_current_stage_table(None) + + return exit_code + + # location == 'hydrus' + # Compute file hash to check if already in Hydrus + log(f"Uploading to Hydrus: {media_path.name}", file=sys.stderr) + log(f"Computing SHA-256 hash for: {media_path.name}", file=sys.stderr) + try: + file_hash = sha256_file(media_path) + except Exception as exc: + log(f"❌ Failed to compute file hash: {exc}", file=sys.stderr) + return 1 + log(f"File hash: {file_hash}", file=sys.stderr) + + # Read sidecar tags and known URLs first (for tagging) + + sidecar_path, hash_from_sidecar, sidecar_tags, sidecar_urls = _load_sidecar_bundle(media_path, origin=origin, config=config) + if sidecar_path: + log(f"Found sidecar at: {sidecar_path}", file=sys.stderr) + log(f"Read sidecar: hash={hash_from_sidecar}, {len(sidecar_tags)} 
tag(s), {len(sidecar_urls)} URL(s)", file=sys.stderr) + if sidecar_tags: + log(f"Sidecar tags: {sidecar_tags}", file=sys.stderr) + if sidecar_urls: + log(f"Sidecar URLs: {sidecar_urls}", file=sys.stderr) + else: + log(f"No sidecar found for {media_path.name}", file=sys.stderr) + + # Normalize all title tags to use spaces instead of underscores BEFORE merging + # This ensures that "Radiohead - Creep" and "Radiohead_-_Creep" are treated as the same title + def normalize_title_tag(tag: str) -> str: + """Normalize a title tag by replacing underscores with spaces.""" + if str(tag).strip().lower().startswith("title:"): + parts = tag.split(":", 1) + if len(parts) == 2: + value = parts[1].replace("_", " ").strip() + return f"title:{value}" + return tag + + sidecar_tags = [normalize_title_tag(t) for t in sidecar_tags] + pipe_object_tags = [normalize_title_tag(t) for t in pipe_object_tags] + + # Merge tags from PipeObject with tags from sidecar + # NOTE: Remove ALL existing title tags and use only filename-based title + # The filename is the source of truth for the title + tags_without_titles = [t for t in merge_sequences(sidecar_tags, pipe_object_tags, case_sensitive=True) + if not str(t).strip().lower().startswith("title:")] + + # Ensure ONE title tag based on the actual filename + filename_title = media_path.stem.replace("_", " ").strip() + if filename_title: + tags = [f"title:{filename_title}"] + tags_without_titles + else: + tags = tags_without_titles + + known_urls = merge_sequences(sidecar_urls, pipe_known_urls, case_sensitive=False) + + if pipe_object_tags: + log(f"Merged pipeline tags. Total tags now: {len(tags)}", file=sys.stderr) + + # Write metadata to file before uploading (only for local storage, not for Hydrus) + # Hydrus stores tags separately, so we don't need to modify the file + if location != 'hydrus': + try: + if tags: + # Determine file kind from extension + file_kind = '' + sfx = media_path.suffix.lower() + if sfx in {'.mp3', '.flac', '.wav', '.m4a', '.aac', '.ogg', '.opus', '.mka'}: + file_kind = 'audio' + elif sfx in {'.mp4', '.mkv', '.webm', '.mov', '.avi', '.flv', '.mpg', '.mpeg', '.ts', '.m4v', '.wmv'}: + file_kind = 'video' + + if embed_metadata_in_file(media_path, tags, file_kind): + log(f"Wrote metadata tags to file: {media_path.name}", file=sys.stderr) + else: + log(f"Note: Could not embed metadata in file (may not be supported format)", file=sys.stderr) + except Exception as exc: + log(f"Warning: Failed to write metadata to file: {exc}", file=sys.stderr) + else: + log(f"Note: Skipping FFmpeg metadata embedding for Hydrus (tags managed separately)", file=sys.stderr) + + # Use FileStorage backend to upload to Hydrus + try: + file_hash = storage["hydrus"].upload( + media_path, + config=config, + tags=tags, + ) + log(f"✅ File uploaded to Hydrus: {file_hash}", file=sys.stderr) + except Exception as exc: + log(f"❌ Hydrus upload failed: {exc}", file=sys.stderr) + return 1 + + # Associate known URLs in Hydrus metadata + url_count = 0 + if known_urls: + try: + client = hydrus_wrapper.get_client(config) + if client: + for url in known_urls: + u = str(url or "").strip() + if not u: + continue + try: + client.associate_url(file_hash, u) + except Exception as exc: + log(f"Hydrus associate-url failed for {u}: {exc}", file=sys.stderr) + continue + url_count += 1 + except Exception as exc: + log(f"Failed to associate URLs: {exc}", file=sys.stderr) + + if url_count: + log(f"✅ Associated {url_count} URL(s)", file=sys.stderr) + else: + log(f"No URLs to associate", file=sys.stderr) + + 
_cleanup_sidecar_files(media_path, sidecar_path) + + # Update in-memory result for downstream pipes + try: + # Only update piped result objects; direct -path usage may have a dummy result + setattr(result, "hash_hex", file_hash) + # Preserve media_kind for downstream commands (e.g., open) + if not hasattr(result, "media_kind") or getattr(result, "media_kind") == "other": + # Try to infer media_kind from file extension or keep existing + suffix = media_path.suffix.lower() + if suffix in {'.pdf', '.epub', '.txt', '.mobi', '.azw3', '.cbz', '.cbr', '.rtf', '.md', '.html', '.htm', '.doc', '.docx'}: + setattr(result, "media_kind", "document") + if hasattr(result, "columns") and isinstance(getattr(result, "columns"), list): + cols = list(getattr(result, "columns")) + if ("Hash", file_hash) not in cols: + cols.append(("Hash", file_hash)) + setattr(result, "columns", cols) + except Exception: + pass + + # If -delete flag is set, delete the file and .tags after successful upload + if delete_after_upload: + log(f"Deleting local files (as requested)...", file=sys.stderr) + try: + media_path.unlink() + log(f"✅ Deleted: {media_path.name}", file=sys.stderr) + except OSError as exc: + log(f"Failed to delete file: {exc}", file=sys.stderr) + + # Delete .tags sidecar if it exists + if sidecar_path is not None: + try: + sidecar_path.unlink() + log(f"✅ Deleted: {sidecar_path.name}", file=sys.stderr) + except OSError as exc: + log(f"Failed to delete sidecar: {exc}", file=sys.stderr) + + log(f"✅ Successfully completed: {media_path.name} (hash={file_hash})", file=sys.stderr) + + # Emit result for Hydrus uploads so downstream commands know about it + if location == 'hydrus': + # Extract title from original result, fallback to filename if not available + result_title = extract_title_from_result(result) or media_path.name + + result_dict = create_pipe_object_result( + source='hydrus', + identifier=file_hash, + file_path=f"hydrus:{file_hash}", + cmdlet_name='add-file', + title=result_title, + file_hash=file_hash, + extra={ + 'storage_source': 'hydrus', + 'hydrus_hash': file_hash, + 'tags': tags, + 'known_urls': known_urls, + } + ) + ctx.emit(result_dict) + + # Clear the stage table so downstream @N doesn't try to re-run download-data + # Next stage will use these Hydrus file results, not format objects + ctx.set_current_stage_table(None) + + return 0 + +CMDLET = Cmdlet( + name="add-file", + summary="Upload a media file to specified location (Hydrus, file provider, or local directory).", + usage="add-file (-path | ) (-storage | -provider ) [-delete]", + args=[ + CmdletArg(name="path", type="str", required=False, description="Direct file path to upload (alternative to piped result)", alias="p"), + SharedArgs.STORAGE, # For hydrus, local, or directory paths + CmdletArg(name="provider", type="str", required=False, description="File hosting provider (e.g., 0x0 for 0x0.st)", alias="prov"), + CmdletArg(name="delete", type="flag", required=False, description="Delete the file and its .tags after successful upload.", alias="del"), + ], + details=[ + "- Storage location options (use -storage):", + " hydrus: Upload to Hydrus database with metadata tagging", + " local: Copy file to local directory", + " : Copy file to specified directory", + "- File provider options (use -provider):", + " 0x0: Upload to 0x0.st for temporary hosting with public URL", + "- Accepts files from official Hydrus supported types: images, animations, videos, audio, applications, projects, and archives.", + "- When uploading to Hydrus: adds tags from .tags 
sidecar and associates known_urls", + "- When using file provider: uploads to service, adds URL to sidecar", + "- When copying locally: copies file with original metadata preserved", + "- Use -delete flag to automatically delete the file and .tags after successful operation.", + ], +) \ No newline at end of file diff --git a/cmdlets/add_note.py b/cmdlets/add_note.py new file mode 100644 index 0000000..9a24964 --- /dev/null +++ b/cmdlets/add_note.py @@ -0,0 +1,84 @@ +from __future__ import annotations + +from typing import Any, Dict, Sequence +import json + +from . import register +import models +import pipeline as ctx +from helper import hydrus as hydrus_wrapper +from ._shared import Cmdlet, CmdletArg, normalize_hash +from helper.logger import log + +CMDLET = Cmdlet( + name="add-note", + summary="Add or set a note on a Hydrus file.", + usage="add-note [-hash ] ", + args=[ + CmdletArg("hash", type="string", description="Override the Hydrus file hash (SHA256) to target instead of the selected result."), + CmdletArg("name", type="string", required=True, description="The note name/key to set (e.g. 'comment', 'source', etc.)."), + CmdletArg("text", type="string", required=True, description="The note text/content to store.", variadic=True), + ], + details=[ + "- Notes are stored in the 'my notes' service by default.", + ], +) + + +@register(["add-note", "set-note", "add_note"]) # aliases +def add(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: + # Help + try: + if any(str(a).lower() in {"-?", "/?", "--help", "-h", "help", "--cmdlet"} for a in args): + log(json.dumps(CMDLET, ensure_ascii=False, indent=2)) + return 0 + except Exception: + pass + + from ._shared import parse_cmdlet_args + parsed = parse_cmdlet_args(args, CMDLET) + override_hash = parsed.get("hash") + name = parsed.get("name") + text_parts = parsed.get("text") + + if not name: + log("Requires a note name") + return 1 + + name = str(name).strip() + + if isinstance(text_parts, list): + text = " ".join(text_parts).strip() + else: + text = str(text_parts or "").strip() + + if not text: + log("Empty note text") + return 1 + + # Handle @N selection which creates a list - extract the first item + if isinstance(result, list) and len(result) > 0: + result = result[0] + + hash_hex = normalize_hash(override_hash) if override_hash else normalize_hash(getattr(result, "hash_hex", None)) + if not hash_hex: + log("Selected result does not include a Hydrus hash") + return 1 + try: + client = hydrus_wrapper.get_client(config) + except Exception as exc: + log(f"Hydrus client unavailable: {exc}") + return 1 + + if client is None: + log("Hydrus client unavailable") + return 1 + try: + service_name = "my notes" + client.set_notes(hash_hex, {name: text}, service_name) + except Exception as exc: + log(f"Hydrus add-note failed: {exc}") + return 1 + ctx.emit(f"Added note '{name}' ({len(text)} chars)") + return 0 + diff --git a/cmdlets/add_relationship.py b/cmdlets/add_relationship.py new file mode 100644 index 0000000..ff9af02 --- /dev/null +++ b/cmdlets/add_relationship.py @@ -0,0 +1,264 @@ +"""Add file relationships in Hydrus based on relationship tags in sidecar.""" + +from __future__ import annotations + +from typing import Any, Dict, Optional, Sequence +import json +import re +from pathlib import Path +import sys + +from helper.logger import log + +from . 
import register +import models +import pipeline as ctx +from helper import hydrus as hydrus_wrapper +from ._shared import Cmdlet, CmdletArg, parse_cmdlet_args +from helper.local_library import read_sidecar, find_sidecar + + +CMDLET = Cmdlet( + name="add-relationship", + summary="Associate file relationships (king/alt/related) in Hydrus based on relationship tags in sidecar.", + usage="add-relationship OR add-relationship -path ", + args=[ + CmdletArg("path", type="string", description="Specify the local file path (if not piping a result)."), + ], + details=[ + "- Reads relationship tags from sidecar (format: 'relationship: hash(king),hash(alt),hash(related)')", + "- Calls Hydrus API to associate the hashes as relationships", + "- Supports three relationship types: king (primary), alt (alternative), related (other versions)", + "- Works with piped file results or -path argument for direct invocation", + ], +) + + +def _normalise_hash_hex(value: Optional[str]) -> Optional[str]: + """Normalize a hash hex string to lowercase 64-char format.""" + if not value or not isinstance(value, str): + return None + normalized = value.strip().lower() + if len(normalized) == 64 and all(c in '0123456789abcdef' for c in normalized): + return normalized + return None + + +def _extract_relationships_from_tag(tag_value: str) -> Dict[str, list[str]]: + """Parse relationship tag like 'relationship: hash(king),hash(alt)'. + + Returns a dict like {"king": ["HASH1"], "alt": ["HASH2"], ...} + """ + result: Dict[str, list[str]] = {} + if not isinstance(tag_value, str): + return result + + # Match patterns like hash(king)HASH or hash(type)HASH (no angle brackets) + pattern = r'hash\((\w+)\)([a-fA-F0-9]{64})' + matches = re.findall(pattern, tag_value) + + for rel_type, hash_value in matches: + normalized = _normalise_hash_hex(hash_value) + if normalized: + if rel_type not in result: + result[rel_type] = [] + result[rel_type].append(normalized) + + return result + + +@register(["add-relationship", "add-rel"]) # primary name and alias +def _run(result: Any, _args: Sequence[str], config: Dict[str, Any]) -> int: + """Associate file relationships in Hydrus. + + Two modes of operation: + 1. Read from sidecar: Looks for relationship tags in the file's sidecar (format: "relationship: hash(king),hash(alt)") + 2. Pipeline mode: When piping multiple results, the first becomes "king" and subsequent items become "alt" + + Returns 0 on success, non-zero on failure. 
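+
+ Example (pipeline mode): "1,2,3 | add-relationship" treats the first selected
+ file as the king and links each subsequent file to it as an alt.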
+ """ + # Help + try: + if any(str(a).lower() in {"-?", "/?", "--help", "-h", "help", "--cmdlet"} for a in _args): + log(json.dumps(CMDLET, ensure_ascii=False, indent=2)) + return 0 + except Exception: + pass + + # Parse arguments using CMDLET spec + parsed = parse_cmdlet_args(_args, CMDLET) + arg_path: Optional[Path] = None + if parsed: + # Get the first arg value (e.g., -path) + first_arg_name = CMDLET.get("args", [{}])[0].get("name") if CMDLET.get("args") else None + if first_arg_name and first_arg_name in parsed: + arg_value = parsed[first_arg_name] + try: + arg_path = Path(str(arg_value)).expanduser() + except Exception: + arg_path = Path(str(arg_value)) + + # Get Hydrus client + try: + client = hydrus_wrapper.get_client(config) + except Exception as exc: + log(f"Hydrus client unavailable: {exc}", file=sys.stderr) + return 1 + + if client is None: + log("Hydrus client unavailable", file=sys.stderr) + return 1 + + # Handle @N selection which creates a list - extract the first item + if isinstance(result, list) and len(result) > 0: + result = result[0] + + # Check if we're in pipeline mode (have a hash) or file mode + file_hash = getattr(result, "hash_hex", None) + + # PIPELINE MODE: Track relationships across multiple items + if file_hash: + file_hash = _normalise_hash_hex(file_hash) + if not file_hash: + log("Invalid file hash format", file=sys.stderr) + return 1 + + # Load or initialize king hash from pipeline context + try: + king_hash = ctx.load_value("relationship_king") + except Exception: + king_hash = None + + # If this is the first item, make it the king + if not king_hash: + try: + ctx.store_value("relationship_king", file_hash) + log(f"Established king hash: {file_hash}", file=sys.stderr) + return 0 # First item just becomes the king, no relationships yet + except Exception: + pass + + # If we already have a king and this is a different hash, link them + if king_hash and king_hash != file_hash: + try: + client.set_relationship(file_hash, king_hash, "alt") + log( + f"[add-relationship] Set alt relationship: {file_hash} <-> {king_hash}", + file=sys.stderr + ) + return 0 + except Exception as exc: + log(f"Failed to set relationship: {exc}", file=sys.stderr) + return 1 + + return 0 + + # FILE MODE: Read relationships from sidecar + log("Note: Use piping mode for easier relationships. 
Example: 1,2,3 | add-relationship", file=sys.stderr) + + # Resolve media path from -path arg or result target + target = getattr(result, "target", None) or getattr(result, "path", None) + media_path = arg_path if arg_path is not None else Path(str(target)) if isinstance(target, str) else None + if media_path is None: + log("Provide -path or pipe a local file result", file=sys.stderr) + return 1 + + # Validate local file + if str(media_path).lower().startswith(("http://", "https://")): + log("This cmdlet requires a local file path, not a URL", file=sys.stderr) + return 1 + if not media_path.exists() or not media_path.is_file(): + log(f"File not found: {media_path}", file=sys.stderr) + return 1 + + # Build Hydrus client + try: + client = hydrus_wrapper.get_client(config) + except Exception as exc: + log(f"Hydrus client unavailable: {exc}", file=sys.stderr) + return 1 + + if client is None: + log("Hydrus client unavailable", file=sys.stderr) + return 1 + + # Read sidecar to find relationship tags + sidecar_path = find_sidecar(media_path) + if sidecar_path is None: + log(f"No sidecar found for {media_path.name}", file=sys.stderr) + return 1 + + try: + _, tags, _ = read_sidecar(sidecar_path) + except Exception as exc: + log(f"Failed to read sidecar: {exc}", file=sys.stderr) + return 1 + + # Find relationship tags (format: "relationship: hash(king),hash(alt),hash(related)") + relationship_tags = [t for t in tags if isinstance(t, str) and t.lower().startswith("relationship:")] + + if not relationship_tags: + log(f"No relationship tags found in sidecar", file=sys.stderr) + return 0 # Not an error, just nothing to do + + # Get the file hash from result (should have been set by add-file) + file_hash = getattr(result, "hash_hex", None) + if not file_hash: + log("File hash not available (run add-file first)", file=sys.stderr) + return 1 + + file_hash = _normalise_hash_hex(file_hash) + if not file_hash: + log("Invalid file hash format", file=sys.stderr) + return 1 + + # Parse relationships from tags and apply them + success_count = 0 + error_count = 0 + + for rel_tag in relationship_tags: + try: + # Parse: "relationship: hash(king),hash(alt),hash(related)" + rel_str = rel_tag.split(":", 1)[1].strip() # Get part after "relationship:" + + # Parse relationships + rels = _extract_relationships_from_tag(f"relationship: {rel_str}") + + # Set the relationships in Hydrus + for rel_type, related_hashes in rels.items(): + if not related_hashes: + continue + + for related_hash in related_hashes: + # Don't set relationship between hash and itself + if file_hash == related_hash: + continue + + try: + client.set_relationship(file_hash, related_hash, rel_type) + log( + f"[add-relationship] Set {rel_type} relationship: " + f"{file_hash} <-> {related_hash}", + file=sys.stderr + ) + success_count += 1 + except Exception as exc: + log(f"Failed to set {rel_type} relationship: {exc}", file=sys.stderr) + error_count += 1 + + except Exception as exc: + log(f"Failed to parse relationship tag: {exc}", file=sys.stderr) + error_count += 1 + + if success_count > 0: + log(f"Successfully set {success_count} relationship(s) for {media_path.name}", file=sys.stderr) + ctx.emit(f"add-relationship: {media_path.name} ({success_count} relationships set)") + return 0 + elif error_count == 0: + log(f"No relationships to set", file=sys.stderr) + return 0 # Success with nothing to do + else: + log(f"Failed with {error_count} error(s)", file=sys.stderr) + return 1 + + diff --git a/cmdlets/add_tags.py b/cmdlets/add_tags.py new file mode 100644 
index 0000000..5b5faec --- /dev/null +++ b/cmdlets/add_tags.py @@ -0,0 +1,276 @@ +from __future__ import annotations + +from typing import Any, Dict, List, Sequence, Optional +import json +from pathlib import Path +import sys + +from helper.logger import log + +from . import register +import models +import pipeline as ctx +from ._shared import normalize_result_input, filter_results_by_temp +from helper import hydrus as hydrus_wrapper +from helper.local_library import read_sidecar, write_sidecar, find_sidecar, has_sidecar, LocalLibraryDB +from metadata import rename_by_metadata +from ._shared import Cmdlet, CmdletArg, normalize_hash, parse_tag_arguments, expand_tag_groups, parse_cmdlet_args +from config import get_local_storage_path + + +CMDLET = Cmdlet( + name="add-tags", + summary="Add tags to a Hydrus file or write them to a local .tags sidecar.", + usage="add-tags [-hash ] [-duplicate ] [-list [,...]] [--all] [,...]", + args=[ + CmdletArg("-hash", type="string", description="Override the Hydrus file hash (SHA256) to target instead of the selected result."), + CmdletArg("-duplicate", type="string", description="Copy existing tag values to new namespaces. Formats: title:album,artist (explicit) or title,album,artist (inferred)"), + CmdletArg("-list", type="string", description="Load predefined tag lists from adjective.json. Comma-separated list names (e.g., -list philosophy,occult)."), + CmdletArg("--all", type="flag", description="Include temporary files in tagging (by default, only tags non-temporary files)."), + CmdletArg("tags", type="string", required=True, description="One or more tags to add. Comma- or space-separated. Can also use {list_name} syntax.", variadic=True), + ], + details=[ + "- By default, only tags non-temporary files (from pipelines). 
Use --all to tag everything.", + "- Without -hash and when the selection is a local file, tags are written to .tags.", + "- With a Hydrus hash, tags are sent to the 'my tags' service.", + "- Multiple tags can be comma-separated or space-separated.", + "- Use -list to include predefined tag lists from adjective.json: -list philosophy,occult", + "- Tags can also reference lists with curly braces: add-tag {philosophy} \"other:tag\"", + "- Use -duplicate to copy EXISTING tag values to new namespaces:", + " Explicit format: -duplicate title:album,artist (copies title: to album: and artist:)", + " Inferred format: -duplicate title,album,artist (first is source, rest are targets)", + "- The source namespace must already exist in the file being tagged.", + "- Target namespaces that already have a value are skipped (not overwritten).", + ], +) + +@register(["add-tag", "add-tags"]) +def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: + """Add tags to a file with smart filtering for pipeline results.""" + try: + if any(str(a).lower() in {"-?", "/?", "--help", "-h", "help", "--cmdlet"} for a in args): + log(json.dumps(CMDLET, ensure_ascii=False, indent=2)) + return 0 + except Exception: + pass + + # Parse arguments + parsed = parse_cmdlet_args(args, CMDLET) + + # Check for --all flag + include_temp = parsed.get("all", False) + + # Normalize input to list + results = normalize_result_input(result) + + # Filter by temp status (unless --all is set) + if not include_temp: + results = filter_results_by_temp(results, include_temp=False) + + if not results: + log("No valid files to tag (all results were temporary; use --all to include temporary files)", file=sys.stderr) + return 1 + + # Get tags from arguments + raw_tags = parsed.get("tags", []) + if isinstance(raw_tags, str): + raw_tags = [raw_tags] + + # Handle -list argument (convert to {list} syntax) + list_arg = parsed.get("list") + if list_arg: + for l in list_arg.split(','): + l = l.strip() + if l: + raw_tags.append(f"{{{l}}}") + + # Parse and expand tags + tags_to_add = parse_tag_arguments(raw_tags) + tags_to_add = expand_tag_groups(tags_to_add) + + # Get other flags + hash_override = normalize_hash(parsed.get("hash")) + duplicate_arg = parsed.get("duplicate") + + # If no tags provided (and no list), write sidecar files with embedded tags + # Note: Since 'tags' is required=True in CMDLET, this block might be unreachable via CLI + # unless called programmatically or if required check is bypassed. 
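+ # Illustrative example (a sketch, with hypothetical values): a piped result like
+ #   {"file_path": "/library/song.mp3", "tags": ["artist:Radiohead"]}
+ # lands in this branch when neither tags nor -duplicate are supplied; its
+ # existing tags are simply written back out through write_sidecar.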
+ if not tags_to_add and not duplicate_arg: + # Write sidecar files with the tags that are already in the result dicts + sidecar_count = 0 + for res in results: + # Handle both dict and PipeObject formats + file_path = None + tags = [] + file_hash = "" + + if isinstance(res, models.PipeObject): + file_path = res.file_path + tags = res.extra.get('tags', []) + file_hash = res.file_hash or "" + elif isinstance(res, dict): + file_path = res.get('file_path') + tags = res.get('tags', []) # Check both tags and extra['tags'] + if not tags and 'extra' in res: + tags = res['extra'].get('tags', []) + file_hash = res.get('file_hash', "") + + if not file_path: + log(f"[add_tags] Warning: Result has no file_path, skipping", file=sys.stderr) + ctx.emit(res) + continue + + if tags: + # Write sidecar file for this file with its tags + try: + sidecar_path = write_sidecar(Path(file_path), tags, [], file_hash) + log(f"[add_tags] Wrote {len(tags)} tag(s) to sidecar: {sidecar_path}", file=sys.stderr) + sidecar_count += 1 + except Exception as e: + log(f"[add_tags] Warning: Failed to write sidecar for {file_path}: {e}", file=sys.stderr) + + ctx.emit(res) + + if sidecar_count > 0: + log(f"[add_tags] Wrote {sidecar_count} sidecar file(s) with embedded tags", file=sys.stderr) + else: + log(f"[add_tags] No tags to write - passed {len(results)} result(s) through unchanged", file=sys.stderr) + return 0 + + # Tags ARE provided - append them to each result and write sidecar files or add to Hydrus + sidecar_count = 0 + for res in results: + # Handle both dict and PipeObject formats + file_path = None + existing_tags = [] + file_hash = "" + storage_source = None + hydrus_hash = None + + if isinstance(res, models.PipeObject): + file_path = res.file_path + existing_tags = res.extra.get('tags', []) + file_hash = res.file_hash or "" + storage_source = res.extra.get('storage_source') or res.extra.get('source') + hydrus_hash = res.extra.get('hydrus_hash') + elif isinstance(res, dict): + file_path = res.get('file_path') or res.get('path') + existing_tags = res.get('tags', []) + if not existing_tags and 'extra' in res: + existing_tags = res['extra'].get('tags', []) + file_hash = res.get('file_hash', "") + storage_source = res.get('storage_source') or res.get('source') or res.get('origin') + if not storage_source and 'extra' in res: + storage_source = res['extra'].get('storage_source') or res['extra'].get('source') + # For Hydrus results from search-file, look for hash, hash_hex, or target (all contain the hash) + hydrus_hash = res.get('hydrus_hash') or res.get('hash') or res.get('hash_hex') + if not hydrus_hash and 'extra' in res: + hydrus_hash = res['extra'].get('hydrus_hash') or res['extra'].get('hash') or res['extra'].get('hash_hex') + else: + ctx.emit(res) + continue + + # Apply hash override if provided + if hash_override: + hydrus_hash = hash_override + # If we have a hash override, we treat it as a Hydrus target + storage_source = "hydrus" + + if not file_path and not hydrus_hash: + log(f"[add_tags] Warning: Result has neither file_path nor hash available, skipping", file=sys.stderr) + ctx.emit(res) + continue + + # Handle -duplicate logic (copy existing tags to new namespaces) + if duplicate_arg: + # Parse duplicate format: source:target1,target2 or source,target1,target2 + parts = duplicate_arg.split(':') + source_ns = "" + targets = [] + + if len(parts) > 1: + # Explicit format: source:target1,target2 + source_ns = parts[0] + targets = parts[1].split(',') + else: + # Inferred format: source,target1,target2 + parts = 
duplicate_arg.split(',') + if len(parts) > 1: + source_ns = parts[0] + targets = parts[1:] + + if source_ns and targets: + # Find tags in source namespace + source_tags = [t for t in existing_tags if t.startswith(source_ns + ':')] + for t in source_tags: + value = t.split(':', 1)[1] + for target_ns in targets: + new_tag = f"{target_ns}:{value}" + if new_tag not in existing_tags and new_tag not in tags_to_add: + tags_to_add.append(new_tag) + + # Merge new tags with existing tags, handling namespace overwrites + # When adding a tag like "namespace:value", remove any existing "namespace:*" tags + for new_tag in tags_to_add: + # Check if this is a namespaced tag (format: "namespace:value") + if ':' in new_tag: + namespace = new_tag.split(':', 1)[0] + # Remove any existing tags with the same namespace + existing_tags = [t for t in existing_tags if not (t.startswith(namespace + ':'))] + + # Add the new tag if not already present + if new_tag not in existing_tags: + existing_tags.append(new_tag) + + # Update the result's tags + if isinstance(res, models.PipeObject): + res.extra['tags'] = existing_tags + elif isinstance(res, dict): + res['tags'] = existing_tags + + # Determine where to add tags: Hydrus, local DB, or sidecar + if storage_source and storage_source.lower() == 'hydrus': + # Add tags to Hydrus using the API + target_hash = hydrus_hash or file_hash + if target_hash: + try: + log(f"[add_tags] Adding {len(existing_tags)} tag(s) to Hydrus file: {target_hash}", file=sys.stderr) + hydrus_client = hydrus_wrapper.get_client(config) + hydrus_client.add_tags(target_hash, existing_tags, "my tags") + log(f"[add_tags] ✓ Tags added to Hydrus", file=sys.stderr) + sidecar_count += 1 + except Exception as e: + log(f"[add_tags] Warning: Failed to add tags to Hydrus: {e}", file=sys.stderr) + else: + log(f"[add_tags] Warning: No hash available for Hydrus file, skipping", file=sys.stderr) + elif storage_source and storage_source.lower() == 'local': + # For local storage, save directly to DB (no sidecar needed) + if file_path: + library_root = get_local_storage_path(config) + if library_root: + try: + with LocalLibraryDB(library_root) as db: + db.save_tags(Path(file_path), existing_tags) + log(f"[add_tags] Saved {len(existing_tags)} tag(s) to local DB", file=sys.stderr) + sidecar_count += 1 + except Exception as e: + log(f"[add_tags] Warning: Failed to save tags to local DB: {e}", file=sys.stderr) + else: + log(f"[add_tags] Warning: No library root configured for local storage, skipping", file=sys.stderr) + else: + log(f"[add_tags] Warning: No file path for local storage, skipping", file=sys.stderr) + else: + # For other storage types or unknown sources, write sidecar file if we have a file path + if file_path: + try: + sidecar_path = write_sidecar(Path(file_path), existing_tags, [], file_hash) + log(f"[add_tags] Wrote {len(existing_tags)} tag(s) to sidecar: {sidecar_path}", file=sys.stderr) + sidecar_count += 1 + except Exception as e: + log(f"[add_tags] Warning: Failed to write sidecar for {file_path}: {e}", file=sys.stderr) + + # Emit the modified result + ctx.emit(res) + + log(f"[add_tags] Processed {len(results)} result(s)", file=sys.stderr) + return 0 diff --git a/cmdlets/add_url.py b/cmdlets/add_url.py new file mode 100644 index 0000000..7c95498 --- /dev/null +++ b/cmdlets/add_url.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +from typing import Any, Dict, Sequence +import json + +from . 
import register +import models +import pipeline as ctx +from helper import hydrus as hydrus_wrapper +from ._shared import Cmdlet, CmdletArg, normalize_hash +from helper.logger import log + +CMDLET = Cmdlet( + name="add-url", + summary="Associate a URL with a Hydrus file.", + usage="add-url [-hash ] ", + args=[ + CmdletArg("-hash", description="Override the Hydrus file hash (SHA256) to target instead of the selected result."), + CmdletArg("url", required=True, description="The URL to associate with the file."), + ], + details=[ + "- Adds the URL to the Hydrus file's known URL list.", + ], +) + + +@register(["add-url", "ass-url", "associate-url", "add_url"]) # aliases +def add(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: + # Help + try: + if any(str(a).lower() in {"-?", "/?", "--help", "-h", "help", "--cmdlet"} for a in args): + log(json.dumps(CMDLET, ensure_ascii=False, indent=2)) + return 0 + except Exception: + pass + + from ._shared import parse_cmdlet_args + parsed = parse_cmdlet_args(args, CMDLET) + override_hash = parsed.get("hash") + url = parsed.get("url") + + if not url: + log("Requires a URL argument") + return 1 + + url = str(url).strip() + if not url: + log("Requires a non-empty URL") + return 1 + + # Handle @N selection which creates a list - extract the first item + if isinstance(result, list) and len(result) > 0: + result = result[0] + + hash_hex = normalize_hash(override_hash) if override_hash else normalize_hash(getattr(result, "hash_hex", None)) + if not hash_hex: + log("Selected result does not include a Hydrus hash") + return 1 + try: + client = hydrus_wrapper.get_client(config) + except Exception as exc: + log(f"Hydrus client unavailable: {exc}") + return 1 + + if client is None: + log("Hydrus client unavailable") + return 1 + try: + client.associate_url(hash_hex, url) + except Exception as exc: + log(f"Hydrus add-url failed: {exc}") + return 1 + preview = hash_hex[:12] + ('…' if len(hash_hex) > 12 else '') + ctx.emit(f"Associated URL with {preview}: {url}") + return 0 + + + diff --git a/cmdlets/adjective.py b/cmdlets/adjective.py new file mode 100644 index 0000000..4231a91 --- /dev/null +++ b/cmdlets/adjective.py @@ -0,0 +1,148 @@ +import json +import os +import sys +from typing import List, Dict, Any, Optional, Sequence +from ._shared import Cmdlet, CmdletArg, parse_cmdlet_args +from helper.logger import log +from result_table import ResultTable +import pipeline as ctx + +ADJECTIVE_FILE = os.path.join(os.path.dirname(os.path.dirname(__file__)), "helper", "adjective.json") + +def _load_adjectives() -> Dict[str, List[str]]: + try: + if os.path.exists(ADJECTIVE_FILE): + with open(ADJECTIVE_FILE, 'r', encoding='utf-8') as f: + return json.load(f) + except Exception as e: + log(f"Error loading adjectives: {e}", file=sys.stderr) + return {} + +def _save_adjectives(data: Dict[str, List[str]]) -> bool: + try: + with open(ADJECTIVE_FILE, 'w', encoding='utf-8') as f: + json.dump(data, f, indent=2) + return True + except Exception as e: + log(f"Error saving adjectives: {e}", file=sys.stderr) + return False + +def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: + data = _load_adjectives() + + # Parse arguments manually first to handle positional args + # We expect: .adjective [category] [tag] [-add] [-delete] + + # If no args, list categories + if not args: + table = ResultTable("Adjective Categories") + for i, (category, tags) in enumerate(data.items()): + row = table.add_row() + row.add_column("#", str(i + 1)) + 
row.add_column("Category", category) + row.add_column("Tag Amount", str(len(tags))) + + # Selection expands to: .adjective "Category Name" + table.set_row_selection_args(i, [category]) + + table.set_source_command(".adjective") + ctx.set_last_result_table_overlay(table, list(data.keys())) + ctx.set_current_stage_table(table) + print(table) + return 0 + + # We have args. First arg is likely category. + category = args[0] + + # Check if we are adding a new category (implicit if it doesn't exist) + if category not in data: + # If only category provided, create it + if len(args) == 1: + data[category] = [] + _save_adjectives(data) + log(f"Created new category: {category}") + # If more args, we might be trying to add to a non-existent category + elif "-add" in args: + data[category] = [] + # Continue to add logic + + # Handle operations within category + remaining_args = list(args[1:]) + + # Check for -add flag + if "-add" in remaining_args: + # .adjective category -add tag + # or .adjective category tag -add + add_idx = remaining_args.index("-add") + # Tag could be before or after + tag = None + if add_idx + 1 < len(remaining_args): + tag = remaining_args[add_idx + 1] + elif add_idx > 0: + tag = remaining_args[add_idx - 1] + + if tag: + if tag not in data[category]: + data[category].append(tag) + _save_adjectives(data) + log(f"Added '{tag}' to '{category}'") + else: + log(f"Tag '{tag}' already exists in '{category}'") + else: + log("Error: No tag specified to add") + return 1 + + # Check for -delete flag + elif "-delete" in remaining_args: + # .adjective category -delete tag + # or .adjective category tag -delete + del_idx = remaining_args.index("-delete") + tag = None + if del_idx + 1 < len(remaining_args): + tag = remaining_args[del_idx + 1] + elif del_idx > 0: + tag = remaining_args[del_idx - 1] + + if tag: + if tag in data[category]: + data[category].remove(tag) + _save_adjectives(data) + log(f"Deleted '{tag}' from '{category}'") + else: + log(f"Tag '{tag}' not found in '{category}'") + else: + log("Error: No tag specified to delete") + return 1 + + # List tags in category (Default action if no flags or after modification) + tags = data.get(category, []) + table = ResultTable(f"Tags in '{category}'") + for i, tag in enumerate(tags): + row = table.add_row() + row.add_column("#", str(i + 1)) + row.add_column("Tag", tag) + + # Selection expands to: .adjective "Category" "Tag" + # This allows typing @N -delete to delete it + table.set_row_selection_args(i, [category, tag]) + + table.set_source_command(".adjective") + ctx.set_last_result_table_overlay(table, tags) + ctx.set_current_stage_table(table) + print(table) + + return 0 + +CMDLET = Cmdlet( + name=".adjective", + aliases=["adj"], + summary="Manage adjective categories and tags", + usage=".adjective [category] [-add tag] [-delete tag]", + args=[ + CmdletArg(name="category", type="string", description="Category name", required=False), + CmdletArg(name="tag", type="string", description="Tag name", required=False), + CmdletArg(name="add", type="flag", description="Add tag"), + CmdletArg(name="delete", type="flag", description="Delete tag"), + ], + exec=_run +) diff --git a/cmdlets/check_file_status.py b/cmdlets/check_file_status.py new file mode 100644 index 0000000..468feca --- /dev/null +++ b/cmdlets/check_file_status.py @@ -0,0 +1,153 @@ +from __future__ import annotations + +from typing import Any, Dict, Sequence +import json +import sys + +from helper.logger import log + +from . 
import register +from helper import hydrus as hydrus_wrapper +from ._shared import Cmdlet, CmdletArg, normalize_hash + + +CMDLET = Cmdlet( + name="check-file-status", + summary="Check if a file is active, deleted, or corrupted in Hydrus.", + usage="check-file-status [-hash ]", + args=[ + CmdletArg("-hash", description="File hash (SHA256) to check. If not provided, uses selected result."), + ], + details=[ + "- Shows whether file is active in Hydrus or marked as deleted", + "- Detects corrupted data (e.g., comma-separated URLs)", + "- Displays file metadata and service locations", + "- Note: Hydrus keeps deleted files for recovery. Use cleanup-corrupted for full removal.", + ], +) + + +@register(["check-file-status", "check-status", "file-status", "status"]) +def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: + # Help + try: + if any(str(a).lower() in {"-?", "/?", "--help", "-h", "help", "--cmdlet"} for a in args): + log(json.dumps(CMDLET, ensure_ascii=False, indent=2)) + return 0 + except Exception: + pass + + # Parse arguments + override_hash: str | None = None + i = 0 + while i < len(args): + token = args[i] + low = str(token).lower() + if low in {"-hash", "--hash", "hash"} and i + 1 < len(args): + override_hash = str(args[i + 1]).strip() + i += 2 + continue + i += 1 + + hash_hex = normalize_hash(override_hash) if override_hash else normalize_hash(getattr(result, "hash_hex", None)) + + if not hash_hex: + log("No hash provided and no result selected", file=sys.stderr) + return 1 + + try: + client = hydrus_wrapper.get_client(config) + except Exception as exc: + log(f"Hydrus client unavailable: {exc}", file=sys.stderr) + return 1 + + if client is None: + log("Hydrus client unavailable", file=sys.stderr) + return 1 + + try: + result_data = client.fetch_file_metadata(hashes=[hash_hex]) + if not result_data.get("metadata"): + log(f"File not found: {hash_hex[:16]}...", file=sys.stderr) + return 1 + + file_info = result_data["metadata"][0] + + # Status summary + is_deleted = file_info.get("is_deleted", False) + is_local = file_info.get("is_local", False) + is_trashed = file_info.get("is_trashed", False) + + status_str = "DELETED" if is_deleted else ("TRASHED" if is_trashed else "ACTIVE") + log(f"File status: {status_str}", file=sys.stderr) + + # File info + log(f"\n📄 File Information:", file=sys.stderr) + log(f" Hash: {file_info['hash'][:16]}...", file=sys.stderr) + log(f" Size: {file_info['size']:,} bytes", file=sys.stderr) + log(f" MIME: {file_info['mime']}", file=sys.stderr) + log(f" Dimensions: {file_info.get('width', '?')}x{file_info.get('height', '?')}", file=sys.stderr) + + # Service status + file_services = file_info.get("file_services", {}) + current_services = file_services.get("current", {}) + deleted_services = file_services.get("deleted", {}) + + if current_services: + log(f"\n✓ In services ({len(current_services)}):", file=sys.stderr) + for service_key, service_info in current_services.items(): + sname = service_info.get("name", "unknown") + stype = service_info.get("type_pretty", "unknown") + log(f" - {sname} ({stype})", file=sys.stderr) + + if deleted_services: + log(f"\n✗ Deleted from services ({len(deleted_services)}):", file=sys.stderr) + for service_key, service_info in deleted_services.items(): + sname = service_info.get("name", "unknown") + stype = service_info.get("type_pretty", "unknown") + time_deleted = service_info.get("time_deleted", "?") + log(f" - {sname} ({stype}) - deleted at {time_deleted}", file=sys.stderr) + + # URL check + urls = 
file_info.get("known_urls", []) + log(f"\n🔗 URLs ({len(urls)}):", file=sys.stderr) + + corrupted_count = 0 + for i, url in enumerate(urls, 1): + if "," in url: + corrupted_count += 1 + log(f" [{i}] ⚠️ CORRUPTED (comma-separated): {url[:50]}...", file=sys.stderr) + else: + log(f" [{i}] {url[:70]}{'...' if len(url) > 70 else ''}", file=sys.stderr) + + if corrupted_count > 0: + log(f"\n⚠️ WARNING: Found {corrupted_count} corrupted URL(s)", file=sys.stderr) + + # Tags + tags_dict = file_info.get("tags", {}) + total_tags = 0 + for service_key, service_data in tags_dict.items(): + service_name = service_data.get("name", "unknown") + display_tags = service_data.get("display_tags", {}).get("0", []) + total_tags += len(display_tags) + + if total_tags > 0: + log(f"\n🏷️ Tags ({total_tags}):", file=sys.stderr) + for service_key, service_data in tags_dict.items(): + display_tags = service_data.get("display_tags", {}).get("0", []) + if display_tags: + service_name = service_data.get("name", "unknown") + log(f" {service_name}:", file=sys.stderr) + for tag in display_tags[:5]: # Show first 5 + log(f" - {tag}", file=sys.stderr) + if len(display_tags) > 5: + log(f" ... and {len(display_tags) - 5} more", file=sys.stderr) + + log("\n", file=sys.stderr) + return 0 + + except Exception as exc: + log(f"Error checking file status: {exc}", file=sys.stderr) + import traceback + traceback.print_exc(file=sys.stderr) + return 1 diff --git a/cmdlets/cleanup.py b/cmdlets/cleanup.py new file mode 100644 index 0000000..1288925 --- /dev/null +++ b/cmdlets/cleanup.py @@ -0,0 +1,110 @@ +"""Cleanup cmdlet for removing temporary artifacts from pipeline. + +This cmdlet processes result lists and removes temporary files (marked with is_temp=True), +then emits the remaining non-temporary results for further pipeline stages. +""" + +from __future__ import annotations + +from typing import Any, Dict, Sequence +from pathlib import Path +import sys + +from helper.logger import log + +from . import register +from ._shared import Cmdlet, CmdletArg, get_pipe_object_path, normalize_result_input, filter_results_by_temp +import models +import pipeline as pipeline_context + + +@register(["cleanup"]) +def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: + """Remove temporary files from pipeline results. 
+ + Accepts: + - Single result object with is_temp field + - List of result objects to clean up + + Process: + - Filters results by is_temp=True + - Deletes those files from disk + - Emits only non-temporary results + + Typical pipeline usage: + download-data url | screen-shot | add-tag "tag" --all | cleanup + """ + + # Help + try: + if any(str(a).lower() in {"-?", "/?", "--help", "-h", "help", "--cmdlet"} for a in args): + import json + log(json.dumps(CMDLET, ensure_ascii=False, indent=2)) + return 0 + except Exception: + pass + + # Normalize input to list + results = normalize_result_input(result) + + if not results: + log("[cleanup] No results to process", file=sys.stderr) + return 1 + + # Separate temporary and permanent results + temp_results = pipeline_context.filter_results_by_temp(results, include_temp=True) + perm_results = pipeline_context.filter_results_by_temp(results, include_temp=False) + + # Delete temporary files + deleted_count = 0 + for temp_result in temp_results: + try: + file_path = get_pipe_object_path(temp_result) + + if file_path: + path_obj = Path(file_path) + if path_obj.exists(): + # Delete the file + path_obj.unlink() + log(f"[cleanup] Deleted temporary file: {path_obj.name}", file=sys.stderr) + deleted_count += 1 + + # Clean up any associated sidecar files + for ext in ['.tags', '.metadata']: + sidecar = path_obj.parent / (path_obj.name + ext) + if sidecar.exists(): + try: + sidecar.unlink() + log(f"[cleanup] Deleted sidecar: {sidecar.name}", file=sys.stderr) + except Exception as e: + log(f"[cleanup] Warning: Could not delete sidecar {sidecar.name}: {e}", file=sys.stderr) + else: + log(f"[cleanup] File does not exist: {file_path}", file=sys.stderr) + except Exception as e: + log(f"[cleanup] Error deleting file: {e}", file=sys.stderr) + + # Log summary + log(f"[cleanup] Deleted {deleted_count} temporary file(s), emitting {len(perm_results)} permanent result(s)", file=sys.stderr) + + # Emit permanent results for downstream processing + for perm_result in perm_results: + pipeline_context.emit(perm_result) + + return 0 + + +CMDLET = Cmdlet( + name="cleanup", + summary="Remove temporary artifacts from pipeline (marked with is_temp=True).", + usage="cleanup", + args=[], + details=[ + "- Accepts pipeline results that may contain temporary files (screenshots, intermediate artifacts)", + "- Deletes files marked with is_temp=True from disk", + "- Also cleans up associated sidecar files (.tags, .metadata)", + "- Emits only non-temporary results for further processing", + "- Typical usage at end of pipeline: ... | add-tag \"tag\" --all | cleanup", + "- Exit code 0 if cleanup successful, 1 if no results to process", + ], +) + diff --git a/cmdlets/delete_file.py b/cmdlets/delete_file.py new file mode 100644 index 0000000..1140038 --- /dev/null +++ b/cmdlets/delete_file.py @@ -0,0 +1,242 @@ +from __future__ import annotations + +from typing import Any, Dict, Sequence +import json +import sys + +from helper.logger import log +import sqlite3 +from pathlib import Path + +import models +import pipeline as ctx +from helper import hydrus as hydrus_wrapper +from ._shared import Cmdlet, CmdletArg, normalize_hash + + + + +def _delete_database_entry(db_path: Path, file_path: str) -> bool: + """Delete file and related entries from local library database. 
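+
+ Removes the matching row from the files table together with its related
+ rows in the metadata, tags and notes tables.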
+ + Args: + db_path: Path to the library.db file + file_path: Exact file path string as stored in database + + Returns: + True if successful, False otherwise + """ + try: + if not db_path.exists(): + log(f"Database not found at {db_path}", file=sys.stderr) + return False + + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + + log(f"Searching database for file_path: {file_path}", file=sys.stderr) + + # Find the file_id using the exact file_path + cursor.execute('SELECT id FROM files WHERE file_path = ?', (file_path,)) + result = cursor.fetchone() + + if not result: + log(f"ERROR: File path not found in database", file=sys.stderr) + log(f"Expected: {file_path}", file=sys.stderr) + + # Debug: show sample entries + cursor.execute('SELECT id, file_path FROM files LIMIT 3') + samples = cursor.fetchall() + if samples: + log(f"Sample DB entries:", file=sys.stderr) + for fid, fpath in samples: + log(f"{fid}: {fpath}", file=sys.stderr) + + conn.close() + return False + + file_id = result[0] + log(f"Found file_id={file_id}, deleting all related records", file=sys.stderr) + + # Delete related records + cursor.execute('DELETE FROM metadata WHERE file_id = ?', (file_id,)) + meta_count = cursor.rowcount + + cursor.execute('DELETE FROM tags WHERE file_id = ?', (file_id,)) + tags_count = cursor.rowcount + + cursor.execute('DELETE FROM notes WHERE file_id = ?', (file_id,)) + notes_count = cursor.rowcount + + cursor.execute('DELETE FROM files WHERE id = ?', (file_id,)) + files_count = cursor.rowcount + + conn.commit() + conn.close() + + log(f"Deleted: metadata={meta_count}, tags={tags_count}, notes={notes_count}, files={files_count}", file=sys.stderr) + return True + + except Exception as exc: + log(f"Database cleanup failed: {exc}", file=sys.stderr) + import traceback + traceback.print_exc(file=sys.stderr) + return False + + +def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: + # Help + try: + if any(str(a).lower() in {"-?", "/?", "--help", "-h", "help", "--cmdlet"} for a in args): + log(json.dumps(CMDLET, ensure_ascii=False, indent=2)) + return 0 + except Exception: + pass + + # Handle @N selection which creates a list - extract the first item + if isinstance(result, list) and len(result) > 0: + result = result[0] + + # Parse overrides and options + override_hash: str | None = None + conserve: str | None = None + lib_root: str | None = None + reason_tokens: list[str] = [] + i = 0 + while i < len(args): + token = args[i] + low = str(token).lower() + if low in {"-hash", "--hash", "hash"} and i + 1 < len(args): + override_hash = str(args[i + 1]).strip() + i += 2 + continue + if low in {"-conserve", "--conserve"} and i + 1 < len(args): + value = str(args[i + 1]).strip().lower() + if value in {"local", "hydrus"}: + conserve = value + i += 2 + continue + if low in {"-lib-root", "--lib-root", "lib-root"} and i + 1 < len(args): + lib_root = str(args[i + 1]).strip() + i += 2 + continue + reason_tokens.append(token) + i += 1 + + # Handle result as either dict or object + if isinstance(result, dict): + hash_hex_raw = result.get("hash_hex") or result.get("hash") + target = result.get("target") + origin = result.get("origin") + else: + hash_hex_raw = getattr(result, "hash_hex", None) or getattr(result, "hash", None) + target = getattr(result, "target", None) + origin = getattr(result, "origin", None) + + # For Hydrus files, the target IS the hash + if origin and origin.lower() == "hydrus" and not hash_hex_raw: + hash_hex_raw = target + + hash_hex = normalize_hash(override_hash) if 
override_hash else normalize_hash(hash_hex_raw) + reason = " ".join(token for token in reason_tokens if str(token).strip()).strip() + + local_deleted = False + local_target = isinstance(target, str) and target.strip() and not str(target).lower().startswith(("http://", "https://")) + if conserve != "local" and local_target: + path = Path(str(target)) + file_path_str = str(target) # Keep the original string for DB matching + try: + if path.exists() and path.is_file(): + path.unlink() + local_deleted = True + if ctx._PIPE_ACTIVE: + ctx.emit(f"Removed local file: {path}") + log(f"Deleted: {path.name}", file=sys.stderr) + except Exception as exc: + log(f"Local delete failed: {exc}", file=sys.stderr) + + # Remove common sidecars regardless of file removal success + for sidecar in (path.with_suffix(".tags"), path.with_suffix(".tags.txt"), + path.with_suffix(".metadata"), path.with_suffix(".notes")): + try: + if sidecar.exists() and sidecar.is_file(): + sidecar.unlink() + except Exception: + pass + + # Clean up database entry if library root provided - do this regardless of file deletion success + if lib_root: + lib_root_path = Path(lib_root) + db_path = lib_root_path / ".downlow_library.db" + log(f"Attempting DB cleanup: lib_root={lib_root}, db_path={db_path}", file=sys.stderr) + log(f"Deleting DB entry for: {file_path_str}", file=sys.stderr) + if _delete_database_entry(db_path, file_path_str): + if ctx._PIPE_ACTIVE: + ctx.emit(f"Removed database entry: {path.name}") + log(f"Database entry cleaned up", file=sys.stderr) + local_deleted = True # Mark as deleted if DB cleanup succeeded + else: + log(f"Database entry not found or cleanup failed for {file_path_str}", file=sys.stderr) + else: + log(f"No lib_root provided, skipping database cleanup", file=sys.stderr) + + hydrus_deleted = False + if conserve != "hydrus" and hash_hex: + try: + client = hydrus_wrapper.get_client(config) + except Exception as exc: + if not local_deleted: + log(f"Hydrus client unavailable: {exc}", file=sys.stderr) + return 1 + else: + if client is None: + if not local_deleted: + log("Hydrus client unavailable", file=sys.stderr) + return 1 + else: + payload: Dict[str, Any] = {"hashes": [hash_hex]} + if reason: + payload["reason"] = reason + try: + client._post("/add_files/delete_files", data=payload) # type: ignore[attr-defined] + hydrus_deleted = True + preview = hash_hex[:12] + ('…' if len(hash_hex) > 12 else '') + log(f"Deleted from Hydrus: {preview}…", file=sys.stderr) + except Exception as exc: + log(f"Hydrus delete failed: {exc}", file=sys.stderr) + if not local_deleted: + return 1 + + if hydrus_deleted and hash_hex: + preview = hash_hex[:12] + ('…' if len(hash_hex) > 12 else '') + if ctx._PIPE_ACTIVE: + if reason: + ctx.emit(f"Deleted {preview} (reason: {reason}).") + else: + ctx.emit(f"Deleted {preview}.") + + if hydrus_deleted or local_deleted: + return 0 + + log("Selected result has neither Hydrus hash nor local file target") + return 1 + +CMDLET = Cmdlet( + name="delete-file", + summary="Delete a file locally and/or from Hydrus, including database entries.", + usage="delete-file [-hash ] [-conserve ] [-lib-root ] [reason]", + aliases=["del-file"], + args=[ + CmdletArg("hash", description="Override the Hydrus file hash (SHA256) to target instead of the selected result."), + CmdletArg("conserve", description="Choose which copy to keep: 'local' or 'hydrus'."), + CmdletArg("lib-root", description="Path to local library root for database cleanup."), + CmdletArg("reason", description="Optional reason for deletion (free 
text)."), + ], + details=[ + "Default removes both the local file and Hydrus file.", + "Use -conserve local to keep the local file, or -conserve hydrus to keep it in Hydrus.", + "Database entries are automatically cleaned up for local files.", + "Any remaining arguments are treated as the Hydrus reason text.", + ], +) + diff --git a/cmdlets/delete_note.py b/cmdlets/delete_note.py new file mode 100644 index 0000000..6bf9a7e --- /dev/null +++ b/cmdlets/delete_note.py @@ -0,0 +1,79 @@ +from __future__ import annotations + +from typing import Any, Dict, Sequence +import json + +from helper import hydrus as hydrus_wrapper +from ._shared import Cmdlet, CmdletArg, normalize_hash +from helper.logger import log + +CMDLET = Cmdlet( + name="delete-note", + summary="Delete a named note from a Hydrus file.", + usage="i | del-note [-hash ] ", + aliases=["del-note"], + args=[ + + ], + details=[ + "- Removes the note with the given name from the Hydrus file.", + ], +) + + +def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: + # Help + try: + if any(str(a).lower() in {"-?", "/?", "--help", "-h", "help", "--cmdlet"} for a in args): + log(json.dumps(CMDLET, ensure_ascii=False, indent=2)) + return 0 + except Exception: + pass + if not args: + log("Requires the note name/key to delete") + return 1 + override_hash: str | None = None + rest: list[str] = [] + i = 0 + while i < len(args): + a = args[i] + low = str(a).lower() + if low in {"-hash", "--hash", "hash"} and i + 1 < len(args): + override_hash = str(args[i + 1]).strip() + i += 2 + continue + rest.append(a) + i += 1 + if not rest: + log("Requires the note name/key to delete") + return 1 + name = str(rest[0] or '').strip() + if not name: + log("Requires a non-empty note name/key") + return 1 + + # Handle @N selection which creates a list - extract the first item + if isinstance(result, list) and len(result) > 0: + result = result[0] + + hash_hex = normalize_hash(override_hash) if override_hash else normalize_hash(getattr(result, "hash_hex", None)) + if not hash_hex: + log("Selected result does not include a Hydrus hash") + return 1 + try: + client = hydrus_wrapper.get_client(config) + except Exception as exc: + log(f"Hydrus client unavailable: {exc}") + return 1 + + if client is None: + log("Hydrus client unavailable") + return 1 + try: + service_name = "my notes" + client.delete_notes(hash_hex, [name], service_name) + except Exception as exc: + log(f"Hydrus delete-note failed: {exc}") + return 1 + log(f"Deleted note '{name}'") + return 0 diff --git a/cmdlets/delete_tag.py b/cmdlets/delete_tag.py new file mode 100644 index 0000000..c131ec8 --- /dev/null +++ b/cmdlets/delete_tag.py @@ -0,0 +1,219 @@ +from __future__ import annotations + +from typing import Any, Dict, Sequence +import json + +from . import register +import models +import pipeline as ctx +from helper import hydrus as hydrus_wrapper +from ._shared import Cmdlet, CmdletArg, normalize_hash, parse_tag_arguments +from helper.logger import log + + +CMDLET = Cmdlet( + name="delete-tags", + summary="Remove tags from a Hydrus file.", + usage="del-tags [-hash ] [,...]", + aliases=["del-tag", "del-tags", "delete-tag"], + args=[ + CmdletArg("-hash", description="Override the Hydrus file hash (SHA256) to target instead of the selected result."), + CmdletArg("[,...]", required=True, description="One or more tags to remove. 
Comma- or space-separated."), + ], + details=[ + "- Requires a Hydrus file (hash present) or explicit -hash override.", + "- Multiple tags can be comma-separated or space-separated.", + ], +) + +@register(["del-tag", "del-tags", "delete-tag", "delete-tags"]) # Still needed for backward compatibility +def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: + # Help + try: + if any(str(a).lower() in {"-?", "/?", "--help", "-h", "help", "--cmdlet"} for a in args): + log(json.dumps(CMDLET, ensure_ascii=False, indent=2)) + return 0 + except Exception: + pass + + # Check if we have a piped TagItem with no args (i.e., from @1 | delete-tag) + has_piped_tag = (result and hasattr(result, '__class__') and + result.__class__.__name__ == 'TagItem' and + hasattr(result, 'tag_name')) + + # Check if we have a piped list of TagItems (from @N selection) + has_piped_tag_list = (isinstance(result, list) and result and + hasattr(result[0], '__class__') and + result[0].__class__.__name__ == 'TagItem') + + if not args and not has_piped_tag and not has_piped_tag_list: + log("Requires at least one tag argument") + return 1 + + # Parse -hash override and collect tags from remaining args + override_hash: str | None = None + rest: list[str] = [] + i = 0 + while i < len(args): + a = args[i] + low = str(a).lower() + if low in {"-hash", "--hash", "hash"} and i + 1 < len(args): + override_hash = str(args[i + 1]).strip() + i += 2 + continue + rest.append(a) + i += 1 + + # Check if first argument is @ syntax (result table selection) + # @5 or @{2,5,8} to delete tags from ResultTable by index + tags_from_at_syntax = [] + hash_from_at_syntax = None + + if rest and str(rest[0]).startswith("@"): + selector_arg = str(rest[0]) + pipe_selector = selector_arg[1:].strip() + # Parse @N or @{N,M,K} syntax + if pipe_selector.startswith("{") and pipe_selector.endswith("}"): + # @{2,5,8} + pipe_selector = pipe_selector[1:-1] + try: + indices = [int(tok.strip()) for tok in pipe_selector.split(',') if tok.strip()] + except ValueError: + log("Invalid selection syntax. 
Use @2 or @{2,5,8}") + return 1 + + # Get the last ResultTable from pipeline context + try: + last_table = ctx._LAST_RESULT_TABLE + if last_table: + # Extract tags from selected rows + for idx in indices: + if 1 <= idx <= len(last_table.rows): + # Look for a TagItem in _LAST_RESULT_ITEMS by index + if idx - 1 < len(ctx._LAST_RESULT_ITEMS): + item = ctx._LAST_RESULT_ITEMS[idx - 1] + if hasattr(item, '__class__') and item.__class__.__name__ == 'TagItem': + tag_name = getattr(item, 'tag_name', None) + if tag_name: + log(f"[delete_tag] Extracted tag from @{idx}: {tag_name}") + tags_from_at_syntax.append(tag_name) + # Also get hash from first item for consistency + if not hash_from_at_syntax: + hash_from_at_syntax = getattr(item, 'hash_hex', None) + + if not tags_from_at_syntax: + log(f"No tags found at indices: {indices}") + return 1 + else: + log("No ResultTable in pipeline (use @ after running get-tag)") + return 1 + except Exception as exc: + log(f"Error processing @ selection: {exc}", file=__import__('sys').stderr) + return 1 + + # Handle @N selection which creates a list - extract the first item + if isinstance(result, list) and len(result) > 0: + # If we have a list of TagItems, we want to process ALL of them if no args provided + # This handles: delete-tag @1 (where @1 expands to a list containing one TagItem) + if not args and hasattr(result[0], '__class__') and result[0].__class__.__name__ == 'TagItem': + # We will extract tags from the list later + pass + else: + result = result[0] + + # Determine tags and hash to use + tags: list[str] = [] + hash_hex = None + + if tags_from_at_syntax: + # Use tags extracted from @ syntax + tags = tags_from_at_syntax + hash_hex = normalize_hash(override_hash) if override_hash else normalize_hash(hash_from_at_syntax) + log(f"[delete_tag] Using @ syntax extraction: {len(tags)} tag(s) to delete: {tags}") + elif isinstance(result, list) and result and hasattr(result[0], '__class__') and result[0].__class__.__name__ == 'TagItem': + # Got a list of TagItems (e.g. 
from delete-tag @1) + tags = [getattr(item, 'tag_name') for item in result if getattr(item, 'tag_name', None)] + # Use hash from first item + hash_hex = normalize_hash(override_hash) if override_hash else normalize_hash(getattr(result[0], "hash_hex", None)) + elif result and hasattr(result, '__class__') and result.__class__.__name__ == 'TagItem': + # Got a piped TagItem - delete this specific tag + tag_name = getattr(result, 'tag_name', None) + if tag_name: + tags = [tag_name] + hash_hex = normalize_hash(override_hash) if override_hash else normalize_hash(getattr(result, "hash_hex", None)) + else: + # Traditional mode - parse tag arguments + tags = parse_tag_arguments(rest) + hash_hex = normalize_hash(override_hash) if override_hash else normalize_hash(getattr(result, "hash_hex", None)) + + if not tags: + log("No valid tags were provided") + return 1 + + if not hash_hex: + log("Selected result does not include a hash") + return 1 + + try: + service_name = hydrus_wrapper.get_tag_service_name(config) + except Exception as exc: + log(f"Failed to resolve tag service: {exc}") + return 1 + + try: + client = hydrus_wrapper.get_client(config) + except Exception as exc: + log(f"Hydrus client unavailable: {exc}") + return 1 + + if client is None: + log("Hydrus client unavailable") + return 1 + + log(f"[delete_tag] Sending deletion request: hash={hash_hex}, tags={tags}, service={service_name}") + try: + result = client.delete_tags(hash_hex, tags, service_name) + log(f"[delete_tag] Hydrus response: {result}") + except Exception as exc: + log(f"Hydrus del-tag failed: {exc}") + return 1 + + preview = hash_hex[:12] + ('…' if len(hash_hex) > 12 else '') + log(f"Removed {len(tags)} tag(s) from {preview} via '{service_name}'.") + + # Re-fetch and emit updated tags after deletion + try: + payload = client.fetch_file_metadata(hashes=[str(hash_hex)], include_service_keys_to_tags=True, include_file_urls=False) + items = payload.get("metadata") if isinstance(payload, dict) else None + if isinstance(items, list) and items: + meta = items[0] if isinstance(items[0], dict) else None + if isinstance(meta, dict): + # Extract tags from updated metadata + from cmdlets.get_tag import _extract_my_tags_from_hydrus_meta, TagItem + service_key = hydrus_wrapper.get_tag_service_key(client, service_name) + updated_tags = _extract_my_tags_from_hydrus_meta(meta, service_key, service_name) + + # Emit updated tags as TagItem objects + from result_table import ResultTable + table = ResultTable("Tags", max_columns=2) + tag_items = [] + for idx, tag_name in enumerate(updated_tags, start=1): + tag_item = TagItem( + tag_name=tag_name, + tag_index=idx, + hash_hex=hash_hex, + source="hydrus", + service_name=service_name, + ) + tag_items.append(tag_item) + table.add_result(tag_item) + ctx.emit(tag_item) + + # Store items for @ selection in next command (CLI will handle table management) + # Don't call set_last_result_table so we don't pollute history or table context + except Exception as exc: + log(f"Warning: Could not fetch updated tags after deletion: {exc}", file=__import__('sys').stderr) + + return 0 + + diff --git a/cmdlets/delete_url.py b/cmdlets/delete_url.py new file mode 100644 index 0000000..1000dd1 --- /dev/null +++ b/cmdlets/delete_url.py @@ -0,0 +1,82 @@ +from __future__ import annotations + +from typing import Any, Dict, Sequence +import json + +from . 
import register +from helper import hydrus as hydrus_wrapper +from ._shared import Cmdlet, CmdletArg, normalize_hash +from helper.logger import log + +CMDLET = Cmdlet( + name="delete-url", + summary="Remove a URL association from a Hydrus file.", + usage="delete-url [-hash ] ", + args=[ + CmdletArg("-hash", description="Override the Hydrus file hash (SHA256) to target instead of the selected result."), + CmdletArg("", required=True, description="The URL to remove from the file."), + ], + details=[ + "- Removes the URL from the Hydrus file's known URL list.", + ], +) + + +def _parse_hash_and_rest(args: Sequence[str]) -> tuple[str | None, list[str]]: + override_hash: str | None = None + rest: list[str] = [] + i = 0 + while i < len(args): + a = args[i] + low = str(a).lower() + if low in {"-hash", "--hash", "hash"} and i + 1 < len(args): + override_hash = str(args[i + 1]).strip() + i += 2 + continue + rest.append(a) + i += 1 + return override_hash, rest + + +@register(["del-url", "delete-url", "delete_url"]) # aliases +def delete(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: + # Help + try: + if any(str(a).lower() in {"-?", "/?", "--help", "-h", "help", "--cmdlet"} for a in args): + log(json.dumps(CMDLET, ensure_ascii=False, indent=2)) + return 0 + except Exception: + pass + override_hash, rest = _parse_hash_and_rest(args) + if not rest: + log("Requires a URL argument") + return 1 + url = str(rest[0] or '').strip() + if not url: + log("Requires a non-empty URL") + return 1 + + # Handle @N selection which creates a list - extract the first item + if isinstance(result, list) and len(result) > 0: + result = result[0] + + hash_hex = normalize_hash(override_hash) if override_hash else normalize_hash(getattr(result, "hash_hex", None)) + if not hash_hex: + log("Selected result does not include a Hydrus hash") + return 1 + try: + client = hydrus_wrapper.get_client(config) + except Exception as exc: + log(f"Hydrus client unavailable: {exc}") + return 1 + + if client is None: + log("Hydrus client unavailable") + return 1 + try: + client.delete_url(hash_hex, url) + except Exception as exc: + log(f"Hydrus del-url failed: {exc}") + return 1 + log(f"Deleted URL: {url}") + return 0 diff --git a/cmdlets/download_data.py b/cmdlets/download_data.py new file mode 100644 index 0000000..222fd7f --- /dev/null +++ b/cmdlets/download_data.py @@ -0,0 +1,2633 @@ +"""Download data from URLs using yt-dlp with playlist, clipping, and format selection. + +This is a merged implementation combining: +- cmdlets/download_data.py (pipeline wrapper) +- funact/download_data.py (feature-rich implementation) +- helper/download.py (low-level machinery) + +Features: +- Direct file downloads and yt-dlp streaming sites +- Playlist detection with interactive track selection +- Clip extraction (time ranges like 34:03-35:08) +- Format selection and audio/video toggles +- Cookies file support +- Tag extraction and metadata integration +- Progress tracking and debug logging +- Pipeline integration with result emission +- Background torrent/magnet downloads via AllDebrid +""" + +from __future__ import annotations + +import hashlib +import re +import sys +import threading +import time +from pathlib import Path +from typing import Any, Dict, List, Optional, Sequence, Tuple +import uuid + +from helper.logger import log, debug +from helper.download import download_media, probe_url +from helper.utils import sha256_file +from models import DownloadOptions + +from . 
import register +from ._shared import Cmdlet, CmdletArg, SharedArgs, create_pipe_object_result, normalize_result_input, parse_cmdlet_args +import models +import pipeline as pipeline_context +from config import resolve_output_dir +from metadata import ( + fetch_openlibrary_metadata_tags, + format_playlist_entry, + extract_ytdlp_tags +) + +# ============================================================================ +# Try to import optional dependencies +# ============================================================================ + +try: + from yt_dlp.utils import sanitize_filename as ytdlp_sanitize_filename # type: ignore +except Exception: # pragma: no cover - optional dependency + ytdlp_sanitize_filename = None + + +# ============================================================================ +# Background Worker for AllDebrid Downloads +# ============================================================================ + +def _download_torrent_worker( + worker_id: str, + magnet_url: str, + output_dir: Path, + config: Dict[str, Any], + api_key: str, + playlist_items: Optional[str] = None, + audio_mode: bool = False, + wait_timeout: int = 600, + worker_manager: Optional[Any] = None, +) -> None: + """Background worker to download torrent/magnet via AllDebrid. + + Runs in a separate thread and updates worker_manager with progress. + + Args: + worker_id: Unique ID for this worker task + magnet_url: Magnet link or .torrent URL to download + output_dir: Directory to save downloaded files + config: Configuration dict + api_key: AllDebrid API key + playlist_items: Optional file selection (e.g., "1,3,5-8") + audio_mode: Whether to tag as audio or video + wait_timeout: Timeout in seconds for magnet processing + worker_manager: WorkerManager instance for progress updates + """ + worker = None + downloaded_files = [] + + try: + from helper.alldebrid import AllDebridClient + + # Get worker reference if manager provided + if worker_manager: + try: + workers = worker_manager.get_active_workers() + worker = next((w for w in workers if w.get('id') == worker_id), None) + except: + worker = None + + def log_progress(message: str) -> None: + """Log progress to both console and worker manager.""" + debug(message) + if worker_manager and worker_id: + try: + worker_manager.log_step(worker_id, message) + except: + pass + + log_progress(f"[Worker {worker_id}] Submitting magnet to AllDebrid...") + client = AllDebridClient(api_key) + + # Add magnet + magnet_info = client.magnet_add(magnet_url) + magnet_id = int(magnet_info.get('id', 0)) + + if magnet_id <= 0: + log_progress(f"[Worker {worker_id}] ✗ Failed to add magnet to AllDebrid") + if worker_manager: + try: + worker_manager.finish_worker(worker_id, "failed", f"Failed to add magnet") + except: + pass + return + + log_progress(f"[Worker {worker_id}] ✓ Magnet added (ID: {magnet_id})") + + # Poll for ready status + elapsed = 0 + last_status_reported = 0 + + while elapsed < wait_timeout: + try: + status_info = client.magnet_status(magnet_id) + except Exception as e: + log_progress(f"[Worker {worker_id}] ⚠ Failed to get status: {e}") + time.sleep(2) + elapsed += 2 + continue + + status_code = status_info.get('statusCode', -1) + status_text = status_info.get('status', 'Unknown') + + # Report progress every 5 seconds (avoid log spam) + if elapsed - last_status_reported >= 5 or elapsed < 2: + downloaded = status_info.get('downloaded', 0) + total_size = status_info.get('size', 0) + seeders = status_info.get('seeders', 0) + speed = status_info.get('downloadSpeed', 0) + + 
if total_size > 0: + percent = (downloaded / total_size) * 100 + speed_str = f" @ {speed / (1024**2):.1f} MB/s" if speed > 0 else "" + seeders_str = f" ({seeders} seeders)" if seeders > 0 else "" + progress_msg = f"[Worker {worker_id}] ⧗ {status_text}: {percent:.1f}% ({downloaded / (1024**3):.2f} / {total_size / (1024**3):.2f} GB){speed_str}{seeders_str}" + log_progress(progress_msg) + + # Update worker with progress + if worker_manager: + try: + worker_manager.update_worker( + worker_id, + status="running", + progress=f"{percent:.1f}%", + details=progress_msg + ) + except: + pass + else: + log_progress(f"[Worker {worker_id}] ⧗ {status_text}...") + + last_status_reported = elapsed + + if status_code == 4: # Ready + log_progress(f"[Worker {worker_id}] ✓ Files ready") + break + elif status_code >= 5: # Error + error_status = { + 5: "Upload failed", + 6: "Internal error during unpacking", + 7: "Not downloaded in 20 minutes", + 8: "File too big (>1TB)", + 9: "Internal error", + 10: "Download took >72 hours", + 11: "Deleted on hoster website", + 12: "Processing failed", + 13: "Processing failed", + 14: "Tracker error", + 15: "No peers available" + } + error_msg = error_status.get(status_code, f"Unknown error {status_code}") + log_progress(f"[Worker {worker_id}] ✗ Magnet failed: {error_msg}") + if worker_manager: + try: + worker_manager.finish_worker(worker_id, "failed", error_msg) + except: + pass + return + + time.sleep(2) + elapsed += 2 + + if elapsed >= wait_timeout: + log_progress(f"[Worker {worker_id}] ✗ Timeout waiting for magnet (>{wait_timeout}s)") + if worker_manager: + try: + worker_manager.finish_worker(worker_id, "failed", f"Timeout after {wait_timeout}s") + except: + pass + return + + # Get files + files_result = client.magnet_links([magnet_id]) + magnet_files = files_result.get(str(magnet_id), {}) + if not magnet_files and isinstance(magnet_id, int): + # Try integer key as fallback + for key in files_result: + if str(key) == str(magnet_id): + magnet_files = files_result[key] + break + files_array = magnet_files.get('files', []) + + if not files_array: + log_progress(f"[Worker {worker_id}] ✗ No files found in magnet") + if worker_manager: + try: + worker_manager.finish_worker(worker_id, "failed", "No files found in magnet") + except: + pass + return + + log_progress(f"[Worker {worker_id}] ✓ Found {len(files_array)} file(s)") + + # Extract download links + download_links = [] + + def extract_links(items, prefix=""): + if not isinstance(items, list): + return + for item in items: + if isinstance(item, dict): + name = item.get('n', '') + link = item.get('l', '') + size = item.get('s', 0) + entries = item.get('e', []) + + if link: + download_links.append({ + 'link': link, + 'name': name, + 'size': size, + 'path': f"{prefix}/{name}" if prefix else name + }) + + if entries: + extract_links(entries, f"{prefix}/{name}" if prefix else name) + + extract_links(files_array) + + if not download_links: + log_progress(f"[Worker {worker_id}] ✗ No downloadable files found") + if worker_manager: + try: + worker_manager.finish_worker(worker_id, "failed", "No downloadable files") + except: + pass + return + + # Filter by playlist_items if specified + if playlist_items and playlist_items != '*': + # Parse selection like "1,3,5-8" + selected_indices = [] + for part in playlist_items.split(','): + part = part.strip() + if '-' in part: + start, end = part.split('-') + selected_indices.extend(range(int(start)-1, int(end))) + else: + selected_indices.append(int(part)-1) + + download_links = 
[download_links[i] for i in selected_indices if i < len(download_links)] + log_progress(f"[Worker {worker_id}] Downloading {len(download_links)} selected file(s)") + + # Download each file + for idx, file_info in enumerate(download_links, 1): + link = file_info['link'] + name = file_info['name'] + + log_progress(f"[Worker {worker_id}] ({idx}/{len(download_links)}) Downloading: {name}") + + try: + # Unlock the link + try: + actual_link = client.unlock_link(link) + if actual_link and actual_link != link: + link = actual_link + except: + pass + + # Download via HTTP + from helper.http_client import HTTPClient + + output_dir.mkdir(parents=True, exist_ok=True) + file_path = output_dir / name + file_path.parent.mkdir(parents=True, exist_ok=True) + + with HTTPClient() as http_client: + http_client.download(link, str(file_path)) + + log_progress(f"[Worker {worker_id}] ✓ Downloaded: {name}") + + # Compute hash and emit result + file_hash = _compute_file_hash(file_path) + + result_obj = { + 'file_path': str(file_path), + 'source_url': magnet_url, + 'file_hash': file_hash, + 'media_kind': 'audio' if audio_mode else 'video', + } + + pipeline_context.emit(result_obj) + downloaded_files.append(file_path) + + except Exception as e: + log_progress(f"[Worker {worker_id}] ⚠ Failed to download {name}: {e}") + + if downloaded_files: + msg = f"✓ Torrent download complete ({len(downloaded_files)} file(s))" + log_progress(f"[Worker {worker_id}] {msg}") + if worker_manager: + try: + worker_manager.finish_worker(worker_id, "success", msg) + except: + pass + else: + if worker_manager: + try: + worker_manager.finish_worker(worker_id, "failed", "No files downloaded") + except: + pass + + except ImportError: + log_progress(f"[Worker {worker_id}] ✗ AllDebrid client not available") + if worker_manager: + try: + worker_manager.finish_worker(worker_id, "failed", "AllDebrid client not available") + except: + pass + except Exception as e: + import traceback + log_progress(f"[Worker {worker_id}] ✗ Torrent download failed: {e}") + if worker_manager: + try: + worker_manager.finish_worker(worker_id, "failed", str(e)) + except: + pass + traceback.print_exc(file=sys.stderr) + + +# ============================================================================ +# CMDLET Metadata Declaration +# ============================================================================ + + + + +# ============================================================================ +# Torrent File Parsing +# ============================================================================ + +def _parse_torrent_file(file_path: str) -> Optional[str]: + """Parse a .torrent file and extract magnet link. + + Args: + file_path: Path to .torrent file + + Returns: + Magnet link string or None if parsing fails + """ + try: + import bencode3 + except ImportError: + log("⚠ bencode3 module not found. 
Install: pip install bencode3", file=sys.stderr) + return None + + try: + with open(file_path, 'rb') as f: + torrent_data = bencode3.bdecode(f.read()) + except Exception as e: + log(f"✗ Failed to parse torrent file: {e}", file=sys.stderr) + return None + + try: + # Get info dict - bencode3 returns string keys, not bytes + info = torrent_data.get('info') + if not info: + log("✗ No info dict in torrent file", file=sys.stderr) + return None + + # Calculate info hash (SHA1 of bencoded info dict) + import hashlib + info_hash = hashlib.sha1(bencode3.bencode(info)).hexdigest() + + # Get name + name = info.get('name', 'Unknown') + if isinstance(name, bytes): + name = name.decode('utf-8', errors='ignore') + + # Create magnet link + magnet = f"magnet:?xt=urn:btih:{info_hash}&dn={name}" + + # Add trackers if available + announce = torrent_data.get('announce') + if announce: + try: + tracker = announce if isinstance(announce, str) else announce.decode('utf-8', errors='ignore') + magnet += f"&tr={tracker}" + except: + pass + + announce_list = torrent_data.get('announce-list', []) + for tier in announce_list: + if isinstance(tier, list): + for tracker_item in tier: + try: + tracker = tracker_item if isinstance(tracker_item, str) else tracker_item.decode('utf-8', errors='ignore') + if tracker: + magnet += f"&tr={tracker}" + except: + pass + + debug(f"✓ Parsed torrent: {name} (hash: {info_hash})") + return magnet + + except Exception as e: + log(f"✗ Error parsing torrent metadata: {e}", file=sys.stderr) + return None + + +def _download_torrent_file(url: str, temp_dir: Optional[Path] = None) -> Optional[str]: + """Download a .torrent file from URL and parse it. + + Args: + url: URL to .torrent file + temp_dir: Optional temp directory for storing downloaded file + + Returns: + Magnet link string or None if download/parsing fails + """ + try: + from helper.http_client import HTTPClient + except ImportError: + log("⚠ HTTPClient not available", file=sys.stderr) + return None + + try: + # Download torrent file + debug(f"⇓ Downloading torrent file: {url}") + + with HTTPClient(timeout=30.0) as client: + response = client.get(url) + response.raise_for_status() + torrent_data = response.content + + # Create temp file + if temp_dir is None: + temp_dir = Path.home() / ".cache" / "downlow" + temp_dir.mkdir(parents=True, exist_ok=True) + + # Save to temp file + import hashlib + url_hash = hashlib.md5(url.encode()).hexdigest()[:8] + temp_file = temp_dir / f"torrent_{url_hash}.torrent" + temp_file.write_bytes(torrent_data) + + debug(f"✓ Downloaded torrent file: {temp_file}") + + # Parse it + magnet = _parse_torrent_file(str(temp_file)) + + # Clean up + try: + temp_file.unlink() + except: + pass + + return magnet + + except Exception as e: + log(f"✗ Failed to download/parse torrent: {e}", file=sys.stderr) + return None + + +def _is_torrent_file_or_url(arg: str) -> bool: + """Check if argument is a .torrent file path or URL. + + Args: + arg: Argument to check + + Returns: + True if it's a .torrent file or URL + """ + arg_lower = arg.lower() + + # Check if it's a .torrent file path + if arg_lower.endswith('.torrent'): + return Path(arg).exists() or arg_lower.startswith('http') + + # Check if it's a URL to .torrent file + if arg_lower.startswith('http://') or arg_lower.startswith('https://'): + return '.torrent' in arg_lower + + return False + + +def _process_torrent_input(arg: str) -> Optional[str]: + """Process torrent file or URL and convert to magnet link. 
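
The torrent helpers above reduce a .torrent to a magnet URI by hashing the bencoded info dict with SHA1 and appending the display name and any trackers. A minimal standalone sketch of that assembly follows, for illustration only; build_magnet and the hash value are hypothetical, and unlike the patch it URL-encodes the name and tracker parameters:

from urllib.parse import quote

def build_magnet(info_hash_hex: str, name: str, trackers: list[str]) -> str:
    # xt carries the BitTorrent info hash; the patch derives it as
    # hashlib.sha1(bencode3.bencode(info)).hexdigest()
    magnet = f"magnet:?xt=urn:btih:{info_hash_hex}&dn={quote(name)}"
    for tracker in trackers:
        magnet += f"&tr={quote(tracker)}"
    return magnet

print(build_magnet("0123456789abcdef0123456789abcdef01234567", "example.iso",
                   ["udp://tracker.example.org:1337/announce"]))
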
+ + Args: + arg: .torrent file path or URL + + Returns: + Magnet link or original argument if not processable + """ + try: + if arg.lower().startswith('http://') or arg.lower().startswith('https://'): + # It's a URL + return _download_torrent_file(arg) or arg + else: + # It's a file path + if Path(arg).exists(): + return _parse_torrent_file(arg) or arg + else: + return arg + except Exception as e: + log(f"⚠ Error processing torrent: {e}", file=sys.stderr) + return arg + + +# ============================================================================ +# Helper Functions +# ============================================================================ + + + + +def _show_playlist_table(url: str, probe_info: Dict[str, Any]) -> Optional[Dict[str, Any]]: + """Show playlist result table and get user selection. + + Args: + url: Original URL + probe_info: Info dict from probe_url() + + Returns: + Modified probe_info with selected_entries, or None if user cancelled + """ + entries = probe_info.get("entries", []) + if not entries: + return probe_info + + extractor = probe_info.get("extractor", "") + title = probe_info.get("title", "Playlist") + + debug(f"📋 Detected playlist: {title} ({len(entries)} items) - {extractor}") + + # Skip full metadata enrichment for speed - extract_flat usually provides enough info + # debug("📋 Fetching metadata for each item...") + # entries = enrich_playlist_entries(entries, extractor) + + # Emit each playlist item as a separate result row + for i, entry in enumerate(entries, 1): + formatted = format_playlist_entry(entry, i, extractor) + + # Build tags from available metadata + tags = [] + artist = formatted.get("artist") or formatted.get("uploader", "") + if artist: + tags.append(artist) + + album = formatted.get("album", "") + if album and album != title: # Don't repeat playlist title + tags.append(album) + + # Extract individual fields for separate columns + duration = formatted.get("duration", 0) + duration_str = "" + if duration: + minutes = int(duration // 60) + seconds = int(duration % 60) + duration_str = f"{minutes}m{seconds}s" + tags.append(duration_str) + + # Normalize extractor for comparison (remove special chars and case) + ext_lower = extractor.lower().replace(":", "").replace(" ", "") + + track_number = None + # Add site-specific tags and fields + if "youtube" in ext_lower and formatted.get("channel"): + tags.append(f"channel:{formatted.get('channel')}") + elif "bandcamp" in ext_lower: + track_number = formatted.get("track_number", i) + tags.append(f"track:{track_number}") + + # Create result row with separate columns for important metadata + # Build columns dynamically based on available data + columns = [ + ("#", i), + ("Title", formatted["title"]), + ] + + # Add Artist column if available + if artist: + columns.append(("Artist", artist)) + + # Add Duration column if available + if duration_str: + columns.append(("Duration", duration_str)) + + # Add Track number column for music platforms + if track_number is not None: + columns.append(("Track", str(track_number))) + + # Add Tags column for remaining tags (if any) + remaining_tags = [t for t in tags if t not in [artist, duration_str]] + if remaining_tags: + columns.append(("Tags", ", ".join(remaining_tags))) + + # Create result row with compact columns display + # Using "columns" field tells ResultTable which columns to show + result_row = { + "title": formatted["title"], + "tags": tags, + "index": i, + # Store all metadata but don't display in table (use columns field) + "__source": "playlist-probe", + 
"__id": f"{i}", + "__file_path": url, + "__action": f"playlist-item:{i}", + "__artist": formatted.get("artist", ""), + "__duration": formatted.get("duration", 0), + "__extractor": extractor, + # Define which columns should be shown in the result table + "columns": columns + } + + # Add site-specific metadata for pipeline use + if "youtube" in ext_lower: + result_row["__video_id"] = formatted.get("video_id", "") + result_row["__channel"] = formatted.get("channel", "") + elif "bandcamp" in ext_lower: + result_row["__track_number"] = formatted.get("track_number", i) + result_row["__album"] = formatted.get("album") or title + elif "spotify" in ext_lower: + result_row["__artists"] = formatted.get("artists", "") + result_row["__album"] = formatted.get("album", "") + + pipeline_context.emit(result_row) + + debug(f"ℹ️ Playlist items displayed. Use result table references (@1, @2, etc.) to select tracks.") + + # Return modified probe info + return probe_info + + +def _parse_time_range(clip_spec: str) -> Optional[Tuple[int, int]]: + """Parse time range from MM:SS-MM:SS or seconds format. + + Args: + clip_spec: Time range string like "34:03-35:08" or "2043-2108" + + Returns: + Tuple of (start_seconds, end_seconds) or None if invalid + """ + try: + if '-' not in clip_spec: + return None + + parts = clip_spec.split('-') + if len(parts) != 2: + return None + + start_str, end_str = parts + + # Try MM:SS format first + if ':' in start_str: + start_parts = start_str.split(':') + if len(start_parts) == 2: + start_sec = int(start_parts[0]) * 60 + int(start_parts[1]) + else: + return None + else: + start_sec = int(start_str) + + if ':' in end_str: + end_parts = end_str.split(':') + if len(end_parts) == 2: + end_sec = int(end_parts[0]) * 60 + int(end_parts[1]) + else: + return None + else: + end_sec = int(end_str) + + if start_sec >= end_sec: + return None + + return (start_sec, end_sec) + + except (ValueError, AttributeError): + return None + + +MEDIA_EXTENSIONS = {'.mp3', '.m4a', '.mp4', '.mkv', '.webm', '.flac', '.wav', '.aac'} + + +def _parse_playlist_selection_indices(selection: Optional[str], total_items: int) -> list[int]: + """Convert playlist selection string to 0-based indices.""" + if total_items <= 0: + return [] + if not selection or selection.strip() in {"*", ""}: + return list(range(total_items)) + indices: list[int] = [] + for part in selection.split(','): + part = part.strip() + if not part: + continue + if '-' in part: + bounds = part.split('-', 1) + try: + start = int(bounds[0]) + end = int(bounds[1]) + except ValueError: + continue + if start <= 0 or end <= 0: + continue + if start > end: + start, end = end, start + for idx in range(start - 1, end): + if 0 <= idx < total_items: + indices.append(idx) + else: + try: + idx = int(part) - 1 + except ValueError: + continue + if 0 <= idx < total_items: + indices.append(idx) + seen: set[int] = set() + ordered: list[int] = [] + for idx in indices: + if idx not in seen: + ordered.append(idx) + seen.add(idx) + return ordered + + +def _select_playlist_entries(entries: Any, selection: Optional[str]) -> list[Dict[str, Any]]: + """Pick playlist entries according to a selection string.""" + if not isinstance(entries, list): + return [] + indices = _parse_playlist_selection_indices(selection, len(entries)) + if not indices: + return [] + selected: list[Dict[str, Any]] = [] + for idx in indices: + entry = entries[idx] + if isinstance(entry, dict): + selected.append(entry) + return selected + + +def _sanitize_title_for_filename(title: Optional[str]) -> str: + 
"""Match yt-dlp's restricted filename sanitization for comparisons.""" + if not title: + return "" + if ytdlp_sanitize_filename: + try: + return ytdlp_sanitize_filename(title, restricted=True) + except Exception: + pass + sanitized = re.sub(r"[^0-9A-Za-z._-]+", "_", title) + return sanitized.strip() or "" + + +def _find_playlist_files_from_entries( + entries: Sequence[Dict[str, Any]], + output_dir: Path, +) -> list[Path]: + """Resolve expected playlist files based on entry titles/exts.""" + matched: list[Path] = [] + seen: set[str] = set() + for entry in entries: + title = entry.get('title') if isinstance(entry, dict) else None + sanitized = _sanitize_title_for_filename(title) + if not sanitized: + continue + preferred_exts: list[str] = [] + for key in ('ext', 'audio_ext', 'video_ext'): + value = entry.get(key) if isinstance(entry, dict) else None + if isinstance(value, str) and value: + preferred_exts.append(value.lower()) + if not preferred_exts: + preferred_exts = [ext.strip('.') for ext in MEDIA_EXTENSIONS] + candidate: Optional[Path] = None + for ext in preferred_exts: + ext = ext.lstrip('.').lower() + path = output_dir / f"{sanitized}.{ext}" + if path.exists(): + candidate = path + break + if candidate is None: + try: + # Bandcamp/yt-dlp often prefixes uploader info, so fall back to a substring match. + for f in output_dir.glob(f"*{sanitized}*"): + if f.suffix.lower() in MEDIA_EXTENSIONS and f.is_file(): + candidate = f + break + except OSError: + candidate = None + if candidate and str(candidate) not in seen: + matched.append(candidate) + seen.add(str(candidate)) + return matched + + +def _snapshot_playlist_paths( + entries: Sequence[Dict[str, Any]], + output_dir: Path, +) -> tuple[list[Path], set[str]]: + """Capture current playlist file paths for a given selection.""" + matches = _find_playlist_files_from_entries(entries, output_dir) + resolved: set[str] = set() + for path in matches: + try: + resolved.add(str(path.resolve())) + except OSError: + resolved.add(str(path)) + return matches, resolved + + +def _expand_playlist_selection(selection: str, num_items: int) -> str: + """Expand playlist selection string, handling wildcards. + + Args: + selection: Selection string like '1,3,5-8' or '*' + num_items: Total number of items in playlist + + Returns: + Expanded selection string like '1,3,5,6,7,8' or '1-18' for '*' + """ + if selection.strip() == "*": + # Wildcard: select all items + return f"1-{num_items}" + + # Return as-is if not wildcard (yt-dlp will handle ranges and lists) + return selection + + +def _parse_selection_string(selection: str) -> List[int]: + """Parse selection string into list of integers. + + Handles formats like: + - "2" -> [2] + - "1,3,5" -> [1, 3, 5] + - "1-3" -> [1, 2, 3] + - "1,3-5,7" -> [1, 3, 4, 5, 7] + + Args: + selection: Selection string + + Returns: + List of integer indices + """ + result = [] + for part in selection.split(','): + part = part.strip() + if '-' in part: + # Range like "3-5" + try: + start, end = part.split('-') + start_num = int(start.strip()) + end_num = int(end.strip()) + result.extend(range(start_num, end_num + 1)) + except (ValueError, AttributeError): + continue + else: + # Single number + try: + result.append(int(part)) + except ValueError: + continue + return result + + +def _filter_and_sort_formats(formats: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Filter and sort formats for user selection. 
+ + Filters out: + - Storyboards (webp, svg formats) + - Low quality audio (below ~128 kbps, typically 48kHz audio) + - Video below 360p + + Sorts to prioritize: + - @1: Best combined audio+video (highest resolution, highest bitrate) + - @2: Best audio-only (highest bitrate audio) + - Then rest by quality + + Args: + formats: List of format dicts from yt-dlp + + Returns: + Filtered and sorted format list + """ + filtered = [] + + for fmt in formats: + format_id = fmt.get("format_id", "") + ext = fmt.get("ext", "") + vcodec = fmt.get("vcodec", "") + acodec = fmt.get("acodec", "") + height = fmt.get("height") + tbr = fmt.get("tbr") # Total bitrate + + # Skip storyboards (webp images, svg, etc.) + if ext in {"webp", "svg", "mhtml"}: + continue + + # Skip video-only formats below 360p + if vcodec != "none" and acodec == "none": + if height and height < 360: + continue + + # Skip low-bitrate audio (typically 48kHz, very low quality) + # Keep audio with tbr >= 64 kbps (reasonable quality threshold) + if acodec != "none" and vcodec == "none": + if tbr and tbr < 64: + continue + + filtered.append(fmt) + + # Sort formats: best combined first, then best audio-only, then video-only + def format_sort_key(fmt: Dict[str, Any]) -> tuple: + vcodec = fmt.get("vcodec", "") + acodec = fmt.get("acodec", "") + height = fmt.get("height", 0) or 0 + tbr = fmt.get("tbr", 0) or 0 + + # Category 0: has both audio and video (sort first) + # Category 1: audio only (sort second) + # Category 2: video only (sort last, by height desc) + if vcodec != "none" and acodec != "none": + category = 0 + return (category, -height, -tbr) + elif acodec != "none" and vcodec == "none": + category = 1 + return (category, -tbr) # Sort by bitrate descending + else: # Video only + category = 2 + return (category, -height, -tbr) # Sort by height descending, then bitrate + + return sorted(filtered, key=format_sort_key) + + +def _compute_file_hash(file_path: Path) -> Optional[str]: + """Compute SHA256 hash of file.""" + try: + return sha256_file(file_path) + except Exception: + return None + + + + + +# ============================================================================ +# Main Cmdlet Function +# ============================================================================ + +def _run(result: Any, args: Sequence[str], config: Dict[str, Any], emit_results: bool = True) -> int: + """Download data from URLs with advanced options. 
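
For reference, the filter and sort above yield the following ordering on a small synthetic format list; the dicts below are made-up yt-dlp-style entries, not output from a real extractor:

from cmdlets.download_data import _filter_and_sort_formats

formats = [
    {"format_id": "sb0", "ext": "mhtml", "vcodec": "none", "acodec": "none"},                           # storyboard: dropped
    {"format_id": "600", "ext": "webm", "vcodec": "vp9", "acodec": "none", "height": 240, "tbr": 300},  # below 360p: dropped
    {"format_id": "139", "ext": "m4a", "vcodec": "none", "acodec": "mp4a", "tbr": 48},                  # low-bitrate audio: dropped
    {"format_id": "137", "ext": "mp4", "vcodec": "avc1", "acodec": "none", "height": 1080, "tbr": 4500},
    {"format_id": "140", "ext": "m4a", "vcodec": "none", "acodec": "mp4a", "tbr": 129},
    {"format_id": "22", "ext": "mp4", "vcodec": "avc1", "acodec": "mp4a", "height": 720, "tbr": 1200},
]
ordered = [f["format_id"] for f in _filter_and_sort_formats(formats)]
assert ordered == ["22", "140", "137"]   # combined A/V first, then audio-only, then video-only
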
+ + Accepts: + - Single URL as string + - Result object with 'url' or 'file_path' field + - List of results + - File containing URLs (one per line) + + Returns: + Exit code (0 for success, 1 for failure) + """ + + debug("Starting download-data") + + collected_results: List[Dict[str, Any]] = [] + + def _emit(obj: Any) -> None: + """Internal helper to collect and optionally emit results.""" + collected_results.append(obj) + if emit_results: + pipeline_context.emit(obj) + + # Track pipeline mode once so playlist handling can respect current run scope + stage_ctx = pipeline_context.get_stage_context() + in_pipeline = stage_ctx is not None and getattr(stage_ctx, 'total_stages', 1) > 1 + + # ======================================================================== + # ARGUMENT PARSING + # ======================================================================== + + # Parse arguments using shared parser + parsed = parse_cmdlet_args(args, CMDLET) + + audio_mode = parsed.get("audio", False) + format_selector = parsed.get("format") + list_formats_mode = parsed.get("list-formats", False) + + clip_spec = parsed.get("clip") + clip_range = None + if clip_spec: + clip_range = _parse_time_range(clip_spec) + if clip_range: + debug(f"Clip range: {clip_spec} ({clip_range[0]}-{clip_range[1]} seconds)") + else: + log(f"Invalid clip format: {clip_spec}", file=sys.stderr) + return 1 + + cookies_path = parsed.get("cookies") + storage_location = parsed.get("storage") + + torrent_mode = parsed.get("torrent", False) + wait_timeout = float(parsed.get("wait", 1800)) + + # Collect URLs from positional args and -url flag + # Both map to "url" in parsed result + urls_to_download = [] + raw_urls = parsed.get("url", []) + if isinstance(raw_urls, str): + raw_urls = [raw_urls] + + for arg in raw_urls: + if arg.lower().startswith(('http://', 'https://')): + # Check if it's a .torrent URL or file first + if '.torrent' in arg.lower(): + debug(f"Processing torrent URL: {arg}") + magnet = _process_torrent_input(arg) + if magnet and magnet.lower().startswith('magnet:'): + urls_to_download.append(magnet) + debug(f"✓ Converted to magnet: {magnet[:70]}...") + elif magnet: + urls_to_download.append(magnet) + else: + log(f"✗ Failed to process torrent: {arg}", file=sys.stderr) + else: + urls_to_download.append(arg) + elif torrent_mode and (arg.lower().startswith('magnet:') or len(arg) == 40 or len(arg) == 64): + # In torrent mode, accept magnet links or torrent hashes (40-char SHA1 or 64-char SHA256) + urls_to_download.append(arg) + debug(f"Torrent/magnet added: {arg[:50]}...") + elif _is_torrent_file_or_url(arg): + # Handle .torrent files and URLs + log(f"Processing torrent file/URL: {arg}", flush=True) + magnet = _process_torrent_input(arg) + if magnet and magnet.lower().startswith('magnet:'): + urls_to_download.append(magnet) + log(f"✓ Converted to magnet: {magnet[:70]}...", flush=True) + elif magnet: + urls_to_download.append(magnet) + else: + log(f"✗ Failed to process torrent: {arg}", file=sys.stderr) + else: + # Treat as URL if it looks like one + if arg.lower().startswith(('magnet:', 'ftp://')): + urls_to_download.append(arg) + else: + # Check if it's a file containing URLs + path = Path(arg) + if path.exists() and path.is_file(): + try: + with open(arg, 'r') as f: + for line in f: + line = line.strip() + if line and line.lower().startswith(('http://', 'https://')): + urls_to_download.append(line) + log(f"Loaded URLs from file: {arg}", flush=True) + except Exception as e: + log(f"Error reading file {arg}: {e}", file=sys.stderr) + 
else: + log(f"Ignored argument: {arg}", file=sys.stderr) + + # Item selection (for playlists/formats) + # Note: -item flag is deprecated in favor of @N pipeline selection, but kept for compatibility + playlist_items = parsed.get("item") + if playlist_items: + log(f"Item selection: {playlist_items}", flush=True) + + + + + def _is_openlibrary_downloadable(ebook_access_val: Any, status_val: Any) -> bool: + access = str(ebook_access_val or "").strip().lower() + status = str(status_val or "").strip().lower() + if status == "download": + return True + if access in {"borrowable", "public", "full", "open"} or access.startswith("full "): + return True + if "✓" in str(status_val or ""): + return True + return False + + # ======================================================================== + # INPUT PROCESSING - Extract URLs from pipeline or arguments + # ======================================================================== + + # Initialize worker tracking for downloads + import uuid + from helper.local_library import LocalLibraryDB + from config import get_local_storage_path + + worker_id = str(uuid.uuid4()) + library_root = get_local_storage_path(config or {}) + db = None + if library_root: + try: + db = LocalLibraryDB(library_root) + db.insert_worker( + worker_id, + "download", + title="Download Data", + description="Downloading files from search results", + pipe=pipeline_context.get_current_command_text() + ) + except Exception as e: + log(f"⚠ Worker tracking unavailable: {e}", file=sys.stderr) + + piped_results = normalize_result_input(result) + + # Track files downloaded directly (e.g. Soulseek) to avoid "No URLs" error + files_downloaded_directly = 0 + + # Only process piped results if no URLs were provided in arguments + # This prevents picking up residue from previous commands when running standalone + if piped_results and not urls_to_download: + for item in piped_results: + url = None + origin = None + + # ====== CHECK FOR PLAYLIST ITEM MARKER FROM add-file ====== + # When add-file detects a playlist item and wants to download it + if isinstance(item, dict) and item.get('__playlist_url'): + playlist_url = item.get('__playlist_url') + item_num = item.get('__playlist_item', 1) + log(f"📍 Playlist item from add-file: #{item_num}", flush=True) + # Add to download list with marker + urls_to_download.append({ + '__playlist_url': playlist_url, + '__playlist_item': int(item_num) + }) + continue + + # ====== CHECK FOR PLAYLIST ITEM SELECTION FIRST ====== + # When user selects @12 from a playlist, item is emitted dict with __action: "playlist-item:12" + if isinstance(item, dict) and '__action' in item and item['__action'].startswith('playlist-item:'): + playlist_url = item.get('__file_path') + playlist_action = item['__action'] # e.g., "playlist-item:12" + item_num = playlist_action.split(':')[1] # Extract item number (1-based) + + if playlist_url: + # Playlist item selected - need to download this specific track + log(f"📍 Playlist item selected: #{item_num} - {item.get('title', 'Unknown')}", flush=True) + # Add to download list - the playlist will be probed and item extracted + # Store with special marker so we know which item to select + urls_to_download.append({ + '__playlist_url': playlist_url, + '__playlist_item': int(item_num) + }) + continue + + # ====== CHECK FOR FORMAT SELECTION RESULT ====== + if isinstance(item, dict) and item.get('format_id') is not None and item.get('source_url'): + log(f"🎬 Format selected from pipe: {item.get('format_id')}", flush=True) + log(f" Source URL: 
{item.get('source_url')}", flush=True) + # Store as dict so we can extract format_id + source_url during download + urls_to_download.append(item) + continue + elif hasattr(item, 'format_id') and hasattr(item, 'source_url') and item.format_id is not None: + log(f"🎬 Format selected from pipe: {item.format_id}", flush=True) + log(f" Source URL: {item.source_url}", flush=True) + urls_to_download.append({ + 'format_id': item.format_id, + 'source_url': item.source_url, + }) + continue + + if isinstance(item, dict): + # Check for search provider results first + origin = item.get('origin') + if origin in {'openlibrary', 'libgen', 'soulseek', 'debrid'}: + # Handle search provider results + title = item.get('title', 'Item') + if origin == 'openlibrary': + # OpenLibrary: First check if lendable/downloadable via Archive.org + # Only route to LibGen if NOT available on Archive.org + metadata = item.get('full_metadata', {}) if isinstance(item.get('full_metadata'), dict) else {} + isbn = metadata.get('isbn') or item.get('isbn') + olid = metadata.get('olid') or item.get('olid') + + log(f"[search-result] OpenLibrary: '{title}'", flush=True) + if isbn: + log(f" ISBN: {isbn}", flush=True) + + # Check if book is borrowable from ebook_access field or status + ebook_access = metadata.get('ebook_access') or item.get('ebook_access', '') + status_text = metadata.get('status') or item.get('status', '') + archive_id = metadata.get('archive_id') or item.get('archive_id') + + # Determine if borrowable based on new status vocabulary + is_borrowable = _is_openlibrary_downloadable(ebook_access, status_text) + + if is_borrowable: + log(f" ✓ Available for borrowing on Archive.org", flush=True) + log(f" → Queued for auto-borrowing...", flush=True) + # Queue borrow request as special dict object + # We need OCAID (Archive.org ID), not just numeric OLID + ocaid = archive_id + + if not ocaid and isbn: + # If no OCAID in metadata, fetch it from OpenLibrary ISBN lookup + try: + import requests + ol_url = f'https://openlibrary.org/isbn/{isbn}.json' + r = requests.get(ol_url, timeout=5) + if r.status_code == 200: + ol_data = r.json() + ocaid = ol_data.get('ocaid') + except Exception as e: + log(f" ⚠ Could not fetch OCAID from OpenLibrary: {e}", file=sys.stderr) + + if ocaid: + urls_to_download.append({ + '__borrow_request__': True, + 'book_id': ocaid, + 'isbn': isbn, + 'title': title, + 'olid': olid + }) + else: + # OCAID not found - book claims borrowable but not on Archive.org + # Fall back to LibGen search instead + log(f" ⚠ Book marked borrowable but not found on Archive.org", file=sys.stderr) + if isbn: + try: + from helper.search_provider import get_provider + libgen_provider = get_provider("libgen", config) + if libgen_provider: + libgen_results = libgen_provider.search(f"isbn:{isbn}", limit=1) + if libgen_results: + libgen_result = libgen_results[0] + url = libgen_result.get('target') if isinstance(libgen_result, dict) else getattr(libgen_result, 'target', None) + if url: + urls_to_download.append(url) + log(f" ✓ Found on LibGen instead", flush=True) + else: + log(f" ⚠ Not found on LibGen", file=sys.stderr) + else: + log(f" ⚠ Not found on LibGen", file=sys.stderr) + else: + log(f" ⚠ LibGen provider not available", file=sys.stderr) + except Exception as e: + log(f" ✗ Error searching LibGen: {e}", file=sys.stderr) + else: + # Book is NOT borrowable - route to LibGen + if isbn: + log(f" ⚠ Not available on Archive.org - attempting LibGen...", flush=True) + try: + from helper.search_provider import get_provider + 
libgen_provider = get_provider("libgen", config) + if libgen_provider: + libgen_results = libgen_provider.search(f"isbn:{isbn}", limit=1) + if libgen_results: + libgen_result = libgen_results[0] + url = libgen_result.get('target') if isinstance(libgen_result, dict) else getattr(libgen_result, 'target', None) + if url: + urls_to_download.append(url) + log(f" ✓ Found on LibGen", flush=True) + else: + log(f" ⚠ Not found on LibGen", file=sys.stderr) + else: + log(f" ⚠ Not found on LibGen", flush=True) + log(f" ▶ To search LibGen: search-file -provider libgen 'isbn:{isbn}' | @1 | download-data", flush=True) + else: + log(f" ▶ To search LibGen: search-file -provider libgen 'isbn:{isbn}' | @1 | download-data", flush=True) + except Exception as e: + log(f" ⚠ Could not search LibGen: {e}", file=sys.stderr) + log(f" ▶ To search LibGen: search-file -provider libgen 'isbn:{isbn}' | @1 | download-data", flush=True) + else: + log(f" ⚠ ISBN not available", flush=True) + log(f" ▶ Visit: {item.get('target', 'https://openlibrary.org')}", flush=True) + log(f" ▶ Or find ISBN and use: search-file -provider libgen 'isbn:\"\"'", flush=True) + elif origin == 'soulseek': + # Handle Soulseek downloads using the provider + metadata = item.get('full_metadata', {}) if isinstance(item.get('full_metadata'), dict) else {} + username = metadata.get('username') + filename = metadata.get('filename') + size = item.get('size_bytes') or 0 + + if username and filename: + try: + import asyncio + from helper.search_provider import SoulSeekProvider + provider = SoulSeekProvider(config) + log(f"[search-result] Soulseek: '{title}'", flush=True) + log(f" ▶ Downloading from {username}...", flush=True) + + if db: + db.append_worker_stdout(worker_id, f"Downloading from Soulseek: {title} (from {username})") + + # Get temp directory from config + temp_dir = config.get('temp') + if temp_dir: + temp_dir = str(Path(temp_dir).expanduser()) + + # Call async download_file with asyncio.run() + success = asyncio.run(provider.download_file( + username=username, + filename=filename, + file_size=size, + target_dir=temp_dir + )) + + if success: + downloaded_file = Path(provider.DOWNLOAD_DIR) / Path(filename).name + if downloaded_file.exists(): + log(f" ✓ Downloaded: {downloaded_file.name}", flush=True) + files_downloaded_directly += 1 + if db: + db.append_worker_stdout(worker_id, f"✓ Downloaded: {downloaded_file.name}") + if pipeline_context._PIPE_ACTIVE: + # Create proper PipeObject result + result_dict = create_pipe_object_result( + source='soulseek', + identifier=filename, + file_path=str(downloaded_file), + cmdlet_name='download-data', + title=title, + target=str(downloaded_file), # Explicit target for add-file + extra={ + "metadata": metadata, + "origin": "soulseek" + } + ) + pipeline_context.emit(result_dict) + else: + log(f" ✗ Download failed (peer may be offline)", file=sys.stderr) + if db: + db.append_worker_stdout(worker_id, f"✗ Download failed for {title}") + log(f" ▶ Try another result: search-file -provider soulseek \"...\" | @2 | download-data", flush=True) + except Exception as e: + log(f" ✗ Download error: {e}", file=sys.stderr) + if db: + db.append_worker_stdout(worker_id, f"✗ Error: {e}") + log(f" ▶ Alternative: search-soulseek -download \"{title}\" -storage ", flush=True) + else: + log(f"[search-result] Soulseek: '{title}'", flush=True) + log(f" ⚠ Missing download info (username/filename)", flush=True) + if db: + db.append_worker_stdout(worker_id, f"⚠ Missing download info for {title}") + elif origin == 'libgen': + # LibGen results 
can use the direct URL + # Also extract mirrors dict for fallback if primary fails + url = item.get('target') + # Extract mirrors and book_id from full_metadata + metadata = item.get('full_metadata', {}) if isinstance(item.get('full_metadata'), dict) else {} + mirrors = metadata.get('mirrors', {}) + book_id = metadata.get('book_id', '') + + if url: + url_entry = { + 'url': str(url), + 'mirrors': mirrors, # Alternative mirrors for fallback + 'book_id': book_id, + } + urls_to_download.append(url_entry) + log(f"[search-result] LibGen: '{title}'", flush=True) + log(f" ✓ Queued for download", flush=True) + if mirrors: + log(f" Mirrors available: {len(mirrors)}", flush=True) + elif origin == 'debrid': + # Debrid results can use download-data + url = item.get('target') + if url: + urls_to_download.append(str(url)) + log(f"[search-result] Debrid: '{title}'", flush=True) + log(f" ✓ Queued for download", flush=True) + else: + # Regular fields for non-search results + url = item.get('url') or item.get('link') or item.get('href') or item.get('target') + else: + # Object attributes + origin = getattr(item, 'origin', None) + title = getattr(item, 'title', 'Item') + if origin in {'openlibrary', 'libgen', 'soulseek', 'debrid'}: + # Handle search provider results + if origin == 'openlibrary': + # OpenLibrary: First check if lendable/downloadable via Archive.org + # Only route to LibGen if NOT available on Archive.org + metadata = getattr(item, 'full_metadata', {}) if isinstance(getattr(item, 'full_metadata', None), dict) else {} + isbn = metadata.get('isbn') or getattr(item, 'isbn', None) + olid = metadata.get('olid') or getattr(item, 'olid', None) + + log(f"[search-result] OpenLibrary: '{title}'", flush=True) + if isbn: + log(f" ISBN: {isbn}", flush=True) + + # Check if book is borrowable from ebook_access field or status + ebook_access = metadata.get('ebook_access') or getattr(item, 'ebook_access', '') + status_text = metadata.get('status') or getattr(item, 'status', '') + archive_id = metadata.get('archive_id') or getattr(item, 'archive_id', '') + + # Determine if borrowable using unified helper + is_borrowable = _is_openlibrary_downloadable(ebook_access, status_text) + + if is_borrowable: + # Book IS borrowable on Archive.org + log(f" ✓ Available for borrowing on Archive.org", flush=True) + log(f" → Queued for auto-borrowing...", flush=True) + # Queue borrow request as special dict object + ocaid = archive_id + if not ocaid and isbn: + try: + import requests + ol_url = f'https://openlibrary.org/isbn/{isbn}.json' + r = requests.get(ol_url, timeout=5) + if r.status_code == 200: + ol_data = r.json() + ocaid = ol_data.get('ocaid') + except Exception as e: + log(f" ⚠ Could not fetch OCAID from OpenLibrary: {e}", file=sys.stderr) + if ocaid: + urls_to_download.append({ + '__borrow_request__': True, + 'book_id': ocaid, + 'isbn': isbn, + 'title': title, + 'olid': olid or getattr(item, 'openlibrary_id', '') + }) + else: + # OCAID not found - book claims borrowable but not on Archive.org + # Fall back to LibGen search instead + log(f" ⚠ No Archive.org ID found - attempting LibGen instead...", file=sys.stderr) + if isbn: + try: + from helper.search_provider import get_provider + libgen_provider = get_provider("libgen", config) + if libgen_provider: + libgen_results = libgen_provider.search(f"isbn:{isbn}", limit=1) + if libgen_results: + libgen_result = libgen_results[0] + url = libgen_result.get('target') if isinstance(libgen_result, dict) else getattr(libgen_result, 'target', None) + if url: + 
urls_to_download.append(url) + log(f" ✓ Found on LibGen instead", flush=True) + else: + log(f" ⚠ Not found on LibGen", file=sys.stderr) + else: + log(f" ⚠ Not found on LibGen", file=sys.stderr) + else: + log(f" ⚠ LibGen provider not available", file=sys.stderr) + except Exception as e: + log(f" ✗ Error searching LibGen: {e}", file=sys.stderr) + else: + log(f" ⚠ ISBN not available for LibGen fallback", file=sys.stderr) + else: + # Book is NOT borrowable - route to LibGen + if isbn: + log(f" ⚠ Not available on Archive.org - attempting LibGen...", flush=True) + try: + from helper.search_provider import get_provider + libgen_provider = get_provider("libgen", config) + if libgen_provider: + libgen_results = libgen_provider.search(f"isbn:{isbn}", limit=1) + if libgen_results: + libgen_result = libgen_results[0] + url = libgen_result.get('target') if isinstance(libgen_result, dict) else getattr(libgen_result, 'target', None) + if url: + urls_to_download.append(url) + log(f" ✓ Found on LibGen", flush=True) + else: + log(f" ⚠ Not found on LibGen", file=sys.stderr) + else: + log(f" ⚠ Not found on LibGen", flush=True) + log(f" ▶ To search LibGen: search-file -provider libgen 'isbn:{isbn}' | @1 | download-data", flush=True) + else: + log(f" ▶ To search LibGen: search-file -provider libgen 'isbn:{isbn}' | @1 | download-data", flush=True) + except Exception as e: + log(f" ⚠ Could not search LibGen: {e}", file=sys.stderr) + log(f" ▶ To search LibGen: search-file -provider libgen 'isbn:{isbn}' | @1 | download-data", flush=True) + else: + log(f" ⚠ ISBN not available", flush=True) + log(f" ▶ Visit: {getattr(item, 'target', 'https://openlibrary.org')}", flush=True) + log(f" ▶ Or find ISBN and use: search-file -provider libgen 'isbn:\"\"'", flush=True) + elif origin == 'soulseek': + # Handle Soulseek downloads using the provider + metadata = getattr(item, 'full_metadata', {}) if isinstance(getattr(item, 'full_metadata', None), dict) else {} + username = metadata.get('username') + filename = metadata.get('filename') + size = getattr(item, 'size_bytes', 0) or 0 + + if username and filename: + try: + import asyncio + from helper.search_provider import SoulSeekProvider + provider = SoulSeekProvider(config) + log(f"[search-result] Soulseek: '{title}'", flush=True) + log(f" ▶ Downloading from {username}...", flush=True) + + if db: + db.append_worker_stdout(worker_id, f"Downloading from Soulseek: {title} (from {username})") + + # Get temp directory from config + temp_dir = config.get('temp') + if temp_dir: + temp_dir = str(Path(temp_dir).expanduser()) + + # Call async download_file with asyncio.run() + success = asyncio.run(provider.download_file( + username=username, + filename=filename, + file_size=size, + target_dir=temp_dir + )) + + if success: + downloaded_file = Path(provider.DOWNLOAD_DIR) / Path(filename).name + if downloaded_file.exists(): + log(f" ✓ Downloaded: {downloaded_file.name}", flush=True) + files_downloaded_directly += 1 + if db: + db.append_worker_stdout(worker_id, f"✓ Downloaded: {downloaded_file.name}") + if pipeline_context._PIPE_ACTIVE: + # Create proper PipeObject result + result_dict = create_pipe_object_result( + source='soulseek', + identifier=filename, + file_path=str(downloaded_file), + cmdlet_name='download-data', + title=title, + target=str(downloaded_file), # Explicit target for add-file + extra={ + "metadata": metadata, + "origin": "soulseek" + } + ) + pipeline_context.emit(result_dict) + else: + log(f" ✗ Download failed (peer may be offline)", file=sys.stderr) + if db: + 
db.append_worker_stdout(worker_id, f"✗ Download failed for {title}") + log(f" ▶ Try another result: search-file -provider soulseek \"...\" | @2 | download-data", flush=True) + except Exception as e: + log(f" ✗ Download error: {e}", file=sys.stderr) + if db: + db.append_worker_stdout(worker_id, f"✗ Error: {e}") + log(f" ▶ Alternative: search-soulseek -download \"{title}\" -storage ", flush=True) + else: + log(f"[search-result] Soulseek: '{title}'", flush=True) + log(f" ⚠ Missing download info (username/filename)", flush=True) + if db: + db.append_worker_stdout(worker_id, f"⚠ Missing download info for {title}") + elif origin == 'libgen': + # LibGen results with mirrors dict for fallback + url = getattr(item, 'target', None) + # Extract mirrors and book_id from full_metadata + metadata = getattr(item, 'full_metadata', {}) if isinstance(getattr(item, 'full_metadata', None), dict) else {} + mirrors = metadata.get('mirrors', {}) + book_id = metadata.get('book_id', '') + + if url: + url_entry = { + 'url': str(url), + 'mirrors': mirrors, # Alternative mirrors for fallback + 'book_id': book_id, + } + urls_to_download.append(url_entry) + else: + urls_to_download.append(url) if url else None + elif origin == 'debrid': + url = getattr(item, 'target', None) + else: + url = getattr(item, 'url', None) or getattr(item, 'link', None) or getattr(item, 'href', None) or getattr(item, 'target', None) + + if url: + urls_to_download.append(str(url)) + + if not urls_to_download and files_downloaded_directly == 0: + log(f"No downloadable URLs found", file=sys.stderr) + return 1 + + log(f"Processing {len(urls_to_download)} URL(s)", flush=True) + for i, u in enumerate(urls_to_download, 1): + if isinstance(u, dict): + log(f" [{i}] Format: {u.get('format_id', '?')} from {u.get('source_url', '?')[:60]}...", flush=True) + else: + log(f" [{i}] URL: {str(u)[:60]}...", flush=True) + + # ======================================================================== + # RESOLVE OUTPUT DIRECTORY + # ======================================================================== + + final_output_dir = None + + # Priority 1: --storage flag + if storage_location: + try: + final_output_dir = SharedArgs.resolve_storage(storage_location) + log(f"Using storage location: {storage_location} → {final_output_dir}", flush=True) + except ValueError as e: + log(str(e), file=sys.stderr) + return 1 + + # Priority 2: Config resolver + if final_output_dir is None and resolve_output_dir is not None: + try: + final_output_dir = resolve_output_dir(config) + log(f"Using config resolver: {final_output_dir}", flush=True) + except Exception: + pass + + # Priority 4: Config outfile + if final_output_dir is None and config and config.get("outfile"): + try: + final_output_dir = Path(config["outfile"]).expanduser() + log(f"Using config outfile: {final_output_dir}", flush=True) + except Exception: + pass + + # Priority 5: Default (home/Videos) + if final_output_dir is None: + final_output_dir = Path.home() / "Videos" + log(f"Using default directory: {final_output_dir}", flush=True) + + # Ensure directory exists + try: + final_output_dir.mkdir(parents=True, exist_ok=True) + except Exception as e: + log(f"Cannot create output directory {final_output_dir}: {e}", file=sys.stderr) + return 1 + + # ======================================================================== + # DOWNLOAD EACH URL + # ======================================================================== + + downloaded_files = [] + playlists_displayed = 0 + formats_displayed = False # NEW: Track if we showed 
formats + exit_code = 0 + + for url in urls_to_download: + try: + selected_playlist_entries: list[Dict[str, Any]] = [] + playlist_existing_paths: set[str] = set() + + # ====== HANDLE FORMAT SELECTION FROM PIPED RESULT ====== + # If url is a dict with format_id and source_url, extract them and override format_selector + current_format_selector = format_selector + actual_url = url + if isinstance(url, dict) and url.get('format_id') and url.get('source_url'): + log(f"🎬 Format selected: {url.get('format_id')}", flush=True) + format_id = url.get('format_id') + current_format_selector = format_id + + # If it's a video-only format (has vcodec but no acodec), add bestaudio + vcodec = url.get('vcodec', '') + acodec = url.get('acodec', '') + if vcodec and vcodec != "none" and (not acodec or acodec == "none"): + # Video-only format, add bestaudio automatically + current_format_selector = f"{format_id}+bestaudio" + log(f" ℹ️ Video-only format detected, automatically adding bestaudio", flush=True) + + actual_url = url.get('source_url') + url = actual_url # Use the actual URL for further processing + + # ====== AUTO-BORROW MODE - INTERCEPT SPECIAL BORROW REQUEST DICTS ====== + if isinstance(url, dict) and url.get('__borrow_request__'): + try: + from helper.archive_client import credential_openlibrary, loan, get_book_infos, download + import tempfile + import shutil + + book_id = url.get('book_id') + if not book_id: + log(f" ✗ Missing book ID for borrowing", file=sys.stderr) + exit_code = 1 + continue + + title_val = url.get('title', 'Unknown Book') + book_id_str = str(book_id) + + log(f"[auto-borrow] Starting borrow for: {title_val}", flush=True) + log(f" Book ID: {book_id_str}", flush=True) + + # Get Archive.org credentials + email, password = credential_openlibrary(config) + if not email or not password: + log(f" ✗ Archive.org credentials not configured", file=sys.stderr) + log(f" ▶ Set ARCHIVE_EMAIL and ARCHIVE_PASSWORD environment variables", file=sys.stderr) + exit_code = 1 + continue + + # Attempt to borrow and download + try: + log(f" → Logging into Archive.org...", flush=True) + from helper.archive_client import login + import requests + try: + session = login(email, password) + except requests.exceptions.Timeout: + log(f" ✗ Timeout logging into Archive.org (server not responding)", file=sys.stderr) + exit_code = 1 + continue + except requests.exceptions.RequestException as e: + log(f" ✗ Error connecting to Archive.org: {e}", file=sys.stderr) + exit_code = 1 + continue + + log(f" → Borrowing book...", flush=True) + try: + session = loan(session, book_id_str, verbose=True) + except requests.exceptions.Timeout: + log(f" ✗ Timeout while borrowing (server not responding)", file=sys.stderr) + exit_code = 1 + continue + except requests.exceptions.RequestException as e: + log(f" ✗ Error while borrowing: {e}", file=sys.stderr) + exit_code = 1 + continue + + log(f" → Extracting page information...", flush=True) + # Try both URL formats + book_urls = [ + f"https://archive.org/borrow/{book_id_str}", + f"https://archive.org/details/{book_id_str}" + ] + + title = None + links = None + metadata = None + last_error = None + for book_url in book_urls: + try: + title, links, metadata = get_book_infos(session, book_url) + if title and links: + log(f" → Found {len(links)} pages", flush=True) + break + except requests.exceptions.Timeout: + last_error = "Timeout while extracting pages" + log(f" ⚠ Timeout while extracting from {book_url}", flush=True) + continue + except Exception as e: + last_error = str(e) + log(f" 
⚠ Failed to extract from {book_url}: {e}", flush=True) + continue + + if not links: + log(f" ✗ Could not extract book pages (Last error: {last_error})", file=sys.stderr) + exit_code = 1 + continue + + # Download pages + log(f" → Downloading {len(links)} pages...", flush=True) + with tempfile.TemporaryDirectory() as temp_dir: + # download(session, n_threads, directory, links, scale, book_id) + images = download( + session, + n_threads=4, + directory=temp_dir, + links=links, + scale=2, + book_id=str(book_id) + ) + + if not images: + log(f" ✗ No pages downloaded", file=sys.stderr) + exit_code = 1 + continue + + log(f" ✓ Downloaded {len(images)} pages", flush=True) + + # Try to merge into PDF + try: + import img2pdf + log(f" → Merging pages into PDF...", flush=True) + + filename = title if title else f"book_{book_id_str}" + filename = "".join(c for c in filename if c.isalnum() or c in (' ', '.', '-'))[:100] + output_path = Path(final_output_dir) / f"{filename}.pdf" + + # Make unique filename if needed + i = 1 + while output_path.exists(): + output_path = Path(final_output_dir) / f"{filename}({i}).pdf" + i += 1 + + pdf_content = img2pdf.convert(images) + if pdf_content: + with open(output_path, 'wb') as f: + f.write(pdf_content) + + log(f" ✓ Successfully borrowed and saved to: {output_path}", flush=True) + downloaded_files.append(str(output_path)) + + # Emit result for downstream cmdlets + file_hash = _compute_file_hash(output_path) + # Build tags including ISBN if available + emit_tags = ['book', 'borrowed', 'pdf'] + isbn_tag = url.get('isbn') + if isbn_tag: + emit_tags.append(f'isbn:{isbn_tag}') + olid_tag = url.get('olid') + if olid_tag: + emit_tags.append(f'olid:{olid_tag}') + + # Fetch OpenLibrary metadata tags + ol_tags = fetch_openlibrary_metadata_tags(isbn=isbn_tag, olid=olid_tag) + emit_tags.extend(ol_tags) + + pipe_obj = create_pipe_object_result( + source='archive.org', + identifier=book_id_str, + file_path=str(output_path), + cmdlet_name='download-data', + title=title_val, + file_hash=file_hash, + tags=emit_tags, + source_url=url.get('source_url', f'archive.org/borrow/{book_id_str}') + ) + pipeline_context.emit(pipe_obj) + exit_code = 0 + except ImportError: + log(f" ⚠ img2pdf not available - saving pages as collection", file=sys.stderr) + # Just copy images to output dir + filename = title if title else f"book_{book_id_str}" + filename = "".join(c for c in filename if c.isalnum() or c in (' ', '.', '-'))[:100] + output_dir = Path(final_output_dir) / filename + i = 1 + while output_dir.exists(): + output_dir = Path(final_output_dir) / f"{filename}({i})" + i += 1 + + shutil.copytree(temp_dir, str(output_dir)) + log(f" ✓ Successfully borrowed and saved to: {output_dir}", flush=True) + downloaded_files.append(str(output_dir)) + + # Emit result for downstream cmdlets + # Build tags including ISBN if available + emit_tags = ['book', 'borrowed', 'pages'] + isbn_tag = url.get('isbn') + if isbn_tag: + emit_tags.append(f'isbn:{isbn_tag}') + olid_tag = url.get('olid') + if olid_tag: + emit_tags.append(f'olid:{olid_tag}') + + # Fetch OpenLibrary metadata tags + ol_tags = fetch_openlibrary_metadata_tags(isbn=isbn_tag, olid=olid_tag) + emit_tags.extend(ol_tags) + + pipe_obj = create_pipe_object_result( + source='archive.org', + identifier=book_id_str, + file_path=str(output_dir), + cmdlet_name='download-data', + title=title_val, + tags=emit_tags, + source_url=url.get('source_url', f'archive.org/borrow/{book_id_str}') + ) + pipeline_context.emit(pipe_obj) + exit_code = 0 + + except Exception as 
e: + log(f" ✗ Borrow/download failed: {e}", file=sys.stderr) + import traceback + traceback.print_exc() + exit_code = 1 + + continue # Skip normal URL handling + + except ImportError as e: + log(f" ✗ Archive.org tools not available: {e}", file=sys.stderr) + exit_code = 1 + continue + except Exception as e: + log(f" ✗ Auto-borrow error: {e}", file=sys.stderr) + import traceback + traceback.print_exc() + exit_code = 1 + continue + + + # ====== LIBGEN MIRROR FALLBACK MODE ====== + # Handle libgen results with mirrors dict for fallback on failure + if isinstance(url, dict) and 'mirrors' in url: + try: + primary_url = url.get('url') + mirrors_dict = url.get('mirrors', {}) + book_id = url.get('book_id', '') + + if not primary_url: + log(f"Skipping libgen entry: no primary URL", file=sys.stderr) + exit_code = 1 + continue + + # Build list of mirrors to try: primary first, then alternatives + mirrors_to_try = [primary_url] + mirrors_to_try.extend(mirrors_dict.values()) + + # Remove duplicates while preserving order + mirrors_to_try = list(dict.fromkeys(mirrors_to_try)) + + log(f"🔄 LibGen download with mirror fallback (book_id: {book_id})", flush=True) + log(f" Primary: {primary_url[:80]}...", flush=True) + + if len(mirrors_to_try) > 1: + log(f" {len(mirrors_to_try) - 1} alternative mirror(s) available", flush=True) + + # Resolve cookies path + final_cookies_path_libgen = None + if cookies_path: + if resolve_cookies_path: + try: + final_cookies_path_libgen = resolve_cookies_path(config, Path(cookies_path)) + except Exception: + final_cookies_path_libgen = Path(cookies_path).expanduser() if cookies_path else None + else: + final_cookies_path_libgen = Path(cookies_path).expanduser() + + download_succeeded = False + last_error = None + successful_mirror = None + + # Try each mirror in sequence using libgen_service's native download + for mirror_idx, mirror_url in enumerate(mirrors_to_try, 1): + try: + if mirror_idx > 1: + log(f" → Trying mirror #{mirror_idx}: {mirror_url[:80]}...", flush=True) + + # Use libgen_service's download_from_mirror for proper libgen handling + from helper.libgen_service import download_from_mirror + + # Generate filename from book_id and title + safe_title = "".join(c for c in str(title or "book") if c.isalnum() or c in (' ', '.', '-'))[:100] + file_path = final_output_dir / f"{safe_title}_{book_id}.pdf" + + # Attempt download using libgen's native function + success = download_from_mirror( + mirror_url=mirror_url, + output_path=file_path, + log_info=lambda msg: log(f" {msg}", flush=True), + log_error=lambda msg: log(f" ⚠ {msg}", file=sys.stderr) + ) + + if success and file_path.exists(): + log(f" ✓ Downloaded successfully from mirror #{mirror_idx}", flush=True) + successful_mirror = mirror_url + download_succeeded = True + + # Emit result for downstream cmdlets + file_hash = _compute_file_hash(file_path) + emit_tags = ['libgen', 'book'] + + pipe_obj = create_pipe_object_result( + source='libgen', + identifier=book_id, + file_path=str(file_path), + cmdlet_name='download-data', + file_hash=file_hash, + tags=emit_tags, + source_url=successful_mirror + ) + pipeline_context.emit(pipe_obj) + downloaded_files.append(str(file_path)) + exit_code = 0 + break # Success, stop trying mirrors + + except Exception as e: + last_error = str(e) + if mirror_idx == 1: + log(f" ⚠ Primary mirror failed: {e}", flush=True) + else: + log(f" ⚠ Mirror #{mirror_idx} failed: {e}", flush=True) + + if not download_succeeded: + log(f" ✗ All mirrors failed. 
Last error: {last_error}", file=sys.stderr) + if "getaddrinfo failed" in str(last_error) or "NameResolutionError" in str(last_error) or "Failed to resolve" in str(last_error): + log(f" ⚠ Network issue detected: Cannot resolve LibGen mirror hostnames", file=sys.stderr) + log(f" ▶ Check your network connection or try with a VPN/proxy", file=sys.stderr) + exit_code = 1 + + continue # Skip to next URL + + except Exception as e: + log(f" ✗ LibGen mirror fallback error: {e}", file=sys.stderr) + import traceback + traceback.print_exc(file=sys.stderr) + exit_code = 1 + continue + + # Ensure URL is a string for normal handling + if not isinstance(url, str): + # Check if it's a playlist item marker + if isinstance(url, dict) and url.get('__playlist_url'): + playlist_url = url.get('__playlist_url') + item_num = url.get('__playlist_item', 1) + log(f"📍 Handling selected playlist item #{item_num}", flush=True) + # Convert to actual URL and set playlist_items to download only this item + url = playlist_url + playlist_items = str(item_num) + # Fall through to normal handling below + else: + log(f"Skipping invalid URL entry: {url}", file=sys.stderr) + continue + + log(f"Probing URL: {url}", flush=True) + + # ====== TORRENT MODE - INTERCEPT BEFORE NORMAL DOWNLOAD ====== + if torrent_mode or url.lower().startswith('magnet:'): + log(f"🧲 Torrent/magnet mode - spawning background worker...", flush=True) + + try: + # Get API key from config + from config import get_debrid_api_key + api_key = get_debrid_api_key(config) + + if not api_key: + log(f"✗ AllDebrid API key not found in config", file=sys.stderr) + exit_code = 1 + continue + + # Create a unique worker ID + worker_id = f"torrent_{uuid.uuid4().hex[:8]}" + + # Get worker manager if available from config + worker_manager = config.get('_worker_manager') + + # Create worker in manager if available + if worker_manager: + try: + worker_manager.track_worker( + worker_id, + worker_type="download_torrent", + title=f"Download: {url[:60]}...", + description=f"Torrent/magnet download via AllDebrid", + pipe=pipeline_context.get_current_command_text() + ) + log(f"✓ Worker created (ID: {worker_id})", flush=True) + except Exception as e: + log(f"⚠ Failed to create worker: {e}", file=sys.stderr) + worker_manager = None + + # Spawn background thread to handle the download + worker_thread = threading.Thread( + target=_download_torrent_worker, + args=( + worker_id, + url, + final_output_dir, + config, + api_key, + playlist_items, + audio_mode, + wait_timeout, + worker_manager, + ), + daemon=False, + name=f"TorrentWorker_{worker_id}" + ) + + worker_thread.start() + log(f"✓ Background worker started (ID: {worker_id})", flush=True) + + # Emit worker info so user can track it + worker_info = { + 'worker_id': worker_id, + 'worker_type': 'download_torrent', + 'source_url': url, + 'status': 'running', + 'message': 'Downloading in background...' 
+ } + pipeline_context.emit(worker_info) + + continue + + except ImportError: + log(f"✗ AllDebrid client not available", file=sys.stderr) + exit_code = 1 + except Exception as e: + # Catches AllDebridError and other exceptions + log(f"✗ Failed to spawn torrent worker: {e}", file=sys.stderr) + import traceback + traceback.print_exc(file=sys.stderr) + exit_code = 1 + + continue # Skip to next URL + + # ====== NORMAL DOWNLOAD MODE (HTTP/HTTPS) ====== + + # First, probe the URL to detect playlists and get info + # For YouTube URLs, ignore playlists and only probe the single video + is_youtube_url = isinstance(url, str) and ('youtube.com' in url or 'youtu.be' in url) + probe_info = probe_url(url, no_playlist=is_youtube_url) + is_actual_playlist = False # Track if we have a real multi-item playlist + + if probe_info: + log(f"✓ Probed: {probe_info.get('title', url)} ({probe_info.get('extractor', 'unknown')})") + + # If it's a playlist, show the result table and skip download for now + entries = probe_info.get("entries", []) + if entries and not playlist_items: + is_actual_playlist = True # We have a real playlist with multiple items + # Playlist detected but NO selection provided + # Always show table for user to select items + log(f"📋 Found playlist with {len(entries)} items") + _show_playlist_table(url, probe_info) + log(f"ℹ️ Playlist displayed. To select items, use @* or @1,3,5-8 syntax after piping results") + playlists_displayed += 1 + continue # Skip to next URL - don't download playlist without selection + elif entries and playlist_items: + is_actual_playlist = True # We have a real playlist with item selection + # Playlist detected WITH selection - will download below + # Expand wildcard if present + expanded_items = _expand_playlist_selection(playlist_items, len(entries)) + playlist_items = expanded_items + selected_playlist_entries = _select_playlist_entries(entries, playlist_items) + log(f"📋 Found playlist with {len(entries)} items - downloading selected: {playlist_items}") + else: + log(f"Single item: {probe_info.get('title', 'Unknown')}") + + # ====== FORMAT LISTING MODE ====== + if list_formats_mode and isinstance(url, str) and url.startswith(('http://', 'https://')): + log(f"Fetching formats for: {url}", flush=True) + from helper.download import list_formats + from result_table import ResultTable + + all_formats = list_formats(url, no_playlist=is_youtube_url, playlist_items=playlist_items) + if all_formats: + # Filter and sort formats for better user experience + formats = _filter_and_sort_formats(all_formats) + + # Create result table for format display + table = ResultTable(title=f"Available Formats - {probe_info.get('title', 'Unknown')}") + + for fmt in formats: + row = table.add_row() + row.add_column("Format ID", fmt.get("format_id", "")) + + # Build resolution/bitrate string + vcodec = fmt.get("vcodec", "") + acodec = fmt.get("acodec", "") + height = fmt.get("height") + tbr = fmt.get("tbr") + + if vcodec != "none" and acodec != "none": + # Video + audio + res_str = fmt.get("resolution", "") + elif acodec != "none" and vcodec == "none": + # Audio only - show bitrate + res_str = f"{tbr:.0f} kbps" if tbr else "audio" + else: + # Video only + res_str = fmt.get("resolution", "") + + row.add_column("Resolution", res_str) + + # Build codec string (merged vcodec/acodec) + codec_parts = [] + if vcodec and vcodec != "none": + codec_parts.append(f"v:{vcodec}") + if acodec and acodec != "none": + codec_parts.append(f"a:{acodec}") + codec_str = " | ".join(codec_parts) if codec_parts else 
"unknown" + row.add_column("Codec", codec_str) + + if fmt.get("filesize"): + size_mb = fmt["filesize"] / (1024 * 1024) + row.add_column("Size", f"{size_mb:.1f} MB") + + # Set source command for @N expansion + table.set_source_command("download-data", [url]) + + # Note: Row selection args are not set - users select with @N syntax directly + + # Display table and emit as pipeline result + log(str(table), flush=True) + formats_displayed = True + + # Store table for @N expansion so CLI can reconstruct commands + # Uses separate current_stage_table instead of result history table + pipeline_context.set_current_stage_table(table) + + # Always emit formats so they can be selected with @N + for i, fmt in enumerate(formats, 1): + pipeline_context.emit({ + "format_id": fmt.get("format_id", ""), + "format_string": fmt.get("format", ""), + "resolution": fmt.get("resolution", ""), + "vcodec": fmt.get("vcodec", ""), + "acodec": fmt.get("acodec", ""), + "ext": fmt.get("ext", ""), + "filesize": fmt.get("filesize"), + "source_url": url, + "index": i, + }) + log(f"Use @N syntax to select a format and download", flush=True) + else: + log(f"✗ No formats available for this URL", file=sys.stderr) + + continue # Skip download, just show formats + + # ====== AUTO-DETECT MULTIPLE FORMATS ====== + # Check if multiple formats exist and handle based on -item flag + if (not current_format_selector and not list_formats_mode and + isinstance(url, str) and url.startswith(('http://', 'https://'))): + # Check if this is a yt-dlp supported URL (YouTube, Vimeo, etc.) + from helper.download import is_url_supported_by_ytdlp, list_formats + from result_table import ResultTable + + if is_url_supported_by_ytdlp(url): + log(f"Checking available formats for: {url}", flush=True) + all_formats = list_formats(url, no_playlist=is_youtube_url, playlist_items=playlist_items) + + if all_formats: + # Filter and sort formats for better user experience + formats = _filter_and_sort_formats(all_formats) + + # Handle -item selection for formats (single video) + if playlist_items and playlist_items.isdigit() and not is_actual_playlist: + idx = int(playlist_items) + if 0 < idx <= len(formats): + fmt = formats[idx-1] + current_format_selector = fmt.get("format_id") + log(f"Selected format #{idx}: {current_format_selector}") + playlist_items = None # Clear so it doesn't affect download options + else: + log(f"Invalid format index: {idx}", file=sys.stderr) + + elif len(formats) > 1: + # Multiple formats available + log(f"📊 Found {len(formats)} available formats for: {probe_info.get('title', 'Unknown')}", flush=True) + + # Always show table for format selection via @N syntax + # Show table and wait for @N selection + table = ResultTable(title=f"Available Formats - {probe_info.get('title', 'Unknown')}") + + for fmt in formats: + row = table.add_row() + row.add_column("Format ID", fmt.get("format_id", "")) + + # Build resolution/bitrate string + vcodec = fmt.get("vcodec", "") + acodec = fmt.get("acodec", "") + height = fmt.get("height") + tbr = fmt.get("tbr") + + if vcodec != "none" and acodec != "none": + # Video + audio + res_str = fmt.get("resolution", "") + elif acodec != "none" and vcodec == "none": + # Audio only - show bitrate + res_str = f"{tbr:.0f} kbps" if tbr else "audio" + else: + # Video only + res_str = fmt.get("resolution", "") + + row.add_column("Resolution", res_str) + + # Build codec string (merged vcodec/acodec) + codec_parts = [] + if vcodec and vcodec != "none": + codec_parts.append(f"v:{vcodec}") + if acodec and acodec != "none": 
+ codec_parts.append(f"a:{acodec}") + codec_str = " | ".join(codec_parts) if codec_parts else "unknown" + row.add_column("Codec", codec_str) + + if fmt.get("filesize"): + size_mb = fmt["filesize"] / (1024 * 1024) + row.add_column("Size", f"{size_mb:.1f} MB") + + # Set source command for @N expansion + table.set_source_command("download-data", [url]) + + # Set row selection args so @N expands to "download-data URL -item N" + for i in range(len(formats)): + # i is 0-based index, but -item expects 1-based index + table.set_row_selection_args(i, ["-item", str(i + 1)]) + + # Display table and emit formats so they can be selected with @N + log(str(table), flush=True) + log(f"💡 Use @N syntax to select a format and download (e.g., @1)", flush=True) + + # Store table for @N expansion so CLI can reconstruct commands + pipeline_context.set_current_stage_table(table) + + # Emit formats as pipeline results for @N selection + for i, fmt in enumerate(formats, 1): + pipeline_context.emit({ + "format_id": fmt.get("format_id", ""), + "format_string": fmt.get("format", ""), + "resolution": fmt.get("resolution", ""), + "vcodec": fmt.get("vcodec", ""), + "acodec": fmt.get("acodec", ""), + "filesize": fmt.get("filesize"), + "tbr": fmt.get("tbr"), + "source_url": url, + "index": i, + }) + + formats_displayed = True # Mark that we displayed formats + continue # Skip download, user must select format via @N + + log(f"Downloading: {url}", flush=True) + + # Resolve cookies path if specified + final_cookies_path = None + if cookies_path: + if resolve_cookies_path: + try: + final_cookies_path = resolve_cookies_path(config, Path(cookies_path)) + except Exception: + final_cookies_path = Path(cookies_path).expanduser() if cookies_path else None + else: + final_cookies_path = Path(cookies_path).expanduser() + + # Create download options - use correct parameter names + # Mode is "audio" or "video", required field + mode = "audio" if audio_mode else "video" + + # Detect YouTube URLs and set no_playlist to download only the single video + is_youtube_url = isinstance(url, str) and ('youtube.com' in url or 'youtu.be' in url) + + download_opts = DownloadOptions( + url=url, + mode=mode, + output_dir=final_output_dir, + cookies_path=final_cookies_path, + ytdl_format=current_format_selector, # Use per-URL format override if available + clip_sections=f"{clip_range[0]}-{clip_range[1]}" if clip_range else None, + playlist_items=playlist_items, + no_playlist=is_youtube_url, # For YouTube, ignore playlist URLs and download single video + ) + + # For playlist downloads, capture existing files BEFORE download + if playlist_items and selected_playlist_entries: + _, playlist_existing_paths = _snapshot_playlist_paths(selected_playlist_entries, final_output_dir) + + # Call download_media from helper - no show_progress param + result_data = download_media(download_opts) + + if result_data and result_data.path: + file_path = result_data.path + + if file_path.exists(): + # Check if this was a playlist download (is_actual_playlist tracks if we have a multi-item playlist) + if is_actual_playlist: + if not selected_playlist_entries: + log( + "⚠ Playlist metadata unavailable; cannot emit selected items for this stage.", + file=sys.stderr, + ) + exit_code = 1 + continue + + matched_after, _ = _snapshot_playlist_paths(selected_playlist_entries, final_output_dir) + if not matched_after: + log( + "⚠ No playlist files found for the selected items after download.", + file=sys.stderr, + ) + exit_code = 1 + continue + + new_playlist_files: list[Path] = [] 
+ for playlist_file in matched_after: + try: + path_key = str(playlist_file.resolve()) + except OSError: + path_key = str(playlist_file) + if path_key not in playlist_existing_paths: + new_playlist_files.append(playlist_file) + + emit_targets = new_playlist_files if new_playlist_files else matched_after + if new_playlist_files: + log(f"📋 Playlist download completed: {len(new_playlist_files)} new file(s)") + else: + log(f"📁 Reusing {len(emit_targets)} cached playlist file(s)", flush=True) + + for playlist_file in emit_targets: + file_hash = _compute_file_hash(playlist_file) + + tags = [] + if extract_ytdlp_tags and result_data.tags: + tags = result_data.tags + + pipe_obj = create_pipe_object_result( + source='download', + identifier=playlist_file.stem, + file_path=str(playlist_file), + cmdlet_name='download-data', + title=playlist_file.name, + file_hash=file_hash, + is_temp=False, + extra={ + 'url': url, + 'tags': tags, + 'audio_mode': audio_mode, + 'format': format_selector, + 'from_playlist': True, + }, + ) + + downloaded_files.append(playlist_file) + pipeline_context.emit(pipe_obj) + else: + # Single file download + file_hash = result_data.hash_value or _compute_file_hash(file_path) + tags = result_data.tags if result_data.tags else [] + + pipe_obj = create_pipe_object_result( + source='download', + identifier=file_path.stem, + file_path=str(file_path), + cmdlet_name='download-data', + title=file_path.name, + file_hash=file_hash, + is_temp=False, + extra={ + 'url': url, + 'tags': tags, + 'audio_mode': audio_mode, + 'format': format_selector, + 'clipped': clip_range is not None, + } + ) + + downloaded_files.append(file_path) + pipeline_context.emit(pipe_obj) + + log(f"✓ Downloaded: {file_path}", flush=True) + else: + log(f"Download returned no result for {url}", file=sys.stderr) + exit_code = 1 + + except Exception as e: + log(f"Error downloading {url}: {e}", file=sys.stderr) + import traceback + traceback.print_exc(file=sys.stderr) + exit_code = 1 + + # Success if we downloaded files or displayed playlists/formats + if downloaded_files or files_downloaded_directly > 0: + total_files = len(downloaded_files) + files_downloaded_directly + log(f"✓ Successfully downloaded {total_files} file(s)", flush=True) + if db: + db.update_worker_status(worker_id, 'completed') + return 0 + + if playlists_displayed: + log(f"✓ Displayed {playlists_displayed} playlist(s) for selection", flush=True) + if db: + db.update_worker_status(worker_id, 'completed') + db.close() + return 0 # Success - playlists shown + + if formats_displayed: + log(f"✓ Format selection table displayed - use @N to select and download", flush=True) + if db: + db.update_worker_status(worker_id, 'completed') + db.close() + return 0 # Success - formats shown + + log(f"No files were downloaded or playlists displayed", file=sys.stderr) + if db: + db.update_worker_status(worker_id, 'completed') + db.close() + return 1 + + + +CMDLET = Cmdlet( + name="download-data", + exec=_run, + summary="Download data from URLs with playlist/clip support using yt-dlp", + usage="download-data [options] or search-file | download-data [options]", + aliases=["download", "dl"], + args=[ + CmdletArg( + name="url", + type="string", + required=False, + description="URL to download (HTTP/HTTPS or file with URL list)", + variadic=True + ), + CmdletArg( + name="-url", + type="string", + description="URL to download (alias for positional argument)", + variadic=True + ), + CmdletArg( + name="list-formats", + type="flag", + description="List available formats without 
downloading" + ), + CmdletArg( + name="audio", + type="flag", + alias="a", + description="Download audio only (extract from video)" + ), + CmdletArg( + name="video", + type="flag", + alias="v", + description="Download video (default if not specified)" + ), + CmdletArg( + name="format", + type="string", + alias="fmt", + description="Explicit yt-dlp format selector (e.g., 'bestvideo+bestaudio')" + ), + CmdletArg( + name="clip", + type="string", + description="Extract time range: MM:SS-MM:SS (e.g., 34:03-35:08) or seconds" + ), + CmdletArg( + name="cookies", + type="string", + description="Path to cookies.txt file for authentication" + ), + CmdletArg( + name="torrent", + type="flag", + description="Download torrent/magnet via AllDebrid (requires API key in config)" + ), + CmdletArg( + name="wait", + type="float", + description="Wait time (seconds) for magnet processing timeout" + ), + CmdletArg( + name="item", + type="string", + alias="items", + description="Item selection for playlists/formats: use '-item N' to select format N, or '-item' to show table for @N selection in next command" + ), + SharedArgs.STORAGE, # Storage location: local, hydrus, 0x0, debrid, ftp + ], + details=[ + "Download media from URLs with advanced features.", + "", + "BASIC USAGE:", + " download-data https://youtube.com/watch?v=xyz", + " download-data https://example.com/file.pdf -storage local", + "", + "AUDIO/VIDEO OPTIONS:", + " -audio, -a Extract audio from video (M4A, MP3)", + " -video, -v Download as video (default)", + "", + "FORMAT SELECTION:", + " -format SELECTOR Specify yt-dlp format", + " Examples: 'best', 'bestvideo+bestaudio', '22'", + "", + "FORMAT/RESULT ITEM SELECTION:", + " -item Show available formats in table (see @N below)", + " -item N Auto-select and download format #N (e.g., -item 1)", + " Example: download-data URL -item 2 | add-file -storage local", + "", + "FORMAT SELECTION WITH @N SYNTAX:", + " 1. Show formats: download-data URL", + " 2. Select with @N: @1 | download-data | add-file", + " OR use -item N to skip manual selection", + "", + "CLIPPING:", + " -clip START-END Extract time range from media", + " Format: MM:SS-MM:SS (e.g., 34:03-35:08)", + " Also accepts: 2043-2108 (seconds)", + "", + "PLAYLIST MODE:", + " Automatically detects playlists", + " Shows numbered list of tracks", + " Download specific items: -item '1,3,5-8'", + " Download all items: -item '*'", + "", + "TORRENT MODE:", + " Download torrents/magnets via AllDebrid (if configured)", + " Usage: download-data -torrent magnet:?xt=urn:btih:... 
-item '1,3,5-8'", + " -wait SECONDS Maximum wait time for magnet processing (default: 1800)", + "", + "STORAGE LOCATIONS:", + " -storage local ~/Videos (default)", + " -storage hydrus ~/.hydrus/client_files", + " -storage 0x0 ~/Screenshots", + " -storage debrid ~/Debrid", + " -storage ftp ~/FTP", + "", + "EXAMPLES:", + " # Download YouTube video as audio", + " download-data https://youtube.com/watch?v=xyz -audio -storage local", + "", + " # Extract specific clip from video", + " download-data https://vimeo.com/123456 -clip 1:30-2:45 -format best", + "", + " # Download specific tracks from playlist", + " download-data https://youtube.com/playlist?list=xyz -item '1,3,5-8'", + "", + " # Download all items from playlist", + " download-data https://youtube.com/playlist?list=xyz -item '*'", + "", + " # Download with authentication", + " download-data https://example.com/content -cookies ~/cookies.txt", + "", + "TORRENT EXAMPLES:", + " # Download specific tracks from magnet link", + " download-data -torrent magnet:?xt=urn:btih:... -item '1,3,5-8' -storage local", + "", + " # Download all items from torrent and merge", + " download-data -torrent magnet:?xt=urn:btih:... -item '*' | merge-file | add-file", + "", + " # Download with custom wait time (5 minutes)", + " download-data -torrent magnet:?xt=urn:btih:... -wait 300 -item '1-5'", + ] +) diff --git a/cmdlets/get_file.py b/cmdlets/get_file.py new file mode 100644 index 0000000..8144c3e --- /dev/null +++ b/cmdlets/get_file.py @@ -0,0 +1,1618 @@ +from __future__ import annotations + +from typing import Any, Dict, Optional, Sequence +from pathlib import Path +import shutil as _shutil +import subprocess as _subprocess +import json +import sys + +from helper.logger import log +import uuid as _uuid +import time as _time + +from downlow_helpers.progress import print_progress, print_final_progress, format_size +from downlow_helpers.http_client import HTTPClient +import fnmatch as _fnmatch + +from . 
import register +import models +import pipeline as ctx +from helper import hydrus as hydrus_wrapper +from ._shared import Cmdlet, CmdletArg, normalize_hash, looks_like_hash, create_pipe_object_result +from config import resolve_output_dir, get_hydrus_url, get_hydrus_access_key +from downlow_helpers.alldebrid import AllDebridClient + + + + + +def _is_alldebrid_pipe_data(line: str) -> bool: + """Check if line is AllDebrid pipe format: ID|filename|size|...""" + parts = line.strip().split('|') + if len(parts) < 5: + return False + try: + # Check if first part is magnet ID (integer) + magnet_id = int(parts[0]) + # Check if 3rd part (size) is integer + size = int(parts[2]) + # Check if 4th part (status_code) is integer + status_code = int(parts[3]) + return magnet_id > 0 and size >= 0 and status_code in {0, 1, 2, 3, 4} + except (ValueError, IndexError): + return False + + +def _handle_alldebrid_pipe(config: Dict[str, Any], args: Sequence[str]) -> int: + """Handle AllDebrid magnet downloads from piped stdin.""" + # Parse arguments + out_path = None + file_filter = None + i = 0 + while i < len(args): + if args[i].lower() in {"-path", "--path", "path"} and i + 1 < len(args): + out_path = Path(args[i + 1]).expanduser() + i += 2 + elif args[i].lower() in {"-file", "--file", "file"} and i + 1 < len(args): + file_filter = args[i + 1] + i += 2 + else: + i += 1 + + if not out_path: + log("✗ -path required for AllDebrid downloads", file=sys.stderr) + return 1 + + # Read magnet IDs from stdin + magnets = [] + try: + for line in sys.stdin: + line = line.strip() + if line and _is_alldebrid_pipe_data(line): + parts = line.split('|') + magnet_id = int(parts[0]) + magnets.append(magnet_id) + except Exception as e: + log(f"✗ Error reading stdin: {e}", file=sys.stderr) + return 1 + + if not magnets: + log("✗ No valid magnet IDs in pipe", file=sys.stderr) + return 1 + + # Get API key + from config import get_debrid_api_key + api_key = get_debrid_api_key(config) + if not api_key: + log("✗ AllDebrid API key not configured", file=sys.stderr) + return 1 + + # Download from each magnet + client = AllDebridClient(api_key) + total_files = 0 + failed_files = 0 + + log(f"Processing {len(magnets)} magnet(s)...", file=sys.stderr) + + for magnet_id in magnets: + try: + # Fetch magnet files using magnet_status with include_files + magnet_info = client.magnet_status(magnet_id, include_files=True) + + files_list = _extract_files_from_magnet(magnet_info, file_filter) + + if not files_list: + log(f"⊘ No files in magnet {magnet_id}", file=sys.stderr) + continue + + log(f"✓ Found {len(files_list)} file(s) in magnet {magnet_id}", file=sys.stderr) + + # Download each file + for file_info in files_list: + try: + link = file_info['link'] + filename = file_info['name'] + + # Unlock link to get direct URL + try: + direct_url = client.unlock_link(link) + if not direct_url: + log(f"✗ Failed to unlock link for {filename}", file=sys.stderr) + failed_files += 1 + continue + except Exception as e: + log(f"✗ Error unlocking link: {e}", file=sys.stderr) + failed_files += 1 + continue + + # Download file + output_file = out_path / filename + if _download_file_from_alldebrid(direct_url, output_file, filename, file_info['size']): + log(f"✓ Downloaded: {filename}", file=sys.stderr) + total_files += 1 + else: + log(f"✗ Failed to download: {filename}", file=sys.stderr) + failed_files += 1 + + except Exception as e: + log(f"✗ Error downloading file: {e}", file=sys.stderr) + failed_files += 1 + + except Exception as e: + log(f"✗ Error processing magnet 
{magnet_id}: {e}", file=sys.stderr) + failed_files += 1 + + log(f"✓ Download complete: {total_files} file(s) downloaded, {failed_files} failed", file=sys.stderr) + return 0 if failed_files == 0 else 1 + + +def _extract_files_from_magnet(magnet_info: Dict[str, Any], filter_pattern: Optional[str] = None) -> list: + """Extract files from magnet file tree, optionally filtering by pattern.""" + files = [] + + def traverse(items: Any, prefix: str = "") -> None: + if not isinstance(items, list): + return + for item in items: + if not isinstance(item, dict): + continue + name = item.get('n', '') + link = item.get('l', '') + size = item.get('s', 0) + entries = item.get('e', []) + + # File + if link: + full_path = f"{prefix}/{name}" if prefix else name + if filter_pattern is None or _fnmatch.fnmatch(name.lower(), filter_pattern.lower()): + files.append({'name': name, 'path': full_path, 'size': size, 'link': link}) + + # Folder + if entries: + full_path = f"{prefix}/{name}" if prefix else name + traverse(entries, full_path) + + items = magnet_info.get('files', []) + traverse(items) + return files + + +def _download_file_from_alldebrid(url: str, output_path: Path, filename: str, file_size: int) -> bool: + """Download a single file from AllDebrid with progress bar.""" + output_path.parent.mkdir(parents=True, exist_ok=True) + + try: + downloaded = 0 + chunk_size = 1024 * 1024 + start_time = _time.time() + last_update = start_time + + with HTTPClient(timeout=30.0, headers={'User-Agent': 'downlow/1.0'}) as client: + response = client.get(url) + response.raise_for_status() + with open(output_path, 'wb', buffering=1024*1024) as f: + for chunk in response.iter_bytes(chunk_size): + if not chunk: + break + f.write(chunk) + downloaded += len(chunk) + + # Update progress every 0.5 seconds to avoid spam + now = _time.time() + if now - last_update >= 0.5 or downloaded == file_size: + elapsed = now - start_time + speed = downloaded / elapsed if elapsed > 0 else 0 + print_progress(filename, downloaded, file_size, speed) + last_update = now + + # Print final progress line + elapsed = _time.time() - start_time + print_final_progress(filename, file_size, elapsed) + log(f"✓ {filename} downloaded", file=sys.stderr) + + return True + except Exception as e: + log(f"\n[get-file] ✗ Download error: {e}", file=sys.stderr) + return False + + +def _is_playable_in_mpv(file_path_or_ext: str, mime_type: Optional[str] = None) -> bool: + """Check if file can be played in MPV based on extension or mime type.""" + from helper.utils_constant import mime_maps + + # Check mime type first if provided + if mime_type: + mime_lower = mime_type.lower() + # Simple prefix check for common media types + if any(mime_lower.startswith(prefix) for prefix in ['video/', 'audio/', 'image/']): + return True + + # Extract extension + if file_path_or_ext.startswith('.'): + ext = file_path_or_ext.lower() + else: + ext = Path(file_path_or_ext).suffix.lower() + + if not ext: + return False + + # Check if extension is in playable categories + playable_categories = ['video', 'audio', 'image', 'image_sequence'] + + for category in playable_categories: + if category in mime_maps: + for key, info in mime_maps[category].items(): + if info.get('ext', '').lower() == ext: + return True + return False + + +def _get_fixed_ipc_pipe() -> str: + """Get the fixed IPC pipe name for persistent MPV connection. + + Uses a fixed name 'mpv-medeia-macina' so all playback sessions + connect to the same MPV window/process instead of creating new instances. 
+ """ + import platform + if platform.system() == 'Windows': + return "\\\\.\\pipe\\mpv-medeia-macina" + else: + return "/tmp/mpv-medeia-macina.sock" + + +def _send_to_mpv_pipe(file_url: str, ipc_pipe: str, title: str, headers: Optional[Dict[str, str]] = None) -> bool: + """Send loadfile command to existing MPV via IPC pipe. + + Returns True if successfully sent to existing MPV, False if pipe unavailable. + """ + import json + import socket + import platform + + try: + # Prepare commands + # Use set_property for headers as loadfile options can be unreliable via IPC + header_str = "" + if headers: + header_str = ",".join([f"{k}: {v}" for k, v in headers.items()]) + + # Command 1: Set headers (or clear them) + cmd_headers = { + "command": ["set_property", "http-header-fields", header_str], + "request_id": 0 + } + + # Command 2: Load file using memory:// M3U to preserve title + # Sanitize title to avoid breaking M3U format + safe_title = title.replace("\n", " ").replace("\r", "") + m3u_content = f"#EXTM3U\n#EXTINF:-1,{safe_title}\n{file_url}\n" + + cmd_load = { + "command": ["loadfile", f"memory://{m3u_content}", "append-play"], + "request_id": 1 + } + + if platform.system() == 'Windows': + # Windows named pipes require special handling + try: + # Open in r+b to read response + with open(ipc_pipe, 'r+b', buffering=0) as pipe: + # Send headers + pipe.write((json.dumps(cmd_headers) + "\n").encode('utf-8')) + pipe.flush() + pipe.readline() # Consume response for headers + + # Send loadfile + pipe.write((json.dumps(cmd_load) + "\n").encode('utf-8')) + pipe.flush() + + # Read response + response_line = pipe.readline() + if response_line: + resp = json.loads(response_line.decode('utf-8')) + if resp.get('error') != 'success': + log(f"[get-file] MPV error: {resp.get('error')}", file=sys.stderr) + return False + + log(f"[get-file] Sent to existing MPV: {title}", file=sys.stderr) + return True + except (OSError, IOError): + # Pipe not available + return False + else: + # Unix socket for Linux/macOS + if not hasattr(socket, 'AF_UNIX'): + return False + + sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + sock.connect(ipc_pipe) + + # Send headers + sock.sendall((json.dumps(cmd_headers) + "\n").encode('utf-8')) + sock.recv(4096) # Consume response + + # Send loadfile + sock.sendall((json.dumps(cmd_load) + "\n").encode('utf-8')) + + # Read response + try: + response_data = sock.recv(4096) + if response_data: + resp = json.loads(response_data.decode('utf-8')) + if resp.get('error') != 'success': + log(f"[get-file] MPV error: {resp.get('error')}", file=sys.stderr) + sock.close() + return False + except: + pass + sock.close() + + log(f"[get-file] Sent to existing MPV: {title}", file=sys.stderr) + return True + except (OSError, socket.error, ConnectionRefusedError): + # Pipe doesn't exist or MPV not listening - will need to start new instance + return False + except Exception as e: + log(f"[get-file] IPC error: {e}", file=sys.stderr) + return False + + +def _play_in_mpv(file_url: str, file_title: str, is_stream: bool = False, headers: Optional[Dict[str, str]] = None) -> bool: + """Play file in MPV using IPC pipe, creating new instance if needed. + + Returns True on success, False on error. 
+ """ + ipc_pipe = _get_fixed_ipc_pipe() + import json + import socket + import platform + + try: + # First try to send to existing MPV instance + if _send_to_mpv_pipe(file_url, ipc_pipe, file_title, headers): + print(f"Added to MPV: {file_title}") + return True + + # No existing MPV or pipe unavailable - start new instance + log(f"[get-file] Starting new MPV instance (pipe: {ipc_pipe})", file=sys.stderr) + cmd = ['mpv', file_url, f'--input-ipc-server={ipc_pipe}'] + + # Set title for new instance + cmd.append(f'--force-media-title={file_title}') + + if headers: + # Format headers for command line + # --http-header-fields="Header1: Val1,Header2: Val2" + header_str = ",".join([f"{k}: {v}" for k, v in headers.items()]) + cmd.append(f'--http-header-fields={header_str}') + + # Detach process to prevent freezing parent CLI + kwargs = {} + if platform.system() == 'Windows': + # CREATE_NEW_CONSOLE might be better than CREATE_NO_WINDOW if MPV needs a window + # But usually MPV creates its own window. + # DETACHED_PROCESS (0x00000008) is also an option. + kwargs['creationflags'] = 0x00000008 # DETACHED_PROCESS + + _subprocess.Popen(cmd, stdin=_subprocess.DEVNULL, stdout=_subprocess.DEVNULL, stderr=_subprocess.DEVNULL, **kwargs) + + print(f"{'Streaming' if is_stream else 'Playing'} in MPV: {file_title}") + log(f"[get-file] Started MPV with {file_title} (IPC: {ipc_pipe})", file=sys.stderr) + return True + + except FileNotFoundError: + log("Error: MPV not found. Install mpv to play media files", file=sys.stderr) + return False + except Exception as e: + log(f"Error launching MPV: {e}", file=sys.stderr) + return False + + +def _handle_search_result(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: + """Handle a file from search-file results using FileStorage backend.""" + try: + from helper.file_storage import FileStorage + + # Helper to get field from both dict and object + def get_field(obj: Any, field: str, default: Any = None) -> Any: + if isinstance(obj, dict): + return obj.get(field, default) + else: + return getattr(obj, field, default) + + # Extract file information from ResultItem + storage_name = get_field(result, 'origin', None) + # Also check for 'source' field (from add-file and other cmdlets) + if not storage_name: + storage_name = get_field(result, 'source', None) + file_hash = get_field(result, 'hash_hex', None) + # Also check for file_hash field (from add-file and other cmdlets) + if not file_hash: + file_hash = get_field(result, 'file_hash', None) + file_title = get_field(result, 'title', 'file') + mime_type = get_field(result, 'mime', None) + file_path = get_field(result, 'target', None) + # Also check for 'file_path' field (from add-file and other cmdlets) + if not file_path: + file_path = get_field(result, 'file_path', None) + # Also check for 'path' field (from search-file and other cmdlets) + if not file_path: + file_path = get_field(result, 'path', None) + + full_metadata = get_field(result, 'full_metadata', {}) + magnet_id = full_metadata.get('magnet_id') if isinstance(full_metadata, dict) else None + + if not storage_name: + log("Error: No storage backend specified in result", file=sys.stderr) + return 1 + + log(f"[get-file] Retrieving file from storage: {storage_name}", file=sys.stderr) + + # Handle different storage backends + if storage_name.lower() == 'hydrus': + return _handle_hydrus_file(file_hash, file_title, config, args, mime_type=mime_type) + elif storage_name.lower() == 'local': + return _handle_local_file(file_path, file_title, args, 
file_hash=file_hash) + elif storage_name.lower() == 'download': + # Downloads are local files + return _handle_local_file(file_path, file_title, args, file_hash=file_hash) + elif storage_name.lower() == 'debrid': + # Extract magnet_id from result (search-file stores it in full_metadata or as custom attribute) + if not magnet_id: + magnet_id = get_field(result, 'magnet_id', None) + if not magnet_id: + log("Error: No magnet ID in debrid result", file=sys.stderr) + return 1 + return _handle_debrid_file(magnet_id, file_title, config, args) + else: + log(f"Unknown storage backend: {storage_name}", file=sys.stderr) + return 1 + + except Exception as e: + log(f"Error processing search result: {e}", file=sys.stderr) + import traceback + traceback.print_exc(file=sys.stderr) + return 1 + + +def _handle_hydrus_file(file_hash: Optional[str], file_title: str, config: Dict[str, Any], args: Sequence[str], mime_type: Optional[str] = None) -> int: + """Handle file from Hydrus - auto-play in MPV if media file, otherwise open web URL.""" + if not file_hash: + log("Error: No file hash provided", file=sys.stderr) + return 1 + + try: + hydrus_url = get_hydrus_url(config) + access_key = get_hydrus_access_key(config) + + if not hydrus_url or not access_key: + log("Error: Hydrus not configured", file=sys.stderr) + return 1 + + # Check if it's a playable media file based on filename or mime type + is_media = _is_playable_in_mpv(file_title) + if not is_media and mime_type: + # Check mime type if filename check failed + if any(m in mime_type.lower() for m in ['video/', 'audio/', 'image/']): + is_media = True + + force_mpv = any(str(a).lower() in {'-mpv', '--mpv', 'mpv'} for a in args) + force_browser = any(str(a).lower() in {'-web', '--web', 'web', '-browser', '--browser'} for a in args) + + # Check MPV availability + from hydrus_health_check import check_mpv_availability + mpv_available, _ = check_mpv_availability() + + # Construct URLs for streaming/viewing + # For streaming, we use headers for auth, so we don't put the key in the URL + stream_url = f"{hydrus_url}/get_files/file?hash={file_hash}" + # For browser, we still need the key in the URL + web_url = f"{hydrus_url}/get_files/file?hash={file_hash}&Hydrus-Client-API-Access-Key={access_key}" + + headers = { + "Hydrus-Client-API-Access-Key": access_key + } + + if force_browser: + # User explicitly wants browser + ipc_pipe = _get_fixed_ipc_pipe() + result_dict = create_pipe_object_result( + source='hydrus', + identifier=file_hash, + file_path=web_url, + cmdlet_name='get-file', + title=file_title, + file_hash=file_hash, + extra={ + 'ipc': ipc_pipe, + 'action_type': 'browser', + 'web_url': web_url, + 'hydrus_url': hydrus_url, + 'access_key': access_key + } + ) + ctx.emit(result_dict) + try: + import webbrowser + webbrowser.open(web_url) + log(f"[get-file] Opened in browser: {file_title}", file=sys.stderr) + except Exception: + pass + return 0 + elif force_mpv or (is_media and mpv_available): + # Auto-play in MPV for media files (if available), or user requested it + if _play_in_mpv(stream_url, file_title, is_stream=True, headers=headers): + # Emit result as PipeObject-compatible dict for pipelining + ipc_pipe = _get_fixed_ipc_pipe() + result_dict = create_pipe_object_result( + source='hydrus', + identifier=file_hash, + file_path=stream_url, + cmdlet_name='get-file', + title=file_title, + file_hash=file_hash, + extra={ + 'ipc': ipc_pipe, + 'action_type': 'streaming', + 'web_url': web_url, + 'hydrus_url': hydrus_url, + 'access_key': access_key + } + ) + 
ctx.emit(result_dict) + return 0 + else: + # Fall back to browser + try: + import webbrowser + webbrowser.open(web_url) + log(f"[get-file] Opened in browser instead", file=sys.stderr) + except Exception: + pass + return 0 + else: + # Not media, open in browser + ipc_pipe = _get_fixed_ipc_pipe() + result_dict = create_pipe_object_result( + source='hydrus', + identifier=file_hash, + file_path=web_url, + cmdlet_name='get-file', + title=file_title, + file_hash=file_hash, + extra={ + 'ipc': ipc_pipe, + 'action_type': 'browser', + 'web_url': web_url, + 'hydrus_url': hydrus_url, + 'access_key': access_key + } + ) + ctx.emit(result_dict) + try: + import webbrowser + webbrowser.open(web_url) + log(f"[get-file] Opened in browser: {file_title}", file=sys.stderr) + except Exception: + pass + return 0 + + except Exception as e: + log(f"Error handling Hydrus file: {e}", file=sys.stderr) + import traceback + traceback.print_exc(file=sys.stderr) + return 1 + + +def _handle_local_file(file_path: Optional[str], file_title: str, args: Sequence[str], file_hash: Optional[str] = None) -> int: + """Handle file from local storage - auto-play in MPV if media, otherwise open with default app.""" + if not file_path: + log("Error: No file path provided", file=sys.stderr) + return 1 + + try: + source = Path(file_path) + if not source.exists(): + log(f"Error: File not found: {file_path}", file=sys.stderr) + return 1 + + # Check for explicit user flags + force_mpv = any(str(a).lower() in {'-mpv', '--mpv', 'mpv'} for a in args) + force_default = any(str(a).lower() in {'-open', '--open', 'open'} for a in args) + + # Check if it's a playable media file + is_media = _is_playable_in_mpv(str(source)) + + # Check MPV availability + from hydrus_health_check import check_mpv_availability + mpv_available, _ = check_mpv_availability() + + if force_default: + # User explicitly wants default application + import subprocess as sp + import platform + import os + try: + if platform.system() == 'Darwin': # macOS + sp.run(['open', file_path]) + elif platform.system() == 'Windows': + os.startfile(file_path) + else: # Linux + sp.run(['xdg-open', file_path]) + ctx.emit(f"Opened: {file_title}") + log(f"[get-file] Opened {file_title} with default app", file=sys.stderr) + return 0 + except Exception as e: + log(f"Error opening file: {e}", file=sys.stderr) + return 1 + elif force_mpv or (is_media and mpv_available): + # Auto-play in MPV for media files (if available), or user requested it + if _play_in_mpv(file_path, file_title, is_stream=False): + # Emit result as PipeObject-compatible dict for pipelining + ipc_pipe = _get_fixed_ipc_pipe() + result_dict = create_pipe_object_result( + source='local', + identifier=str(Path(file_path).stem) if file_path else 'unknown', + file_path=file_path, + cmdlet_name='get-file', + title=file_title, + file_hash=file_hash, # Include hash from search result if available + extra={ + 'ipc': ipc_pipe, # MPV IPC pipe for Lua script control + 'action_type': 'playing' # Distinguish from other get-file actions + } + ) + ctx.emit(result_dict) + return 0 + else: + # Fall back to default application + try: + import os + import platform + if platform.system() == 'Darwin': # macOS + _subprocess.run(['open', file_path]) + elif platform.system() == 'Windows': + os.startfile(file_path) + else: # Linux + _subprocess.run(['xdg-open', file_path]) + log(f"[get-file] Opened with default app instead", file=sys.stderr) + except Exception: + pass + return 0 + else: + # Not media - open with default application + import subprocess as 
sp + import platform + import os + try: + if platform.system() == 'Darwin': # macOS + sp.run(['open', file_path]) + elif platform.system() == 'Windows': + # Use os.startfile for more reliable Windows handling + os.startfile(file_path) + else: # Linux + sp.run(['xdg-open', file_path]) + print(f"Opened: {file_title}") + log(f"[get-file] Opened {file_title} with default app", file=sys.stderr) + + # Emit result for downstream processing + result_dict = create_pipe_object_result( + source='local', + identifier=str(Path(file_path).stem) if file_path else 'unknown', + file_path=file_path, + cmdlet_name='get-file', + title=file_title, + file_hash=file_hash, + extra={'action_type': 'opened'} + ) + ctx.emit(result_dict) + return 0 + except Exception as e: + log(f"Error opening file with default app: {e}", file=sys.stderr) + return 1 + + except Exception as e: + log(f"Error handling local file: {e}", file=sys.stderr) + return 1 + + +def _handle_debrid_file(magnet_id: int, magnet_title: str, config: Dict[str, Any], args: Sequence[str]) -> int: + """Handle magnet file from AllDebrid storage - download to local path.""" + # Parse output path argument + out_path = None + i = 0 + args_list = [str(a) for a in args] + while i < len(args_list): + if args_list[i].lower() in {"-path", "--path", "path"} and i + 1 < len(args_list): + out_path = Path(args_list[i + 1]).expanduser() + i += 2 + else: + i += 1 + + if not out_path: + log("✗ -Path required for debrid downloads", file=sys.stderr) + return 1 + + # Ensure output directory exists + try: + out_path.mkdir(parents=True, exist_ok=True) + except Exception as e: + log(f"✗ Error creating output directory: {e}", file=sys.stderr) + return 1 + + # Get API key + from config import get_debrid_api_key + api_key = get_debrid_api_key(config) + if not api_key: + log("✗ AllDebrid API key not configured in config.json", file=sys.stderr) + return 1 + + try: + client = AllDebridClient(api_key) + + log(f"[get-file] Downloading magnet {magnet_id}: {magnet_title}", file=sys.stderr) + + # Fetch magnet files + try: + magnet_info = client.magnet_status(magnet_id, include_files=True) + except Exception as e: + log(f"✗ Failed to fetch magnet files: {e}", file=sys.stderr) + return 1 + + # Extract files from magnet + files_list = _extract_files_from_magnet(magnet_info) + + if not files_list: + log(f"✗ No files in magnet {magnet_id}", file=sys.stderr) + return 1 + + log(f"✓ Found {len(files_list)} file(s) in magnet {magnet_id}", file=sys.stderr) + + # Download each file + total_files = 0 + failed_files = 0 + + for file_info in files_list: + try: + link = file_info['link'] + filename = file_info['name'] + file_size = file_info['size'] + + # Unlock link to get direct URL + try: + direct_url = client.unlock_link(link) + if not direct_url: + log(f"✗ Failed to unlock link for {filename}", file=sys.stderr) + failed_files += 1 + continue + except Exception as e: + log(f"✗ Error unlocking link: {e}", file=sys.stderr) + failed_files += 1 + continue + + # Download file + output_file = out_path / filename + if _download_file_from_alldebrid(direct_url, output_file, filename, file_size): + log(f"✓ Downloaded: {filename}", file=sys.stderr) + total_files += 1 + else: + log(f"✗ Failed to download: {filename}", file=sys.stderr) + failed_files += 1 + + except Exception as e: + log(f"✗ Error downloading file: {e}", file=sys.stderr) + failed_files += 1 + + log(f"✓ Download complete: {total_files} file(s) downloaded, {failed_files} failed", file=sys.stderr) + + if total_files > 0: + # Emit result for 
downstream processing + result_dict = create_pipe_object_result( + source='debrid', + identifier=str(magnet_id), + file_path=str(out_path), + cmdlet_name='get-file', + title=magnet_title, + extra={ + 'magnet_id': magnet_id, + 'files_downloaded': total_files, + 'download_dir': str(out_path) + } + ) + ctx.emit(result_dict) + + return 0 if failed_files == 0 else 1 + + except Exception as e: + log(f"✗ Error processing debrid download: {e}", file=sys.stderr) + import traceback + traceback.print_exc(file=sys.stderr) + return 1 + + +@register(["get-file"]) # primary name +def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: + # Help: if any help token is present, print CMDLET JSON and exit + try: + if any(str(a).lower() in {"-?", "/?", "--help", "-h", "help", "--cmdlet"} for a in args): + log(json.dumps(CMDLET, ensure_ascii=False, indent=2)) + return 0 + except Exception: + pass + + # Helper to get field from both dict and object + def get_field(obj: Any, field: str, default: Any = None) -> Any: + if isinstance(obj, dict): + return obj.get(field, default) + else: + return getattr(obj, field, default) + + # Check if result is a list (from @N selection) and extract the first item + actual_result = result + if isinstance(result, list) and len(result) > 0: + actual_result = result[0] + + # Check if this is a FileStorage search result (has origin field indicating a backend) + # This handles both dict and ResultItem objects + origin = get_field(actual_result, 'origin', None) + # Also check for 'source' field (from add-file and other cmdlets) + if not origin: + origin = get_field(actual_result, 'source', None) + if origin and origin.lower() in {'hydrus', 'local', 'debrid', 'alldebrid'}: + # This is a search result with explicit origin - handle it via _handle_search_result + return _handle_search_result(actual_result, args, config) + + # Handle ResultItem from search-file via @N selection + # The result can be either: + # 1. A single ResultItem (direct call) + # 2. 
A list of ResultItems (from @N selection in CLI) + result_item = None + if result and hasattr(result, '__class__'): + if result.__class__.__name__ == 'ResultItem': + result_item = result + elif isinstance(result, list) and len(result) > 0: + # @N selection creates a list, extract the first item if it's a ResultItem + if hasattr(result[0], '__class__') and result[0].__class__.__name__ == 'ResultItem': + result_item = result[0] + + if result_item: + return _handle_search_result(result_item, args, config) + + # Handle PipeObject results from previous get-file call (for chaining) + if result and isinstance(result, dict) and result.get('action', '').startswith('cmdlet:get-file'): + # This is from a previous get-file result - just pass it through + # Don't treat it as a new file to play, just emit for pipeline chaining + ctx.emit(result) + return 0 + + # Check for AllDebrid pipe input (from search-debrid) + # Try to read first line from stdin to detect format + first_line = None + try: + # Try to read one line without blocking + if hasattr(sys.stdin, 'readable') and sys.stdin.readable(): + first_line = sys.stdin.readline().strip() + except Exception: + pass + + if first_line and _is_alldebrid_pipe_data(first_line): + # This is AllDebrid pipe data - handle it separately + # Put the line back by creating a chain with the rest of stdin + import io + try: + remaining_stdin = sys.stdin.read() + except: + remaining_stdin = "" + sys.stdin = io.StringIO(first_line + '\n' + remaining_stdin) + return _handle_alldebrid_pipe(config, args) + elif first_line: + # Not AllDebrid data, put it back for normal processing + import io + try: + remaining_stdin = sys.stdin.read() + except: + remaining_stdin = "" + sys.stdin = io.StringIO(first_line + '\n' + remaining_stdin) + + # Helpers + def _sanitize_name(text: str) -> str: + allowed = [] + for ch in text: + allowed.append(ch if (ch.isalnum() or ch in {"-", "_", " ", "."}) else " ") + return (" ".join("".join(allowed).split()) or "export").strip() + + def _ffprobe_duration_seconds(path: Path) -> Optional[float]: + ffprobe_path = _shutil.which('ffprobe') + if not ffprobe_path: + return None + try: + res = _subprocess.run( + [ffprobe_path, '-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', str(path)], + stdout=_subprocess.PIPE, + stderr=_subprocess.PIPE, + check=True, + text=True, + ) + out = (res.stdout or '').strip() + if not out: + return None + value = float(out) + return value if value > 0 else None + except Exception: + return None + + def _parse_args(tokens: Sequence[str]) -> tuple[Optional[Path], Optional[str], Optional[str], Optional[str], bool]: + out_override: Optional[Path] = None + size_spec: Optional[str] = None + convert_spec: Optional[str] = None + hash_spec: Optional[str] = None + export_metadata: bool = False + i = 0 + while i < len(tokens): + t = tokens[i] + low = t.lower() + if low in {"-path", "--path", "path"} and i + 1 < len(tokens): + try: + out_override = Path(tokens[i + 1]).expanduser() + except Exception: + out_override = None + i += 2 + continue + if low in {"size", "-size", "--size"} and i + 1 < len(tokens): + size_spec = tokens[i + 1] + i += 2 + continue + if low in {"convert", "-convert", "--convert"} and i + 1 < len(tokens): + convert_spec = tokens[i + 1] + i += 2 + continue + if low in {"-hash", "--hash", "hash"} and i + 1 < len(tokens): + hash_spec = tokens[i + 1] + i += 2 + continue + if low in {"-metadata", "--metadata", "metadata"}: + export_metadata = True + i += 1 + continue + i += 1 
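+        # Hypothetical example of how the loop above resolves tokens: an input like
+        #   ["-path", "~/exports", "size", "50%", "convert", "mp3", "-metadata"]
+        # yields (Path("~/exports").expanduser(), "50%", "mp3", None, True);
+        # unrecognised tokens are skipped rather than treated as errors.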
+ return out_override, size_spec, convert_spec, hash_spec, export_metadata + + def _compute_target_bytes(size_spec: Optional[str], source_bytes: int) -> Optional[int]: + if not size_spec: + return None + text = str(size_spec).strip().lower() + if not text: + return None + if text.endswith('%'): + try: + pct = float(text[:-1]) + except ValueError: + return None + pct = max(0.0, min(100.0, pct)) + target = int(round(source_bytes * (pct / 100.0))) + else: + val = text + if val.endswith('mb'): + val = val[:-2] + elif val.endswith('m'): + val = val[:-1] + try: + mb = float(val) + except ValueError: + return None + target = int(round(mb * 1024 * 1024)) + min_bytes = 1 * 1024 * 1024 + if target <= 0: + target = min_bytes + return min(target, source_bytes) + + def _guess_kind_from_suffix(path: Path) -> str: + sfx = path.suffix.lower() + if sfx in {'.mp4', '.mkv', '.webm', '.mov', '.avi', '.flv', '.mpg', '.mpeg', '.ts', '.m4v', '.wmv'}: + return 'video' + if sfx in {'.mp3', '.flac', '.wav', '.m4a', '.aac', '.ogg', '.opus', '.mka'}: + return 'audio' + return 'other' + + def _extract_metadata_from_tags(tags_payload: Dict[str, Any], file_hash: str, input_kind: str = '') -> Dict[str, str]: + """Extract common metadata fields from Hydrus tags. + + Returns a dict mapping FFmpeg metadata keys to values. + Supports: title, artist, album, track, date, genre, etc. + + For audio files, applies sensible defaults: + - If no album, uses title as album + - If no track, defaults to 1 + - album_artist is set to artist value + """ + metadata = {} + + # Map of common tag namespaces to FFmpeg metadata keys + tag_map = { + 'title': 'title', + 'artist': 'artist', + 'album': 'album', + 'track': 'track', + 'track_number': 'track', + 'date': 'date', + 'year': 'date', + 'genre': 'genre', + 'composer': 'composer', + 'comment': 'comment', + } + + if not tags_payload or 'metadata' not in tags_payload or not tags_payload['metadata']: + return metadata + + entry = tags_payload['metadata'][0] + if 'tags' not in entry or not isinstance(entry['tags'], dict): + return metadata + + tags_dict = entry['tags'] + + # Extract metadata from tags + for _service_key, service_data in tags_dict.items(): + if not isinstance(service_data, dict): + continue + + display_tags = service_data.get('display_tags', {}) + if not isinstance(display_tags, dict): + continue + + current_tags = display_tags.get('0', []) + if not isinstance(current_tags, list): + continue + + for tag in current_tags: + tag_str = str(tag).strip() + if ':' in tag_str: + namespace, value = tag_str.split(':', 1) + namespace = namespace.lower().strip() + value = value.strip() + if namespace in tag_map and value: + ffmpeg_key = tag_map[namespace] + # Use first occurrence + if ffmpeg_key not in metadata: + metadata[ffmpeg_key] = value + + # Apply sensible defaults for audio files + if input_kind == 'audio': + # If no album, use title as album + if 'album' not in metadata and 'title' in metadata: + metadata['album'] = metadata['title'] + # If no track, default to 1 + if 'track' not in metadata: + metadata['track'] = '1' + # If no album_artist, use artist + if 'artist' in metadata: + metadata['album_artist'] = metadata['artist'] + + return metadata + + out_override, size_spec, convert_spec, hash_spec, export_metadata = _parse_args(args) + default_dir = resolve_output_dir(config) + + media_kind = (get_field(result, 'media_kind', '') or '').lower() + + _chk = [] + if out_override: + _chk.append(f"Path={out_override}") + if size_spec: + _chk.append(f"Size={size_spec}") + if convert_spec: 
+ _chk.append(f"Convert={convert_spec}") + # Prefer explicit -hash over result hash for logging + file_hash_for_log = None + if hash_spec and looks_like_hash(hash_spec): + file_hash_for_log = normalize_hash(hash_spec) + else: + hash_value = get_field(result, 'hash_hex', None) + file_hash_for_log = normalize_hash(hash_value) if hash_value else None + if _chk or file_hash_for_log: + msg = "get-file: " + ", ".join(_chk) if _chk else "get-file" + if file_hash_for_log: + msg = f"{msg} (Hash={file_hash_for_log})" + ctx.emit(msg) + + base_name = _sanitize_name(get_field(result, 'title', None) or '') + if not base_name: + target_attr = get_field(result, 'target', None) + if isinstance(target_attr, str) and target_attr and not target_attr.startswith(('http://', 'https://')): + base_name = _sanitize_name(Path(target_attr).stem) + else: + base_name = 'export' + + local_target = get_field(result, 'target', None) + is_url = isinstance(local_target, str) and local_target.startswith(('http://', 'https://')) + # Establish file hash (prefer -hash override when provided and valid) + if hash_spec and looks_like_hash(hash_spec): + file_hash = normalize_hash(hash_spec) + else: + file_hash = normalize_hash(get_field(result, 'hash_hex', None)) if get_field(result, 'hash_hex', None) else None + + source_path: Optional[Path] = None + source_size: Optional[int] = None + duration_sec: Optional[float] = None + tags_payload: Dict[str, Any] = {} + urls_payload: Dict[str, Any] = {} + cleanup_source: bool = False + + if isinstance(local_target, str) and not is_url and not (hash_spec and file_hash): + p = Path(local_target) + if not p.exists(): + log(f"File missing: {p}") + return 1 + source_path = p + try: + source_size = p.stat().st_size + except OSError: + source_size = None + duration_sec = _ffprobe_duration_seconds(p) + if file_hash is None: + for sc in (p.with_suffix('.tags'), p.with_suffix('.tags.txt')): + try: + if sc.exists(): + text = sc.read_text(encoding='utf-8', errors='ignore') + for line in text.splitlines(): + ls = line.strip().lower() + if ls.startswith('hash:'): + candidate = line.split(':', 1)[1].strip() if ':' in line else '' + if looks_like_hash(candidate): + file_hash = candidate.lower() + break + except OSError: + pass + elif file_hash: + try: + client = hydrus_wrapper.get_client(config) + except Exception as exc: + log(f"Hydrus client unavailable: {exc}") + return 1 + + if client is None: + log("Hydrus client unavailable") + return 1 + + # Fetch metadata and tags (needed for both -metadata flag and audio tagging) + # Fetch tags + try: + tags_payload = client.fetch_file_metadata(hashes=[file_hash], include_service_keys_to_tags=True) + except Exception: + tags_payload = {} + + # Fetch URLs + try: + urls_payload = client.fetch_file_metadata(hashes=[file_hash], include_file_urls=True) + except Exception: + urls_payload = {} + + # Extract title from metadata if base_name is still 'export' + if base_name == 'export' and tags_payload: + try: + file_metadata = tags_payload.get('file_metadata', []) + if file_metadata and isinstance(file_metadata, list) and len(file_metadata) > 0: + meta = file_metadata[0] + if isinstance(meta, dict): + tags_dict = meta.get('tags', {}) + if isinstance(tags_dict, dict): + # Look for title in storage tags + for service in tags_dict.values(): + if isinstance(service, dict): + storage = service.get('storage_tags', {}) + if isinstance(storage, dict): + for tag_list in storage.values(): + if isinstance(tag_list, list): + for tag in tag_list: + if isinstance(tag, str) and 
tag.lower().startswith('title:'): + title_val = tag.split(':', 1)[1].strip() + if title_val: + base_name = _sanitize_name(title_val) + break + if base_name != 'export': + break + if base_name != 'export': + break + except Exception: + pass + + # Normal file export (happens regardless of -metadata flag) + try: + from downlow_helpers.hydrus import hydrus_export as _hydrus_export + except Exception: + _hydrus_export = None # type: ignore + if _hydrus_export is None: + log("Hydrus export helper unavailable") + return 1 + download_dir = out_override if (out_override and out_override.is_dir()) else default_dir + try: + download_dir.mkdir(parents=True, exist_ok=True) + except Exception: + # If mkdir fails, fall back to default_dir + download_dir = default_dir + + # Verify the directory is writable; if not, fall back to default + try: + test_file = download_dir / f".downlow_write_test_{_uuid.uuid4().hex[:8]}" + test_file.touch() + test_file.unlink() + except (OSError, PermissionError): + # Directory is not writable, use default_dir instead + download_dir = default_dir + try: + download_dir.mkdir(parents=True, exist_ok=True) + except Exception: + pass + token = (_uuid.uuid4().hex[:8]) + provisional_stem = f"{base_name}.dlhx_{token}" + provisional = download_dir / f"{provisional_stem}.bin" + class _Args: + pass + args_obj = _Args() + setattr(args_obj, 'output', provisional) + setattr(args_obj, 'format', 'copy') + setattr(args_obj, 'tmp_dir', str(download_dir)) + setattr(args_obj, 'metadata_json', None) + setattr(args_obj, 'hydrus_url', get_hydrus_url(config, "home") or "http://localhost:45869") + setattr(args_obj, 'access_key', get_hydrus_access_key(config, "home") or "") + setattr(args_obj, 'timeout', float(config.get('HydrusNetwork_Request_Timeout') or 60.0)) + try: + file_url = client.file_url(file_hash) + except Exception: + file_url = None + setattr(args_obj, 'file_url', file_url) + setattr(args_obj, 'file_hash', file_hash) + import io as _io, contextlib as _contextlib + _buf = _io.StringIO() + status = 1 + with _contextlib.redirect_stdout(_buf): + status = _hydrus_export(args_obj, None) + if status != 0: + stderr_text = _buf.getvalue().strip() + if stderr_text: + log(stderr_text) + return status + json_text = _buf.getvalue().strip().splitlines()[-1] if _buf.getvalue() else '' + final_from_json: Optional[Path] = None + try: + payload = json.loads(json_text) if json_text else None + if isinstance(payload, dict): + outp = payload.get('output') + if isinstance(outp, str) and outp: + final_from_json = Path(outp) + except Exception: + final_from_json = None + if final_from_json and final_from_json.exists(): + source_path = final_from_json + else: + candidates = [p for p in provisional.parent.glob(provisional_stem + '*') if p.exists() and p.is_file()] + non_provisional = [p for p in candidates if p.suffix.lower() not in {'.bin', '.hydrus'}] + pick_from = non_provisional if non_provisional else candidates + if pick_from: + try: + source_path = max(pick_from, key=lambda p: p.stat().st_mtime) + except Exception: + source_path = pick_from[0] + else: + source_path = provisional + candidates = [p for p in provisional.parent.glob(provisional_stem + '*') if p.exists() and p.is_file()] + non_provisional = [p for p in candidates if p.suffix.lower() not in {'.bin', '.hydrus'}] + pick_from = non_provisional if non_provisional else candidates + if pick_from: + try: + source_path = max(pick_from, key=lambda p: p.stat().st_mtime) + except Exception: + source_path = pick_from[0] + else: + source_path = provisional + 
try: + source_size = source_size or (source_path.stat().st_size if source_path.exists() else None) + except OSError: + source_size = source_size + if duration_sec is None: + duration_sec = _ffprobe_duration_seconds(source_path) + cleanup_source = True + else: + log("Selected result is neither a local file nor a Hydrus record") + return 1 + + convert = (str(convert_spec or '').strip().lower()) + if convert not in {'', 'copy', 'mp4', 'webm', 'audio', 'mp3', 'opus'}: + log(f"Unsupported Convert value: {convert_spec}") + return 1 + if not convert: + convert = 'copy' + input_kind = media_kind or _guess_kind_from_suffix(source_path) + if input_kind == 'audio' and convert in {'mp4', 'webm'}: + log("Cannot convert audio to video") + return 1 + + def _ext_for_convert(conv: str, src: Path) -> str: + if conv == 'mp4': + return '.mp4' + if conv == 'webm': + return '.webm' + if conv in {'audio', 'mp3'}: + return '.mp3' + if conv == 'opus': + return '.opus' + return src.suffix or '' + + auto_named = True + if out_override is not None and out_override.exists() and out_override.is_dir(): + dest_dir = out_override + dest_ext = _ext_for_convert(convert, source_path) + dest_path = dest_dir / f"{base_name}{dest_ext}" + else: + dest_dir = default_dir + dest_ext = _ext_for_convert(convert, source_path) + if out_override and not out_override.exists() and not str(out_override).endswith(('/', '\\')): + dest_path = out_override + auto_named = False + else: + dest_path = (dest_dir / f"{base_name}{dest_ext}") + + if source_size is None: + try: + source_size = source_path.stat().st_size + except OSError: + source_size = None + if source_size is None: + log("Unable to determine source size for sizing logic; proceeding without Size targeting") + target_bytes = None + else: + target_bytes = _compute_target_bytes(size_spec, int(source_size)) + if target_bytes and (source_size or 0): + try: + from ..downlow import _fmt_bytes as _fmt_bytes_helper + except ImportError: + try: + from downlow import _fmt_bytes as _fmt_bytes_helper # type: ignore + except ImportError: + _fmt_bytes_helper = lambda x: f"{x} bytes" # type: ignore + except Exception: + _fmt_bytes_helper = lambda x: f"{x} bytes" # type: ignore + ctx.emit(f"Resizing target: {_fmt_bytes_helper(source_size)} -> {_fmt_bytes_helper(target_bytes)}") + + cleanup_source = locals().get('cleanup_source', False) + if convert == 'copy' and (not target_bytes or target_bytes >= (source_size or 0)): + # Simple copy without FFmpeg processing + # Only skip this if we need to write metadata (then FFmpeg handles it) + if not (export_metadata or (tags_payload and tags_payload.get('metadata'))): + try: + dest_path.parent.mkdir(parents=True, exist_ok=True) + final_dest = _unique_path(dest_path) + _shutil.copy2(source_path, final_dest) + ctx.emit(f"Exported to {final_dest}") + log(f"Exported: {final_dest}", file=sys.stderr) + if cleanup_source: + try: + if source_path.exists() and source_path != final_dest: + source_path.unlink() + except OSError: + pass + + return 0 + except Exception as exc: + log(f"Copy failed: {exc}") + return 1 + else: + # Metadata exists, so we need to go through FFmpeg to embed and write sidecar + # Fall through to FFmpeg section below + pass + + convert_effective = convert + if convert == 'copy' and target_bytes and (source_size or 0) > target_bytes: + if input_kind == 'video': + convert_effective = 'mp4' + elif input_kind == 'audio': + convert_effective = 'copy' + else: + convert_effective = convert + + ffmpeg_path = _shutil.which('ffmpeg') + if not ffmpeg_path: 
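+        # ffmpeg is required beyond this point (conversion, size targeting and
+        # metadata embedding are all driven by an ffmpeg command), so fail early.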
+ log("ffmpeg executable not found in PATH") + return 1 + + # Extract metadata from tags to embed in file + file_metadata = _extract_metadata_from_tags(tags_payload, file_hash or '', input_kind) + if file_metadata: + metadata_msg = ', '.join(f'{k}={v}' for k, v in file_metadata.items()) + ctx.emit(f"[metadata] Embedding: {metadata_msg}") + ctx.print_if_visible(f"[get-file] Embedding metadata: {metadata_msg}", file=sys.stderr) + else: + ctx.print_if_visible(f"[get-file] No metadata tags found to embed", file=sys.stderr) + + cmd: list[str] = [ffmpeg_path, '-y', '-i', str(source_path)] + + # Add metadata flags to FFmpeg command + for key, value in file_metadata.items(): + cmd.extend(['-metadata', f'{key}={value}']) + + conv = convert_effective + if conv in {'mp4', 'webm', 'copy'}: + video_bitrate: Optional[int] = None + audio_bitrate: int = 128_000 + if target_bytes and duration_sec and duration_sec > 0: + total_bps = max(1, int((target_bytes * 8) / duration_sec)) + if total_bps <= audio_bitrate + 50_000: + if input_kind == 'video': + video_bitrate = max(50_000, total_bps - audio_bitrate) + else: + video_bitrate = None + else: + video_bitrate = total_bps - audio_bitrate + if conv == 'webm': + cmd += ['-c:v', 'libvpx-vp9'] + if video_bitrate: + cmd += ['-b:v', str(video_bitrate)] + else: + cmd += ['-b:v', '0', '-crf', '32'] + cmd += ['-c:a', 'libopus', '-b:a', '160k'] + elif conv == 'mp4' or (conv == 'copy' and input_kind == 'video'): + cmd += ['-c:v', 'libx265', '-preset', 'medium', '-tag:v', 'hvc1', '-pix_fmt', 'yuv420p'] + if video_bitrate: + cmd += ['-b:v', str(video_bitrate)] + else: + cmd += ['-crf', '26'] + cmd += ['-c:a', 'aac', '-b:a', '192k'] + if conv == 'mp4' or (conv == 'copy' and input_kind == 'video'): + cmd += ['-movflags', '+faststart'] + if convert_spec and conv != 'copy': + ctx.emit(f"Converting video -> {conv} (duration={duration_sec or 'unknown'}s)") + else: + if target_bytes and duration_sec and duration_sec > 0: + total_bps = max(1, int((target_bytes * 8) / duration_sec)) + abr = max(32_000, min(320_000, total_bps)) + else: + abr = 192_000 + if conv in {'audio', 'mp3'}: + cmd += ['-vn', '-c:a', 'libmp3lame', '-b:a', str(abr)] + elif conv == 'opus': + cmd += ['-vn', '-c:a', 'libopus', '-b:a', str(abr)] + else: + ext = (source_path.suffix.lower() if source_path else '') + if ext in {'.mp3'}: + cmd += ['-vn', '-c:a', 'libmp3lame', '-b:a', str(abr)] + elif ext in {'.opus', '.ogg'}: + cmd += ['-vn', '-c:a', 'libopus', '-b:a', str(abr)] + elif ext in {'.m4a', '.aac'}: + cmd += ['-vn', '-c:a', 'aac', '-b:a', str(abr)] + else: + cmd += ['-vn', '-c:a', 'libmp3lame', '-b:a', str(abr)] + if convert_spec and conv != 'copy': + ctx.emit(f"Converting audio -> {conv}") + + if conv in {'audio','mp3'}: + desired_ext = '.mp3' + elif conv == 'opus': + desired_ext = '.opus' + elif conv == 'webm': + desired_ext = '.webm' + elif conv == 'mp4': + desired_ext = '.mp4' + else: + desired_ext = source_path.suffix + if (not dest_path.suffix) or auto_named or (dest_path.suffix.lower() in {'.hydrus', '.bin'}): + dest_path = dest_path.with_suffix(desired_ext) + + suffix_parts: list[str] = [] + def _size_label(raw: Optional[str], tb: Optional[int]) -> Optional[str]: + if not raw: + return None + text = str(raw).strip() + if text.endswith('%'): + return text + if not tb: + return None + mb = int(round(tb / (1024*1024))) + return f"{mb}Mb" + label = _size_label(size_spec, locals().get('target_bytes')) + if label: + suffix_parts.append(label) + if convert_spec and convert.lower() != 'copy': + label_map = 
{'mp4':'MP4','webm':'WEBM','audio':'AUDIO','mp3':'MP3','opus':'OPUS'} + suffix_parts.append(label_map.get(convert.lower(), convert.upper())) + if suffix_parts and auto_named: + _aug = f"{base_name} (" + ",".join(suffix_parts) + ")" + dest_path = dest_path.with_name(_aug + dest_path.suffix) + + try: + dest_path.parent.mkdir(parents=True, exist_ok=True) + final_dest = _unique_path(dest_path) + cmd.append(str(final_dest)) + completed = _subprocess.run(cmd, stdout=_subprocess.PIPE, stderr=_subprocess.PIPE, text=True) + if completed.returncode != 0: + stderr = (completed.stderr or '').strip() + log(f"ffmpeg failed ({completed.returncode}): {stderr}") + return 1 + ctx.emit(f"Exported to {final_dest}") + log(f"Exported: {final_dest}", file=sys.stderr) + + # Always write the .tags sidecar with metadata (hash, tags, URLs) + # This ensures metadata is preserved even if FFmpeg embedding didn't work + try: + metadata_lines = [] + + # Add hash + if file_hash: + metadata_lines.append(f"hash:{file_hash}") + + # Extract tags from metadata payload using correct structure + tags_set = set() + if 'metadata' in tags_payload and tags_payload['metadata']: + entry = tags_payload['metadata'][0] + if 'tags' in entry and isinstance(entry['tags'], dict): + for _service_key, service_data in entry['tags'].items(): + if isinstance(service_data, dict): + display_tags = service_data.get('display_tags', {}) + if isinstance(display_tags, dict): + current_tags = display_tags.get('0', []) + if isinstance(current_tags, list): + tags_set.update(current_tags) + + # Add tags (sorted, no prefix) + for tag in sorted(tags_set): + metadata_lines.append(tag) + + # Extract and add URLs + if 'metadata' in urls_payload and urls_payload['metadata']: + entry = urls_payload['metadata'][0] + if 'known_urls' in entry and isinstance(entry['known_urls'], list): + for url in entry['known_urls']: + metadata_lines.append(f"known_url:{url}") + + # Write sidecar if we have any metadata + if metadata_lines: + sidecar_path = final_dest.parent / f"{final_dest.name}.tags" + sidecar_path.write_text('\n'.join(metadata_lines), encoding='utf-8') + ctx.emit(f"Sidecar: {sidecar_path.name}") + log(f"Tags file: {sidecar_path}", file=sys.stderr) + except Exception as exc: + log(f"Warning: Could not write metadata sidecar: {exc}", file=sys.stderr) + + if cleanup_source: + try: + if source_path.exists() and source_path != final_dest: + source_path.unlink() + except OSError: + pass + return 0 + except Exception as exc: + log(f"Export failed: {exc}") + return 1 + + +def _unique_path(p: Path) -> Path: + if not p.exists(): + return p + stem = p.stem + suffix = p.suffix + parent = p.parent + for i in range(1, 1000): + candidate = parent / f"{stem} ({i}){suffix}" + if not candidate.exists(): + return candidate + return p + + +CMDLET = Cmdlet( + name="get-file", + summary="Export files: from Hydrus database OR from AllDebrid magnets via pipe. 
Auto-detects source and handles accordingly.", + usage="get-file [-Path ] [Size <50%|34MB>] [Convert ] [-metadata] [-file ]", + args=[ + CmdletArg("Path", description="Output directory for files."), + CmdletArg("Size", description="Target size (Hydrus only): 50% or 34MB."), + CmdletArg("Convert", description="Convert format (Hydrus only): mp4, webm, audio, mp3, opus."), + CmdletArg("metadata", type="flag", description="Export metadata to .tags file (Hydrus only)."), + CmdletArg("file", description="Filter files by pattern (AllDebrid only)."), + ], + details=[ + "Hydrus mode: exports media with optional size/format conversion", + "AllDebrid mode: downloads files from piped magnet IDs from search-debrid", + "Auto-detects pipe format and routes to correct handler", + "Magnet pipe format: ID|filename|size|statusCode|status|progress|...", + ], + +) \ No newline at end of file diff --git a/cmdlets/get_metadata.py b/cmdlets/get_metadata.py new file mode 100644 index 0000000..ac3b4f0 --- /dev/null +++ b/cmdlets/get_metadata.py @@ -0,0 +1,246 @@ +from __future__ import annotations + +from typing import Any, Dict, Sequence, Optional +import json +import sys + +from helper.logger import log +from pathlib import Path +import mimetypes +import os + +from helper import hydrus as hydrus_wrapper +from ._shared import Cmdlet, CmdletArg, normalize_hash + + +def _run(result: Any, _args: Sequence[str], config: Dict[str, Any]) -> int: + # Help + try: + if any(str(a).lower() in {"-?", "/?", "--help", "-h", "help", "--cmdlet"} for a in _args): + log(json.dumps(CMDLET.to_dict(), ensure_ascii=False, indent=2)) + return 0 + except Exception: + pass + + # Helper to get field from both dict and object + def get_field(obj: Any, field: str, default: Any = None) -> Any: + if isinstance(obj, dict): + return obj.get(field, default) + else: + return getattr(obj, field, default) + + # Parse -hash override + override_hash: str | None = None + args_list = list(_args) + i = 0 + while i < len(args_list): + a = args_list[i] + low = str(a).lower() + if low in {"-hash", "--hash", "hash"} and i + 1 < len(args_list): + override_hash = str(args_list[i + 1]).strip() + break + i += 1 + + # Try to determine if this is a local file or Hydrus file + local_path = get_field(result, "target", None) or get_field(result, "path", None) + is_local = False + if local_path and isinstance(local_path, str) and not local_path.startswith(("http://", "https://")): + is_local = True + + # LOCAL FILE PATH + if is_local and local_path: + try: + file_path = Path(str(local_path)) + if file_path.exists() and file_path.is_file(): + # Get the hash from result or compute it + hash_hex = normalize_hash(override_hash) if override_hash else normalize_hash(get_field(result, "hash_hex", None)) + + # If no hash, compute SHA256 of the file + if not hash_hex: + try: + import hashlib + with open(file_path, 'rb') as f: + hash_hex = hashlib.sha256(f.read()).hexdigest() + except Exception: + hash_hex = None + + # Get MIME type + mime_type, _ = mimetypes.guess_type(str(file_path)) + if not mime_type: + mime_type = "unknown" + + # Get file size + try: + file_size = file_path.stat().st_size + except Exception: + file_size = None + + # Try to get duration if it's a media file + duration_seconds = None + try: + # Try to use ffprobe if available + import subprocess + result_proc = subprocess.run( + ["ffprobe", "-v", "error", "-select_streams", "v:0", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", str(file_path)], + capture_output=True, + 
text=True, + timeout=5 + ) + if result_proc.returncode == 0 and result_proc.stdout.strip(): + try: + duration_seconds = float(result_proc.stdout.strip()) + except ValueError: + pass + except Exception: + pass + + # Get format helpers from search module + try: + from .search_file import _format_size as _fmt_size + from .search_file import _format_duration as _fmt_dur + except Exception: + _fmt_size = lambda x: str(x) if x is not None else "" + _fmt_dur = lambda x: str(x) if x is not None else "" + + size_label = _fmt_size(file_size) if file_size is not None else "" + dur_label = _fmt_dur(duration_seconds) if duration_seconds is not None else "" + + # Get known URLs from sidecar or result + urls = [] + sidecar_path = Path(str(file_path) + '.tags') + if sidecar_path.exists(): + try: + with open(sidecar_path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + if line.startswith('known_url:'): + url_value = line.replace('known_url:', '', 1).strip() + if url_value: + urls.append(url_value) + except Exception: + pass + + # Fallback to result URLs if not in sidecar + if not urls: + urls_from_result = get_field(result, "known_urls", None) or get_field(result, "urls", None) + if isinstance(urls_from_result, list): + urls.extend([str(u).strip() for u in urls_from_result if u]) + + # Display local file metadata + log(f"PATH: {file_path}") + if hash_hex: + log(f"HASH: {hash_hex}") + if mime_type: + log(f"MIME: {mime_type}") + if size_label: + log(f"Size: {size_label}") + if dur_label: + log(f"Duration: {dur_label}") + if urls: + log("URLs:") + for url in urls: + log(f" {url}") + + return 0 + except Exception as exc: + # Fall through to Hydrus if local file handling fails + pass + + # HYDRUS PATH + hash_hex = normalize_hash(override_hash) if override_hash else normalize_hash(get_field(result, "hash_hex", None)) + if not hash_hex: + log("Selected result does not include a Hydrus hash or local path", file=sys.stderr) + return 1 + + try: + client = hydrus_wrapper.get_client(config) + except Exception as exc: + log(f"Hydrus client unavailable: {exc}", file=sys.stderr) + return 1 + + if client is None: + log("Hydrus client unavailable", file=sys.stderr) + return 1 + + try: + payload = client.fetch_file_metadata( + hashes=[hash_hex], + include_service_keys_to_tags=False, + include_file_urls=True, + include_duration=True, + include_size=True, + include_mime=True, + ) + except Exception as exc: + log(f"Hydrus metadata fetch failed: {exc}", file=sys.stderr) + return 1 + + items = payload.get("metadata") if isinstance(payload, dict) else None + if not isinstance(items, list) or not items: + log("No metadata found.") + return 0 + + meta = items[0] if isinstance(items[0], dict) else None + if not isinstance(meta, dict): + log("No metadata found.") + return 0 + + mime = meta.get("mime") + size = meta.get("size") or meta.get("file_size") + duration_value = meta.get("duration") + inner = meta.get("metadata") if isinstance(meta.get("metadata"), dict) else None + if duration_value is None and isinstance(inner, dict): + duration_value = inner.get("duration") + + try: + from .search_file import _format_size as _fmt_size + from .search_file import _format_duration as _fmt_dur + from .search_file import _hydrus_duration_seconds as _dur_secs + except Exception: + _fmt_size = lambda x: str(x) if x is not None else "" + _dur_secs = lambda x: x + _fmt_dur = lambda x: str(x) if x is not None else "" + + dur_seconds = _dur_secs(duration_value) + dur_label = _fmt_dur(dur_seconds) if dur_seconds is not None else 
"" + size_label = _fmt_size(size) + + # Display Hydrus file metadata + log(f"PATH: hydrus://file/{hash_hex}") + log(f"Hash: {hash_hex}") + if mime: + log(f"MIME: {mime}") + if dur_label: + log(f"Duration: {dur_label}") + if size_label: + log(f"Size: {size_label}") + + urls = meta.get("known_urls") or meta.get("urls") + if isinstance(urls, list) and urls: + log("URLs:") + for url in urls: + try: + text = str(url).strip() + except Exception: + text = "" + if text: + log(f" {text}") + + return 0 + + +CMDLET = Cmdlet( + name="get-metadata", + summary="Print metadata for local or Hydrus files (hash, mime, duration, size, URLs).", + usage="get-metadata [-hash ]", + aliases=["meta"], + args=[ + CmdletArg("hash", description="Override the Hydrus file hash (SHA256) to target instead of the selected result."), + ], + details=[ + "- For local files: Shows path, hash (computed if needed), MIME type, size, duration, and known URLs from sidecar.", + "- For Hydrus files: Shows path (hydrus://), hash, MIME, duration, size, and known URLs.", + "- Automatically detects local vs Hydrus files.", + "- Local file hashes are computed via SHA256 if not already available.", + ], +) diff --git a/cmdlets/get_note.py b/cmdlets/get_note.py new file mode 100644 index 0000000..6acc920 --- /dev/null +++ b/cmdlets/get_note.py @@ -0,0 +1,87 @@ +from __future__ import annotations + +from typing import Any, Dict, Sequence +import json + +from . import register +import models +import pipeline as ctx +from helper import hydrus as hydrus_wrapper +from ._shared import Cmdlet, CmdletArg, normalize_hash +from helper.logger import log + +CMDLET = Cmdlet( + name="get-note", + summary="List notes on a Hydrus file.", + usage="get-note [-hash ]", + args=[ + CmdletArg("-hash", description="Override the Hydrus file hash (SHA256) to target instead of the selected result."), + ], + details=[ + "- Prints notes by service and note name.", + ], +) + + +@register(["get-note", "get-notes", "get_note"]) # aliases +def get_notes(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: + # Helper to get field from both dict and object + def get_field(obj: Any, field: str, default: Any = None) -> Any: + if isinstance(obj, dict): + return obj.get(field, default) + else: + return getattr(obj, field, default) + + # Help + try: + if any(str(a).lower() in {"-?", "/?", "--help", "-h", "help", "--cmdlet"} for a in args): + log(json.dumps(CMDLET, ensure_ascii=False, indent=2)) + return 0 + except Exception: + pass + + from ._shared import parse_cmdlet_args + parsed = parse_cmdlet_args(args, CMDLET) + override_hash = parsed.get("hash") + + hash_hex = normalize_hash(override_hash) if override_hash else normalize_hash(get_field(result, "hash_hex", None)) + if not hash_hex: + log("Selected result does not include a Hydrus hash") + return 1 + try: + client = hydrus_wrapper.get_client(config) + except Exception as exc: + log(f"Hydrus client unavailable: {exc}") + return 1 + + if client is None: + log("Hydrus client unavailable") + return 1 + try: + payload = client.fetch_file_metadata(hashes=[hash_hex], include_service_keys_to_tags=False, include_notes=True) + except Exception as exc: + log(f"Hydrus metadata fetch failed: {exc}") + return 1 + items = payload.get("metadata") if isinstance(payload, dict) else None + meta = items[0] if (isinstance(items, list) and items and isinstance(items[0], dict)) else None + notes = {} + if isinstance(meta, dict): + # Hydrus returns service_keys_to_tags; for notes we expect 'service_names_to_notes' in modern API + 
notes = meta.get('notes') or meta.get('service_names_to_notes') or {} + if notes: + ctx.emit("Notes:") + # Print flattened: service -> (name: text) + if isinstance(notes, dict) and any(isinstance(v, dict) for v in notes.values()): + for svc, mapping in notes.items(): + ctx.emit(f"- {svc}:") + if isinstance(mapping, dict): + for k, v in mapping.items(): + ctx.emit(f" • {k}: {str(v).strip()}") + elif isinstance(notes, dict): + for k, v in notes.items(): + ctx.emit(f"- {k}: {str(v).strip()}") + else: + ctx.emit("No notes found.") + return 0 + + diff --git a/cmdlets/get_relationship.py b/cmdlets/get_relationship.py new file mode 100644 index 0000000..922b1eb --- /dev/null +++ b/cmdlets/get_relationship.py @@ -0,0 +1,240 @@ +from __future__ import annotations + +from typing import Any, Dict, Sequence, List, Optional +import json +import sys + +from helper.logger import log + +from . import register +import models +import pipeline as ctx +from helper import hydrus as hydrus_wrapper +from ._shared import Cmdlet, CmdletArg, normalize_hash, fmt_bytes + +CMDLET = Cmdlet( + name="get-relationship", + summary="Print Hydrus relationships for the selected file.", + usage="get-relationship [-hash ]", + args=[ + CmdletArg("-hash", description="Override the Hydrus file hash (SHA256) to target instead of the selected result."), + ], + details=[ + "- Lists relationship data as returned by Hydrus.", + ], +) + +@register(["get-rel", "get-relationship", "get-relationships", "get-file-relationships"]) # aliases +def _run(result: Any, _args: Sequence[str], config: Dict[str, Any]) -> int: + # Help + try: + if any(str(a).lower() in {"-?", "/?", "--help", "-h", "help", "--cmdlet"} for a in _args): + log(json.dumps(CMDLET, ensure_ascii=False, indent=2)) + return 0 + except Exception: + pass + + # Parse -hash override + override_hash: str | None = None + args_list = list(_args) + i = 0 + while i < len(args_list): + a = args_list[i] + low = str(a).lower() + if low in {"-hash", "--hash", "hash"} and i + 1 < len(args_list): + override_hash = str(args_list[i + 1]).strip() + break + i += 1 + + # Handle @N selection which creates a list - extract the first item + if isinstance(result, list) and len(result) > 0: + result = result[0] + + hash_hex = normalize_hash(override_hash) if override_hash else normalize_hash(getattr(result, "hash_hex", None)) + if not hash_hex: + log("Selected result does not include a Hydrus hash", file=sys.stderr) + return 1 + try: + client = hydrus_wrapper.get_client(config) + except Exception as exc: + log(f"Hydrus client unavailable: {exc}", file=sys.stderr) + return 1 + + if client is None: + log("Hydrus client unavailable", file=sys.stderr) + return 1 + try: + rel = client.get_file_relationships(hash_hex) + except Exception as exc: + log(f"Hydrus relationships fetch failed: {exc}", file=sys.stderr) + return 1 + if not rel: + log("No relationships found.") + return 0 + + # Extract file_relationships from response + file_rels = rel.get("file_relationships", {}) + if not file_rels: + log("No relationships found.") + return 0 + + # Get the relationships dict for this specific hash + this_file_rels = file_rels.get(hash_hex) + if not this_file_rels: + log("No relationships found.") + return 0 + + # Extract related hashes from all relationship types + # Keys "0", "1", "3", "8" are relationship type IDs + # Values are lists of hashes + related_hashes = [] + for rel_type_id, hash_list in this_file_rels.items(): + # Skip non-numeric keys and metadata keys + if rel_type_id in {"is_king", "king", 
"king_is_on_file_domain", "king_is_local"}: + continue + if isinstance(hash_list, list): + for rel_hash in hash_list: + if isinstance(rel_hash, str) and rel_hash and rel_hash != hash_hex: + related_hashes.append(rel_hash) + + # Remove duplicates while preserving order + seen = set() + unique_hashes = [] + for h in related_hashes: + if h not in seen: + seen.add(h) + unique_hashes.append(h) + + if not unique_hashes: + log("No related files found.") + return 0 + + # Fetch metadata for all related files + try: + metadata_payload = client.fetch_file_metadata( + hashes=unique_hashes, + include_service_keys_to_tags=True, + include_duration=True, + include_size=True, + include_mime=True, + ) + except Exception as exc: + log(f"Hydrus metadata fetch failed: {exc}", file=sys.stderr) + return 1 + + metadata_list = metadata_payload.get("metadata") if isinstance(metadata_payload, dict) else None + if not isinstance(metadata_list, list): + log("Hydrus metadata response was not a list", file=sys.stderr) + return 1 + + # Build metadata map by hash + meta_by_hash: Dict[str, Dict[str, Any]] = {} + for item in metadata_list: + if isinstance(item, dict): + item_hash = normalize_hash(item.get("hash")) + if item_hash: + meta_by_hash[item_hash] = item + + # Helper functions for formatting + def _format_duration(seconds: Optional[float]) -> str: + if seconds is None: + return "" + try: + s = int(seconds) + hours = s // 3600 + minutes = (s % 3600) // 60 + secs = s % 60 + if hours > 0: + return f"{hours}:{minutes:02d}:{secs:02d}" + else: + return f"{minutes}:{secs:02d}" + except Exception: + return "" + + def _get_title(meta: Dict[str, Any]) -> str: + # Try to extract title from tags + tags_payload = meta.get("tags") + if isinstance(tags_payload, dict): + for service_data in tags_payload.values(): + if isinstance(service_data, dict): + storage_tags = service_data.get("storage_tags") + if isinstance(storage_tags, dict): + for tag_list in storage_tags.values(): + if isinstance(tag_list, list): + for tag in tag_list: + tag_str = str(tag).lower() + if tag_str.startswith("title:"): + return str(tag)[6:].strip() + # Fallback to hash prefix + h = meta.get("hash") + return str(h)[:12] if h else "unknown" + + def _get_mime_type(meta: Dict[str, Any]) -> str: + mime = meta.get("mime", "") + if not mime: + return "" + # Extract type from mime (e.g., "video/mp4" -> "video") + parts = str(mime).split("/") + return parts[0] if parts else "" + + # Print header and separator + log("# | Title | Type | Duration | Size") + log("--+---------------------------+-------+----------+--------") + + # Create result objects for each related file + results: List[Any] = [] + + # Print each related file + for idx, rel_hash in enumerate(unique_hashes, start=1): + meta = meta_by_hash.get(rel_hash) + if not meta: + continue + + title = _get_title(meta) + mime_type = _get_mime_type(meta) + + # Get duration + duration_value = meta.get("duration") + if duration_value is None and isinstance(meta.get("metadata"), dict): + duration_value = meta["metadata"].get("duration") + duration_str = _format_duration(duration_value) + + # Get size + size = meta.get("size") or meta.get("file_size") + size_str = fmt_bytes(size) if size else "" + + # Format and print row + title_display = title[:25].ljust(25) + type_display = mime_type[:5].ljust(5) + duration_display = duration_str[:8].ljust(8) + size_display = size_str[:7].ljust(7) + + log(f"{idx:2d} | {title_display} | {type_display} | {duration_display} | {size_display}") + + # Create result object for pipeline + 
result_obj = type("RelatedFile", (), { + "hash_hex": rel_hash, + "title": title, + "media_kind": mime_type or "other", + "size": size, + "duration": duration_value, + "known_urls": [], + "annotations": [], + "columns": [ + ("Title", title), + ("Type", mime_type), + ("Duration", duration_str), + ("Size", size_str), + ], + })() + results.append(result_obj) + + # Emit results to pipeline + try: + ctx._PIPE_EMITS.extend(results) + except Exception: + pass + + return 0 + + diff --git a/cmdlets/get_tag.py b/cmdlets/get_tag.py new file mode 100644 index 0000000..b0c6b27 --- /dev/null +++ b/cmdlets/get_tag.py @@ -0,0 +1,1191 @@ +"""Get tags from Hydrus or local sidecar metadata. + +This cmdlet retrieves tags for a selected result, supporting both: +- Hydrus Network (for files with hash_hex) +- Local sidecar files (.tags) + +In interactive mode: navigate with numbers, add/delete tags +In pipeline mode: display tags as read-only table, emit as structured JSON +""" + +from __future__ import annotations + +import sys + +from helper.logger import log +import subprocess +from pathlib import Path +from typing import Any, Dict, List, Optional, Sequence, Tuple + +import pipeline as ctx +from helper import hydrus +from helper.local_library import read_sidecar, write_sidecar, find_sidecar, LocalLibraryDB +from ._shared import normalize_hash, Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args +from config import get_local_storage_path + + +try: + from metadata import extract_title +except ImportError: + extract_title = None + + + + + +# Tag item for ResultTable display and piping +from dataclasses import dataclass + +@dataclass +class TagItem: + """Tag item for display in ResultTable and piping to other cmdlets. + + Allows tags to be selected and piped like: + - delete-tag @{3,4,9} (delete tags at indices 3, 4, 9) + - add-tag @"namespace:value" (add this tag) + """ + tag_name: str + tag_index: int # 1-based index for user reference + hash_hex: Optional[str] = None + source: str = "hydrus" + service_name: Optional[str] = None + + def __post_init__(self): + # Make ResultTable happy by adding standard fields + # NOTE: Don't set 'title' - we want only the tag column in ResultTable + self.origin = self.source + self.detail = f"Tag #{self.tag_index}" + self.target = self.tag_name + self.media_kind = "tag" + + def to_dict(self) -> Dict[str, Any]: + """Convert to dict for JSON serialization.""" + return { + "tag_name": self.tag_name, + "tag_index": self.tag_index, + "hash_hex": self.hash_hex, + "source": self.source, + "service_name": self.service_name, + } + + +def _extract_my_tags_from_hydrus_meta(meta: Dict[str, Any], service_key: Optional[str], service_name: str) -> List[str]: + """Extract current tags from Hydrus metadata dict. + + Prefers display_tags (includes siblings/parents, excludes deleted). + Falls back to storage_tags status '0' (current). 
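+    Shape consumed here (simplified): meta["tags"][service_key]["display_tags"] is a
+    list of tag strings, while storage_tags maps status keys ("0" = current) to lists.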
+ """ + tags_payload = meta.get("tags") + if not isinstance(tags_payload, dict): + return [] + svc_data = None + if service_key: + svc_data = tags_payload.get(service_key) + if not isinstance(svc_data, dict): + return [] + # Prefer display_tags (Hydrus computes siblings/parents) + display = svc_data.get("display_tags") + if isinstance(display, list) and display: + return [str(t) for t in display if isinstance(t, (str, bytes)) and str(t).strip()] + # Fallback to storage_tags status '0' (current) + storage = svc_data.get("storage_tags") + if isinstance(storage, dict): + current_list = storage.get("0") or storage.get(0) + if isinstance(current_list, list): + return [str(t) for t in current_list if isinstance(t, (str, bytes)) and str(t).strip()] + return [] + + +def _emit_tags_as_table( + tags_list: List[str], + hash_hex: Optional[str], + source: str = "hydrus", + service_name: Optional[str] = None, + config: Dict[str, Any] = None +) -> None: + """Emit tags as TagItem objects and display via ResultTable. + + This replaces _print_tag_list to make tags pipe-able. + Stores the table in ctx._LAST_RESULT_TABLE for downstream @ selection. + """ + from result_table import ResultTable + + # Create ResultTable with just tag column (no title) + table = ResultTable("Tags", max_columns=1) + table.set_source_command("get-tag", []) + + # Create TagItem for each tag + tag_items = [] + for idx, tag_name in enumerate(tags_list, start=1): + tag_item = TagItem( + tag_name=tag_name, + tag_index=idx, + hash_hex=hash_hex, + source=source, + service_name=service_name, + ) + tag_items.append(tag_item) + table.add_result(tag_item) + # Also emit to pipeline for downstream processing + ctx.emit(tag_item) + + # Store the table and items in history so @.. works to go back + # Use overlay mode so it doesn't push the previous search to history stack + # This makes get-tag behave like a transient view + try: + ctx.set_last_result_table_overlay(table, tag_items) + except AttributeError: + ctx.set_last_result_table(table, tag_items) + # Note: CLI will handle displaying the table via ResultTable formatting +def _summarize_tags(tags_list: List[str], limit: int = 8) -> str: + """Create a summary of tags for display.""" + shown = [t for t in tags_list[:limit] if t] + summary = ", ".join(shown) + remaining = max(0, len(tags_list) - len(shown)) + if remaining > 0: + summary = f"{summary} (+{remaining} more)" if summary else f"(+{remaining} more)" + if len(summary) > 200: + summary = summary[:197] + "..." + return summary + + +def _extract_title_from(tags_list: List[str]) -> Optional[str]: + """Extract title from tags list.""" + if extract_title: + try: + return extract_title(tags_list) + except Exception: + pass + for t in tags_list: + if isinstance(t, str) and t.lower().startswith("title:"): + val = t.split(":", 1)[1].strip() + if val: + return val + return None + + +def _rename_file_if_title_tag(media: Optional[Path], tags_added: List[str]) -> bool: + """Rename a local file if title: tag was added. + + Returns True if file was renamed, False otherwise. 
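+    The first title: tag found is used; characters that are invalid in filenames are
+    stripped, the original extension is kept, and a matching .tags sidecar is renamed too.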
+ """ + if not media or not tags_added: + return False + + # Check if any of the added tags is a title: tag + title_value = None + for tag in tags_added: + if isinstance(tag, str): + lower_tag = tag.lower() + if lower_tag.startswith("title:"): + title_value = tag.split(":", 1)[1].strip() + break + + if not title_value: + return False + + try: + # Get current file path + file_path = media + if not file_path.exists(): + return False + + # Parse file path + dir_path = file_path.parent + old_name = file_path.name + + # Get file extension + suffix = file_path.suffix or '' + + # Sanitize title for use as filename + import re + safe_title = re.sub(r'[<>:"/\\|?*]', '', title_value).strip() + if not safe_title: + return False + + new_name = safe_title + suffix + new_file_path = dir_path / new_name + + if new_file_path == file_path: + return False + + # Build sidecar paths BEFORE renaming the file + old_sidecar = Path(str(file_path) + '.tags') + new_sidecar = Path(str(new_file_path) + '.tags') + + # Rename file + try: + file_path.rename(new_file_path) + log(f"Renamed file: {old_name} → {new_name}") + + # Rename .tags sidecar if it exists + if old_sidecar.exists(): + try: + old_sidecar.rename(new_sidecar) + log(f"Renamed sidecar: {old_name}.tags → {new_name}.tags") + except Exception as e: + log(f"Failed to rename sidecar: {e}", file=sys.stderr) + + return True + except Exception as e: + log(f"Failed to rename file: {e}", file=sys.stderr) + return False + except Exception as e: + log(f"Error during file rename: {e}", file=sys.stderr) + return False + + +def _apply_result_updates_from_tags(result: Any, tag_list: List[str]) -> None: + """Update result object with title and tag summary from tags.""" + try: + new_title = _extract_title_from(tag_list) + if new_title: + setattr(result, "title", new_title) + setattr(result, "tag_summary", _summarize_tags(tag_list)) + except Exception: + pass + + +def _handle_title_rename(old_path: Path, tags_list: List[str]) -> Optional[Path]: + """If a title: tag is present, rename the file and its .tags sidecar to match. + + Returns the new path if renamed, otherwise returns None. 
+ """ + # Extract title from tags + new_title = None + for tag in tags_list: + if isinstance(tag, str) and tag.lower().startswith('title:'): + new_title = tag.split(':', 1)[1].strip() + break + + if not new_title or not old_path.exists(): + return None + + try: + # Build new filename with same extension + old_name = old_path.name + old_suffix = old_path.suffix + + # Create new filename: title + extension + new_name = f"{new_title}{old_suffix}" + new_path = old_path.parent / new_name + + # Don't rename if already the same name + if new_path == old_path: + return None + + # Rename the main file + if new_path.exists(): + log(f"Warning: Target filename already exists: {new_name}", file=sys.stderr) + return None + + old_path.rename(new_path) + log(f"Renamed file: {old_name} → {new_name}", file=sys.stderr) + + # Rename the .tags sidecar if it exists + old_tags_path = old_path.parent / (old_name + '.tags') + if old_tags_path.exists(): + new_tags_path = old_path.parent / (new_name + '.tags') + if new_tags_path.exists(): + log(f"Warning: Target sidecar already exists: {new_tags_path.name}", file=sys.stderr) + else: + old_tags_path.rename(new_tags_path) + log(f"Renamed sidecar: {old_tags_path.name} → {new_tags_path.name}", file=sys.stderr) + + return new_path + except Exception as exc: + log(f"Warning: Failed to rename file: {exc}", file=sys.stderr) + return None + + + +def _read_sidecar_fallback(p: Path) -> tuple[Optional[str], List[str], List[str]]: + """Fallback sidecar reader if metadata module unavailable. + + Format: + - Lines with "hash:" prefix: file hash + - Lines with "known_url:" or "url:" prefix: URLs + - Lines with "relationship:" prefix: ignored (internal relationships) + - Lines with "key:", "namespace:value" format: treated as namespace tags + - Plain lines without colons: freeform tags + + Excluded namespaces (treated as metadata, not tags): hash, known_url, url, relationship + """ + try: + raw = p.read_text(encoding="utf-8", errors="ignore") + except OSError: + return None, [], [] + t: List[str] = [] + u: List[str] = [] + h: Optional[str] = None + + # Namespaces to exclude from tags + excluded_namespaces = {"hash", "known_url", "url", "relationship"} + + for line in raw.splitlines(): + s = line.strip() + if not s: + continue + low = s.lower() + + # Check if this is a hash line + if low.startswith("hash:"): + h = s.split(":", 1)[1].strip() if ":" in s else h + # Check if this is a URL line + elif low.startswith("known_url:") or low.startswith("url:"): + val = s.split(":", 1)[1].strip() if ":" in s else "" + if val: + u.append(val) + # Check if this is an excluded namespace + elif ":" in s: + namespace = s.split(":", 1)[0].strip().lower() + if namespace not in excluded_namespaces: + # Include as namespace tag (e.g., "title: The Freemasons") + t.append(s) + else: + # Plain text without colon = freeform tag + t.append(s) + + return h, t, u + + +def _write_sidecar(p: Path, media: Path, tag_list: List[str], known_urls: List[str], hash_in_sidecar: Optional[str]) -> Path: + """Write tags to sidecar file and handle title-based renaming. + + Returns the new media path if renamed, otherwise returns the original media path. 
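+    Fallback format (used when the shared writer fails): an optional "hash:" line,
+    then one tag per line, then "known_url:" lines.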
+ """ + success = write_sidecar(media, tag_list, known_urls, hash_in_sidecar) + if success: + _apply_result_updates_from_tags(None, tag_list) + # Check if we should rename the file based on title tag + new_media = _handle_title_rename(media, tag_list) + if new_media: + return new_media + return media + + # Fallback writer + ordered = [s for s in tag_list if s and s.strip()] + lines = [] + if hash_in_sidecar: + lines.append(f"hash:{hash_in_sidecar}") + lines.extend(ordered) + for u in known_urls: + lines.append(f"known_url:{u}") + try: + p.write_text("\n".join(lines) + "\n", encoding="utf-8") + # Check if we should rename the file based on title tag + new_media = _handle_title_rename(media, tag_list) + if new_media: + return new_media + return media + except OSError as exc: + log(f"Failed to write sidecar: {exc}", file=sys.stderr) + return media + + +def _emit_tag_payload(source: str, tags_list: List[str], *, hash_value: Optional[str], extra: Optional[Dict[str, Any]] = None, store_label: Optional[str] = None) -> int: + """Emit tags as structured payload to pipeline. + + Also emits individual tag objects to _PIPELINE_LAST_ITEMS so they can be selected by index. + """ + payload: Dict[str, Any] = { + "source": source, + "tags": list(tags_list), + "count": len(tags_list), + } + if hash_value: + payload["hash"] = hash_value + if extra: + for key, value in extra.items(): + if value is not None: + payload[key] = value + label = None + if store_label: + label = store_label + elif ctx._PIPE_ACTIVE: + label = "tags" + if label: + ctx.store_value(label, payload) + if ctx._PIPE_ACTIVE and label.lower() != "tags": + ctx.store_value("tags", payload) + + # Emit individual TagItem objects so they can be selected by bare index + # When in pipeline, emit individual TagItem objects + if ctx._PIPE_ACTIVE: + for idx, tag_name in enumerate(tags_list, start=1): + tag_item = TagItem( + tag_name=tag_name, + tag_index=idx, + hash_hex=hash_value, + source=source, + service_name=None + ) + ctx.emit(tag_item) + else: + # When not in pipeline, just emit the payload + ctx.emit(payload) + + return 0 + + + +def _extract_scrapable_identifiers(tags_list: List[str]) -> Dict[str, str]: + """Extract scrapable identifiers from tags.""" + identifiers = {} + scrapable_prefixes = {'openlibrary', 'isbn_10', 'isbn', 'musicbrainz', 'musicbrainzalbum', 'imdb', 'tmdb', 'tvdb'} + + for tag in tags_list: + if not isinstance(tag, str) or ':' not in tag: + continue + + parts = tag.split(':', 1) + if len(parts) != 2: + continue + + key = parts[0].strip().lower() + value = parts[1].strip() + + if key in scrapable_prefixes and value: + identifiers[key] = value + + return identifiers + + +def _scrape_url_metadata(url: str) -> Tuple[Optional[str], List[str], List[Tuple[str, str]], List[Dict[str, Any]]]: + """Scrape metadata from a URL using yt-dlp. + + Returns: + (title, tags, formats, playlist_items) tuple where: + - title: Video/content title + - tags: List of extracted tags (both namespaced and freeform) + - formats: List of (display_label, format_id) tuples + - playlist_items: List of playlist entry dicts (empty if not a playlist) + """ + try: + import json as json_module + + try: + from metadata import extract_ytdlp_tags + except ImportError: + extract_ytdlp_tags = None + + # Build yt-dlp command with playlist support + # IMPORTANT: Do NOT use --flat-playlist! 
It strips metadata like artist, album, uploader, genre + # Without it, yt-dlp gives us full metadata in an 'entries' array within a single JSON object + # This ensures we get album-level metadata from sources like BandCamp, YouTube Music, etc. + cmd = [ + "yt-dlp", + "-j", # Output JSON + "--no-warnings", + "--playlist-items", "1-10", # Get first 10 items if it's a playlist (provides entries) + "-f", "best", + url + ] + + result = subprocess.run(cmd, capture_output=True, text=True, timeout=30) + + if result.returncode != 0: + log(f"yt-dlp error: {result.stderr}", file=sys.stderr) + return None, [], [], [] + + # Parse JSON output - WITHOUT --flat-playlist, we get ONE JSON object with 'entries' array + # This gives us full metadata instead of flat format + lines = result.stdout.strip().split('\n') + if not lines or not lines[0]: + log("yt-dlp returned empty output", file=sys.stderr) + return None, [], [], [] + + # Parse the single JSON object + try: + data = json_module.loads(lines[0]) + except json_module.JSONDecodeError as e: + log(f"Failed to parse yt-dlp JSON: {e}", file=sys.stderr) + return None, [], [], [] + + # Extract title - use the main title + title = data.get('title', 'Unknown') + + # Determine if this is a playlist/album (has entries array) + # is_playlist = 'entries' in data and isinstance(data.get('entries'), list) + + # Extract tags and playlist items + tags = [] + playlist_items = [] + + # IMPORTANT: Extract album/playlist-level tags FIRST (before processing entries) + # This ensures we get metadata about the collection, not just individual tracks + if extract_ytdlp_tags: + album_tags = extract_ytdlp_tags(data) + tags.extend(album_tags) + + # Case 1: Entries are nested in the main object (standard playlist structure) + if 'entries' in data and isinstance(data.get('entries'), list): + entries = data['entries'] + # Build playlist items with title and duration + for idx, entry in enumerate(entries, 1): + if isinstance(entry, dict): + item_title = entry.get('title', entry.get('id', f'Track {idx}')) + item_duration = entry.get('duration', 0) + playlist_items.append({ + 'index': idx, + 'id': entry.get('id', f'track_{idx}'), + 'title': item_title, + 'duration': item_duration, + 'url': entry.get('url') or entry.get('webpage_url', ''), + }) + + # Extract tags from each entry and merge (but don't duplicate album-level tags) + # Only merge entry tags that are multi-value prefixes (not single-value like title:, artist:, etc.) + if extract_ytdlp_tags: + entry_tags = extract_ytdlp_tags(entry) + + # Single-value namespaces that should not be duplicated from entries + single_value_namespaces = {'title', 'artist', 'album', 'creator', 'channel', 'release_date', 'upload_date', 'license', 'location'} + + for tag in entry_tags: + # Extract the namespace (part before the colon) + tag_namespace = tag.split(':', 1)[0].lower() if ':' in tag else None + + # Skip if this namespace already exists in tags (from album level) + if tag_namespace and tag_namespace in single_value_namespaces: + # Check if any tag with this namespace already exists in tags + already_has_namespace = any( + t.split(':', 1)[0].lower() == tag_namespace + for t in tags if ':' in t + ) + if already_has_namespace: + continue # Skip this tag, keep the album-level one + + if tag not in tags: # Avoid exact duplicates + tags.append(tag) + + # Case 2: Playlist detected by playlist_count field (BandCamp albums, etc.) 
+ # These need a separate call with --flat-playlist to get the actual entries + elif (data.get('playlist_count') or 0) > 0 and 'entries' not in data: + try: + # Make a second call with --flat-playlist to get the actual tracks + flat_cmd = [ + "yt-dlp", + "-j", + "--no-warnings", + "--flat-playlist", + "-f", "best", + url + ] + flat_result = subprocess.run(flat_cmd, capture_output=True, text=True, timeout=30) + if flat_result.returncode == 0: + flat_lines = flat_result.stdout.strip().split('\n') + # With --flat-playlist, each line is a separate track JSON object + # (not nested in a playlist container), so process ALL lines + for idx, line in enumerate(flat_lines, 1): + if line.strip().startswith('{'): + try: + entry = json_module.loads(line) + item_title = entry.get('title', entry.get('id', f'Track {idx}')) + item_duration = entry.get('duration', 0) + playlist_items.append({ + 'index': idx, + 'id': entry.get('id', f'track_{idx}'), + 'title': item_title, + 'duration': item_duration, + 'url': entry.get('url') or entry.get('webpage_url', ''), + }) + except json_module.JSONDecodeError: + pass + except Exception as e: + pass # Silently ignore if we can't get playlist entries + + + # Fallback: if still no tags detected, get from first item + if not tags and extract_ytdlp_tags: + tags = extract_ytdlp_tags(data) + + # Extract formats from the main data object + formats = [] + if 'formats' in data: + formats = _extract_url_formats(data.get('formats', [])) + + # Deduplicate tags by namespace to prevent duplicate title:, artist:, etc. + try: + from metadata import dedup_tags_by_namespace as _dedup + if _dedup: + tags = _dedup(tags, keep_first=True) + except Exception: + pass # If dedup fails, return tags as-is + + return title, tags, formats, playlist_items + + except subprocess.TimeoutExpired: + log("yt-dlp timeout (>30s)", file=sys.stderr) + return None, [], [], [] + except Exception as e: + log(f"URL scraping error: {e}", file=sys.stderr) + return None, [], [], [] + + +def _extract_url_formats(formats: list) -> List[Tuple[str, str]]: + """Extract best formats from yt-dlp formats list. + + Returns list of (display_label, format_id) tuples. 
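+
+    Illustrative example (a sketch, not real yt-dlp output; only the keys shown
+    here are consulted by this helper):
+
+        >>> _extract_url_formats([
+        ...     {'format_id': '22', 'ext': 'mp4', 'vcodec': 'avc1',
+        ...      'acodec': 'mp4a', 'height': 720, 'tbr': 1200},
+        ...     {'format_id': '140', 'ext': 'm4a', 'vcodec': 'none',
+        ...      'acodec': 'mp4a', 'abr': 128},
+        ... ])
+        [('720p (mp4)', '22'), ('audio (m4a)', '140')]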
+ """ + try: + video_formats = {} # {resolution: format_data} + audio_formats = {} # {quality_label: format_data} + + for fmt in formats: + vcodec = fmt.get('vcodec', 'none') + acodec = fmt.get('acodec', 'none') + height = fmt.get('height') + ext = fmt.get('ext', 'unknown') + format_id = fmt.get('format_id', '') + tbr = fmt.get('tbr', 0) + abr = fmt.get('abr', 0) + + # Video format + if vcodec and vcodec != 'none' and height: + if height < 480: + continue + res_key = f"{height}p" + if res_key not in video_formats or tbr > video_formats[res_key].get('tbr', 0): + video_formats[res_key] = { + 'label': f"{height}p ({ext})", + 'format_id': format_id, + 'tbr': tbr, + } + + # Audio-only format + elif acodec and acodec != 'none' and (not vcodec or vcodec == 'none'): + audio_key = f"audio_{abr}" + if audio_key not in audio_formats or abr > audio_formats[audio_key].get('abr', 0): + audio_formats[audio_key] = { + 'label': f"audio ({ext})", + 'format_id': format_id, + 'abr': abr, + } + + result = [] + + # Add video formats in descending resolution order + for res in sorted(video_formats.keys(), key=lambda x: int(x.replace('p', '')), reverse=True): + fmt = video_formats[res] + result.append((fmt['label'], fmt['format_id'])) + + # Add best audio format + if audio_formats: + best_audio = max(audio_formats.values(), key=lambda x: x.get('abr', 0)) + result.append((best_audio['label'], best_audio['format_id'])) + + return result + + except Exception as e: + log(f"Error extracting formats: {e}", file=sys.stderr) + return [] + + +def _scrape_isbn_metadata(isbn: str) -> List[str]: + """Scrape metadata for an ISBN using Open Library API.""" + new_tags = [] + try: + from ..helper.http_client import HTTPClient + import json as json_module + + isbn_clean = isbn.replace('-', '').strip() + url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json" + + try: + with HTTPClient() as client: + response = client.get(url) + response.raise_for_status() + data = json_module.loads(response.content.decode('utf-8')) + except Exception as e: + log(f"Failed to fetch ISBN metadata: {e}", file=sys.stderr) + return [] + + if not data: + log(f"No ISBN metadata found for: {isbn}") + return [] + + book_data = next(iter(data.values()), None) + if not book_data: + return [] + + if 'title' in book_data: + new_tags.append(f"title:{book_data['title']}") + + if 'authors' in book_data and isinstance(book_data['authors'], list): + for author in book_data['authors'][:3]: + if 'name' in author: + new_tags.append(f"author:{author['name']}") + + if 'publish_date' in book_data: + new_tags.append(f"publish_date:{book_data['publish_date']}") + + if 'publishers' in book_data and isinstance(book_data['publishers'], list): + for pub in book_data['publishers'][:1]: + if 'name' in pub: + new_tags.append(f"publisher:{pub['name']}") + + if 'description' in book_data: + desc = book_data['description'] + if isinstance(desc, dict) and 'value' in desc: + desc = desc['value'] + if desc: + desc_str = str(desc).strip() + # Include description if available (limit to 200 chars to keep it manageable) + if len(desc_str) > 0: + new_tags.append(f"description:{desc_str[:200]}") + + if 'number_of_pages' in book_data: + page_count = book_data['number_of_pages'] + if page_count and isinstance(page_count, int) and page_count > 0: + new_tags.append(f"pages:{page_count}") + + if 'identifiers' in book_data and isinstance(book_data['identifiers'], dict): + identifiers = book_data['identifiers'] + + if 'openlibrary' in identifiers: + ol_ids = 
identifiers['openlibrary'] + if isinstance(ol_ids, list) and ol_ids: + new_tags.append(f"openlibrary:{ol_ids[0]}") + elif isinstance(ol_ids, str): + new_tags.append(f"openlibrary:{ol_ids}") + + if 'lccn' in identifiers: + lccn_list = identifiers['lccn'] + if isinstance(lccn_list, list) and lccn_list: + new_tags.append(f"lccn:{lccn_list[0]}") + elif isinstance(lccn_list, str): + new_tags.append(f"lccn:{lccn_list}") + + if 'oclc' in identifiers: + oclc_list = identifiers['oclc'] + if isinstance(oclc_list, list) and oclc_list: + new_tags.append(f"oclc:{oclc_list[0]}") + elif isinstance(oclc_list, str): + new_tags.append(f"oclc:{oclc_list}") + + if 'goodreads' in identifiers: + goodreads_list = identifiers['goodreads'] + if isinstance(goodreads_list, list) and goodreads_list: + new_tags.append(f"goodreads:{goodreads_list[0]}") + elif isinstance(goodreads_list, str): + new_tags.append(f"goodreads:{goodreads_list}") + + if 'librarything' in identifiers: + lt_list = identifiers['librarything'] + if isinstance(lt_list, list) and lt_list: + new_tags.append(f"librarything:{lt_list[0]}") + elif isinstance(lt_list, str): + new_tags.append(f"librarything:{lt_list}") + + if 'doi' in identifiers: + doi_list = identifiers['doi'] + if isinstance(doi_list, list) and doi_list: + new_tags.append(f"doi:{doi_list[0]}") + elif isinstance(doi_list, str): + new_tags.append(f"doi:{doi_list}") + + if 'internet_archive' in identifiers: + ia_list = identifiers['internet_archive'] + if isinstance(ia_list, list) and ia_list: + new_tags.append(f"internet_archive:{ia_list[0]}") + elif isinstance(ia_list, str): + new_tags.append(f"internet_archive:{ia_list}") + + log(f"Found {len(new_tags)} tag(s) from ISBN lookup") + return new_tags + except Exception as e: + log(f"ISBN scraping error: {e}", file=sys.stderr) + return [] + + +def _scrape_openlibrary_metadata(olid: str) -> List[str]: + """Scrape metadata for an OpenLibrary ID using the .json API endpoint. + + Fetches from https://openlibrary.org/books/{OLID}.json and extracts: + - Title, authors, publish date, publishers + - Description + - Subjects as freeform tags (without namespace prefix) + - Identifiers (ISBN, LCCN, OCLC, etc.) 
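+
+    Illustrative call (a sketch; the exact tags returned depend entirely on the
+    OpenLibrary record):
+
+        _scrape_openlibrary_metadata("OL9674499M")
+        # -> ['title:<title>', 'author:<author>', 'publish_date:<date>',
+        #     '<subject>', ..., 'isbn_10:<isbn>']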
+ """ + new_tags = [] + try: + from ..helper.http_client import HTTPClient + import json as json_module + + # Format: OL9674499M or just 9674499M + olid_clean = olid.replace('OL', '').replace('M', '') + if not olid_clean.isdigit(): + olid_clean = olid + + # Ensure we have the full OLID format for the URL + if not olid.startswith('OL'): + url = f"https://openlibrary.org/books/OL{olid_clean}M.json" + else: + url = f"https://openlibrary.org/books/{olid}.json" + + try: + with HTTPClient() as client: + response = client.get(url) + response.raise_for_status() + data = json_module.loads(response.content.decode('utf-8')) + except Exception as e: + log(f"Failed to fetch OpenLibrary metadata: {e}", file=sys.stderr) + return [] + + if not data: + log(f"No OpenLibrary metadata found for: {olid}") + return [] + + # Add title + if 'title' in data: + new_tags.append(f"title:{data['title']}") + + # Add authors + if 'authors' in data and isinstance(data['authors'], list): + for author in data['authors'][:3]: + if isinstance(author, dict) and 'name' in author: + new_tags.append(f"author:{author['name']}") + elif isinstance(author, str): + new_tags.append(f"author:{author}") + + # Add publish date + if 'publish_date' in data: + new_tags.append(f"publish_date:{data['publish_date']}") + + # Add publishers + if 'publishers' in data and isinstance(data['publishers'], list): + for pub in data['publishers'][:1]: + if isinstance(pub, dict) and 'name' in pub: + new_tags.append(f"publisher:{pub['name']}") + elif isinstance(pub, str): + new_tags.append(f"publisher:{pub}") + + # Add description + if 'description' in data: + desc = data['description'] + if isinstance(desc, dict) and 'value' in desc: + desc = desc['value'] + if desc: + desc_str = str(desc).strip() + if len(desc_str) > 0: + new_tags.append(f"description:{desc_str[:200]}") + + # Add number of pages + if 'number_of_pages' in data: + page_count = data['number_of_pages'] + if page_count and isinstance(page_count, int) and page_count > 0: + new_tags.append(f"pages:{page_count}") + + # Add subjects as FREEFORM tags (no namespace prefix) + if 'subjects' in data and isinstance(data['subjects'], list): + for subject in data['subjects'][:10]: + if subject and isinstance(subject, str): + subject_clean = str(subject).strip() + if subject_clean and subject_clean not in new_tags: + new_tags.append(subject_clean) + + # Add identifiers + if 'identifiers' in data and isinstance(data['identifiers'], dict): + identifiers = data['identifiers'] + + if 'isbn_10' in identifiers: + isbn_10_list = identifiers['isbn_10'] + if isinstance(isbn_10_list, list) and isbn_10_list: + new_tags.append(f"isbn_10:{isbn_10_list[0]}") + elif isinstance(isbn_10_list, str): + new_tags.append(f"isbn_10:{isbn_10_list}") + + if 'isbn_13' in identifiers: + isbn_13_list = identifiers['isbn_13'] + if isinstance(isbn_13_list, list) and isbn_13_list: + new_tags.append(f"isbn_13:{isbn_13_list[0]}") + elif isinstance(isbn_13_list, str): + new_tags.append(f"isbn_13:{isbn_13_list}") + + if 'lccn' in identifiers: + lccn_list = identifiers['lccn'] + if isinstance(lccn_list, list) and lccn_list: + new_tags.append(f"lccn:{lccn_list[0]}") + elif isinstance(lccn_list, str): + new_tags.append(f"lccn:{lccn_list}") + + if 'oclc_numbers' in identifiers: + oclc_list = identifiers['oclc_numbers'] + if isinstance(oclc_list, list) and oclc_list: + new_tags.append(f"oclc:{oclc_list[0]}") + elif isinstance(oclc_list, str): + new_tags.append(f"oclc:{oclc_list}") + + if 'goodreads' in identifiers: + goodreads_list = 
identifiers['goodreads'] + if isinstance(goodreads_list, list) and goodreads_list: + new_tags.append(f"goodreads:{goodreads_list[0]}") + elif isinstance(goodreads_list, str): + new_tags.append(f"goodreads:{goodreads_list}") + + log(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup") + return new_tags + except Exception as e: + log(f"OpenLibrary scraping error: {e}", file=sys.stderr) + return [] + + +def _perform_scraping(tags_list: List[str]) -> List[str]: + """Perform scraping based on identifiers in tags. + + Priority order: + 1. openlibrary: (preferred - more complete metadata) + 2. isbn_10 or isbn (fallback) + """ + identifiers = _extract_scrapable_identifiers(tags_list) + + if not identifiers: + log("No scrapable identifiers found (openlibrary, ISBN, musicbrainz, imdb)") + return [] + + log(f"Found scrapable identifiers: {', '.join(identifiers.keys())}") + + new_tags = [] + + # Prefer OpenLibrary over ISBN (more complete metadata) + if 'openlibrary' in identifiers: + olid = identifiers['openlibrary'] + if olid: + log(f"Scraping OpenLibrary: {olid}") + new_tags.extend(_scrape_openlibrary_metadata(olid)) + elif 'isbn_10' in identifiers or 'isbn' in identifiers: + isbn = identifiers.get('isbn_10') or identifiers.get('isbn') + if isbn: + log(f"Scraping ISBN: {isbn}") + new_tags.extend(_scrape_isbn_metadata(isbn)) + + existing_tags_lower = {tag.lower() for tag in tags_list} + scraped_unique = [] + seen = set() + for tag in new_tags: + tag_lower = tag.lower() + if tag_lower not in existing_tags_lower and tag_lower not in seen: + scraped_unique.append(tag) + seen.add(tag_lower) + + if scraped_unique: + log(f"Added {len(scraped_unique)} new tag(s) from scraping") + + return scraped_unique + + +def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: + """Get tags from Hydrus, local sidecar, or URL metadata. 
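+
+    Tags are resolved in priority order: Hydrus (when reachable and a hash is
+    known), then the local library database, then a .tags sidecar next to the
+    file, and finally any tags already attached to the piped result object.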
+
+    Usage:
+        get-tag [-hash <hash>] [--store <key>] [--emit]
+        get-tag -scrape <url>
+
+    Options:
+        -hash <hash>: Override hash to use instead of result's hash_hex
+        --store <key>: Store result to this key for pipeline
+        --emit: Emit result without interactive prompt (quiet mode)
+        -scrape <url>: Scrape metadata from URL (returns tags as JSON)
+    """
+    # Helper to get field from both dict and object
+    def get_field(obj: Any, field: str, default: Any = None) -> Any:
+        if isinstance(obj, dict):
+            return obj.get(field, default)
+        else:
+            return getattr(obj, field, default)
+
+    # Parse arguments using shared parser
+    parsed_args = parse_cmdlet_args(args, CMDLET)
+
+    # Extract values
+    hash_override = normalize_hash(parsed_args.get("hash"))
+    store_key = parsed_args.get("store")
+    emit_requested = parsed_args.get("emit", False)
+    scrape_url = parsed_args.get("scrape")
+    scrape_requested = scrape_url is not None
+
+    # Handle URL scraping mode
+    if scrape_requested and scrape_url:
+        import json as json_module
+        # Don't print debug message - output should be JSON only for programmatic consumption
+        # logger.debug(f"Scraping URL: {scrape_url}")
+        title, tags, formats, playlist_items = _scrape_url_metadata(scrape_url)
+
+        if not tags:
+            log("No tags extracted from URL", file=sys.stderr)
+            return 1
+
+        # Build result object
+        # result_obj = TagItem("url_scrape", tag_index=0, hash_hex=None, source="url", service_name=None)
+        # result_obj.title = title or "URL Content"
+
+        # Emit tags as JSON for pipeline consumption (output should be pure JSON on stdout)
+        output = {
+            "title": title,
+            "tags": tags,
+            "formats": [(label, fmt_id) for label, fmt_id in formats],
+            "playlist_items": playlist_items,
+        }
+
+        # Use print() directly to stdout for JSON output (NOT log() which adds prefix)
+        # This ensures the output is capturable by the download modal and other pipelines
+        # The modal filters for lines starting with '{' so the prefix breaks parsing
+        print(json_module.dumps(output, ensure_ascii=False))
+        return 0
+
+    # If -scrape was requested but no URL, that's an error
+    if scrape_requested and not scrape_url:
+        log("-scrape requires a URL argument", file=sys.stderr)
+        return 1
+
+    # Handle @N selection which creates a list - extract the first item
+    if isinstance(result, list) and len(result) > 0:
+        result = result[0]
+
+    hash_from_result = normalize_hash(get_field(result, "hash_hex", None))
+    hash_hex = hash_override or hash_from_result
+    # Only use emit mode if explicitly requested with --emit flag, not just because we're in a pipeline
+    # This allows interactive REPL to work even in pipelines
+    emit_mode = emit_requested or bool(store_key)
+    store_label = (store_key.strip() if store_key and store_key.strip() else None)
+
+    # Check Hydrus availability
+    hydrus_available, _ = hydrus.is_available(config)
+
+    # Try to find path in result object
+    local_path = get_field(result, "target", None) or get_field(result, "path", None) or get_field(result, "file_path", None)
+
+    # Determine if local file
+    is_local_file = False
+    media: Optional[Path] = None
+    if local_path and isinstance(local_path, str) and not local_path.startswith(("http://", "https://")):
+        is_local_file = True
+        try:
+            media = Path(str(local_path))
+        except Exception:
+            media = None
+
+    # Try Hydrus first (always prioritize if available and has hash)
+    use_hydrus = False
+    hydrus_meta = None  # Cache the metadata from first fetch
+    if hash_hex and hydrus_available:
+        try:
+            client = hydrus.get_client(config)
+            payload = client.fetch_file_metadata(hashes=[str(hash_hex)], 
include_service_keys_to_tags=True, include_file_urls=False) + items = payload.get("metadata") if isinstance(payload, dict) else None + if isinstance(items, list) and items: + meta = items[0] if isinstance(items[0], dict) else None + # Only accept file if it has a valid file_id (not None) + if isinstance(meta, dict) and meta.get("file_id") is not None: + use_hydrus = True + hydrus_meta = meta # Cache for tag extraction + except Exception: + pass + + # Get tags - try Hydrus first, fallback to sidecar + current = [] + service_name = "" + service_key = None + source = "unknown" + + if use_hydrus and hash_hex and hydrus_meta: + try: + # Use cached metadata from above, don't fetch again + service_name = hydrus.get_tag_service_name(config) + client = hydrus.get_client(config) + service_key = hydrus.get_tag_service_key(client, service_name) + current = _extract_my_tags_from_hydrus_meta(hydrus_meta, service_key, service_name) + source = "hydrus" + except Exception as exc: + log(f"Warning: Failed to extract tags from Hydrus: {exc}", file=sys.stderr) + + # Fallback to local sidecar or local DB if no tags + if not current and is_local_file and media and media.exists(): + try: + # First try local library DB + library_root = get_local_storage_path(config) + if library_root: + try: + with LocalLibraryDB(library_root) as db: + db_tags = db.get_tags(media) + if db_tags: + current = db_tags + source = "local_db" + except Exception as exc: + log(f"[get_tag] DB lookup failed, trying sidecar: {exc}", file=sys.stderr) + + # Fall back to sidecar if DB didn't have tags + if not current: + sidecar_path = find_sidecar(media) + if sidecar_path and sidecar_path.exists(): + try: + _, current, _ = read_sidecar(sidecar_path) + except Exception: + _, current, _ = _read_sidecar_fallback(sidecar_path) + if current: + source = "sidecar" + except Exception as exc: + log(f"Warning: Failed to load tags from local storage: {exc}", file=sys.stderr) + + # Fallback to tags in the result object if Hydrus/local lookup returned nothing + if not current: + # Check if result has 'tags' attribute (PipeObject) + if hasattr(result, 'tags') and getattr(result, 'tags', None): + current = getattr(result, 'tags') + source = "pipeline_result" + # Check if result is a dict with 'tags' key + elif isinstance(result, dict) and 'tags' in result: + tags_val = result['tags'] + if isinstance(tags_val, list): + current = tags_val + source = "pipeline_result" + source = "pipeline_result" + + # Error if no tags found + if not current: + log("No tags found", file=sys.stderr) + return 1 + + # Always output to ResultTable (pipeline mode only) + if source == "hydrus": + _emit_tags_as_table(current, hash_hex=hash_hex, source="hydrus", service_name=service_name, config=config) + else: + _emit_tags_as_table(current, hash_hex=hash_hex, source="local", service_name=None, config=config) + + # If emit requested or store key provided, emit payload + if emit_mode: + _emit_tag_payload(source, current, hash_value=hash_hex, store_label=store_label) + + return 0 + + +CMDLET = Cmdlet( + name="get-tag", + summary="Get tags from Hydrus or local sidecar metadata", + usage="get-tag [-hash ] [--store ] [--emit] [-scrape ]", + aliases=["tags"], + args=[ + SharedArgs.HASH, + CmdletArg( + name="-store", + type="string", + description="Store result to this key for pipeline", + alias="store" + ), + CmdletArg( + name="-emit", + type="flag", + description="Emit result without interactive prompt (quiet mode)", + alias="emit-only" + ), + CmdletArg( + name="-scrape", + type="string", + 
description="Scrape metadata from URL (returns tags as JSON)", + required=False + ) + ] +) + + diff --git a/cmdlets/get_url.py b/cmdlets/get_url.py new file mode 100644 index 0000000..271b53c --- /dev/null +++ b/cmdlets/get_url.py @@ -0,0 +1,90 @@ +from __future__ import annotations + +from typing import Any, Dict, Sequence +import json + +from . import register +import models +import pipeline as ctx +from helper import hydrus as hydrus_wrapper +from ._shared import Cmdlet, CmdletArg, normalize_hash +from helper.logger import log + +CMDLET = Cmdlet( + name="get-url", + summary="List URLs associated with a Hydrus file.", + usage="get-url [-hash ]", + args=[ + CmdletArg("-hash", description="Override the Hydrus file hash (SHA256) to target instead of the selected result."), + ], + details=[ + "- Prints the known URLs for the selected Hydrus file.", + ], +) + + +def _parse_hash_and_rest(args: Sequence[str]) -> tuple[str | None, list[str]]: + override_hash: str | None = None + rest: list[str] = [] + i = 0 + while i < len(args): + a = args[i] + low = str(a).lower() + if low in {"-hash", "--hash", "hash"} and i + 1 < len(args): + override_hash = str(args[i + 1]).strip() + i += 2 + continue + rest.append(a) + i += 1 + return override_hash, rest + + +@register(["get-url", "get-urls", "get_url"]) # aliases +def get_urls(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: + # Helper to get field from both dict and object + def get_field(obj: Any, field: str, default: Any = None) -> Any: + if isinstance(obj, dict): + return obj.get(field, default) + else: + return getattr(obj, field, default) + + # Help + try: + if any(str(a).lower() in {"-?", "/?", "--help", "-h", "help", "--cmdlet"} for a in args): + log(json.dumps(CMDLET, ensure_ascii=False, indent=2)) + return 0 + except Exception: + pass + override_hash, _ = _parse_hash_and_rest(args) + hash_hex = normalize_hash(override_hash) if override_hash else normalize_hash(get_field(result, "hash_hex", None)) + if not hash_hex: + log("Selected result does not include a Hydrus hash") + return 1 + try: + client = hydrus_wrapper.get_client(config) + except Exception as exc: + log(f"Hydrus client unavailable: {exc}") + return 1 + + if client is None: + log("Hydrus client unavailable") + return 1 + try: + payload = client.fetch_file_metadata(hashes=[hash_hex], include_file_urls=True) + except Exception as exc: + log(f"Hydrus metadata fetch failed: {exc}") + return 1 + items = payload.get("metadata") if isinstance(payload, dict) else None + meta = items[0] if (isinstance(items, list) and items and isinstance(items[0], dict)) else None + urls = (meta.get("known_urls") if isinstance(meta, dict) else None) or [] + if urls: + ctx.emit("URLs:") + for u in urls: + text = str(u).strip() + if text: + ctx.emit(f"- {text}") + else: + ctx.emit("No URLs found.") + return 0 + + diff --git a/cmdlets/manage_config.py b/cmdlets/manage_config.py new file mode 100644 index 0000000..ac7126f --- /dev/null +++ b/cmdlets/manage_config.py @@ -0,0 +1,138 @@ +from typing import List, Dict, Any +from ._shared import Cmdlet, CmdletArg +from config import load_config, save_config + +CMDLET = Cmdlet( + name=".config", + summary="Manage configuration settings", + usage=".config [key] [value]", + args=[ + CmdletArg( + name="key", + description="Configuration key to update (dot-separated)", + required=False + ), + CmdletArg( + name="value", + description="New value for the configuration key", + required=False + ) + ] +) + +def flatten_config(config: Dict[str, Any], parent_key: 
str = '', sep: str = '.') -> List[Dict[str, Any]]: + items = [] + for k, v in config.items(): + if k.startswith('_'): # Skip internal keys + continue + + new_key = f"{parent_key}{sep}{k}" if parent_key else k + if isinstance(v, dict): + items.extend(flatten_config(v, new_key, sep=sep)) + else: + items.append({ + "Key": new_key, + "Value": str(v), + "Type": type(v).__name__, + "_selection_args": [new_key] + }) + return items + +def set_nested_config(config: Dict[str, Any], key: str, value: str) -> bool: + keys = key.split('.') + d = config + + # Navigate to the parent dict + for k in keys[:-1]: + if k not in d or not isinstance(d[k], dict): + d[k] = {} + d = d[k] + + last_key = keys[-1] + + # Try to preserve type if key exists + if last_key in d: + current_val = d[last_key] + if isinstance(current_val, bool): + if value.lower() in ('true', 'yes', '1', 'on'): + d[last_key] = True + elif value.lower() in ('false', 'no', '0', 'off'): + d[last_key] = False + else: + # Fallback to boolean conversion of string (usually True for non-empty) + # But for config, explicit is better. + print(f"Warning: Could not convert '{value}' to boolean. Using string.") + d[last_key] = value + elif isinstance(current_val, int): + try: + d[last_key] = int(value) + except ValueError: + print(f"Warning: Could not convert '{value}' to int. Using string.") + d[last_key] = value + elif isinstance(current_val, float): + try: + d[last_key] = float(value) + except ValueError: + print(f"Warning: Could not convert '{value}' to float. Using string.") + d[last_key] = value + else: + d[last_key] = value + else: + # New key, try to infer type + if value.lower() in ('true', 'false'): + d[last_key] = (value.lower() == 'true') + elif value.isdigit(): + d[last_key] = int(value) + else: + d[last_key] = value + + return True + +def _run(piped_result: Any, args: List[str], config: Dict[str, Any]) -> int: + # Reload config to ensure we have the latest on disk + # We don't use the passed 'config' because we want to edit the file + # and 'config' might contain runtime objects (like worker manager) + # But load_config() returns a fresh dict from disk (or cache) + # We should use load_config() + + current_config = load_config() + + # Parse args + # We handle args manually because of the potential for spaces in values + # and the @ expansion logic in CLI.py passing args + + if not args: + # List mode + items = flatten_config(current_config) + # Sort by key + items.sort(key=lambda x: x['Key']) + + # Emit items for ResultTable + import pipeline as ctx + for item in items: + ctx.emit(item) + return 0 + + # Update mode + key = args[0] + + if len(args) < 2: + print(f"Error: Value required for key '{key}'") + return 1 + + value = " ".join(args[1:]) + + # Remove quotes if present + if (value.startswith('"') and value.endswith('"')) or (value.startswith("'") and value.endswith("'")): + value = value[1:-1] + + try: + set_nested_config(current_config, key, value) + save_config(current_config) + print(f"Updated '{key}' to '{value}'") + return 0 + except Exception as e: + print(f"Error updating config: {e}") + return 1 + +CMDLET.exec = _run diff --git a/cmdlets/merge_file.py b/cmdlets/merge_file.py new file mode 100644 index 0000000..e4fb54c --- /dev/null +++ b/cmdlets/merge_file.py @@ -0,0 +1,916 @@ +"""Merge multiple files into a single output file.""" +from __future__ import annotations + +from typing import Any, Dict, Optional, Sequence, List +from pathlib import Path +import json +import sys + +from helper.logger import log +from helper.download 
import download_media +from models import DownloadOptions +from config import resolve_output_dir +import subprocess as _subprocess +import shutil as _shutil +from ._shared import parse_cmdlet_args + +try: + from PyPDF2 import PdfWriter, PdfReader + HAS_PYPDF2 = True +except ImportError: + HAS_PYPDF2 = False + PdfWriter = None + PdfReader = None + +try: + from metadata import ( + read_tags_from_file, + write_tags_to_file, + dedup_tags_by_namespace, + merge_multiple_tag_lists, + write_tags, + write_metadata + ) + HAS_METADATA_API = True +except ImportError: + HAS_METADATA_API = False + +from . import register +from ._shared import ( + Cmdlet, + CmdletArg, + normalize_result_input, + get_pipe_object_path, + get_pipe_object_hash, +) +import models +import pipeline as ctx + + +def _get_item_value(item: Any, key: str, default: Any = None) -> Any: + """Helper to read either dict keys or attributes.""" + if isinstance(item, dict): + return item.get(key, default) + return getattr(item, key, default) + + + + +def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: + """Merge multiple files into one.""" + + # Parse help + try: + if any(str(a).lower() in {"-?", "/?", "--help", "-h", "help", "--cmdlet"} for a in args): + log(json.dumps(CMDLET, ensure_ascii=False, indent=2)) + return 0 + except Exception: + pass + + # Parse arguments + parsed = parse_cmdlet_args(args, CMDLET) + delete_after = parsed.get("delete", False) + + output_override: Optional[Path] = None + output_arg = parsed.get("output") + if output_arg: + try: + output_override = Path(str(output_arg)).expanduser() + except Exception: + output_override = None + + format_spec = parsed.get("format") + if format_spec: + format_spec = str(format_spec).lower().strip() + + # Collect files from piped results + # Use normalize_result_input to handle both single items and lists + files_to_merge: List[Dict[str, Any]] = normalize_result_input(result) + + if not files_to_merge: + log("No files provided to merge", file=sys.stderr) + return 1 + + if len(files_to_merge) < 2: + # Only 1 file - pass it through unchanged + # (merge only happens when multiple files are collected) + item = files_to_merge[0] + ctx.emit(item) + return 0 + + # Extract file paths and metadata from result objects + source_files: List[Path] = [] + source_tags_files: List[Path] = [] + source_hashes: List[str] = [] + source_urls: List[str] = [] + source_tags: List[str] = [] # NEW: collect tags from source files + source_relationships: List[str] = [] # NEW: collect relationships from source files + + for item in files_to_merge: + raw_path = get_pipe_object_path(item) + target_path = None + if isinstance(raw_path, Path): + target_path = raw_path + elif isinstance(raw_path, str) and raw_path.strip(): + candidate = Path(raw_path).expanduser() + if candidate.exists(): + target_path = candidate + + # Check for playlist item that needs downloading + if not target_path and isinstance(item, dict) and item.get('__action', '').startswith('playlist-item:'): + try: + playlist_url = item.get('__file_path') + item_idx = int(item['__action'].split(':')[1]) + log(f"Downloading playlist item #{item_idx} from {playlist_url}...", flush=True) + + output_dir = resolve_output_dir(config) + opts = DownloadOptions( + url=playlist_url, + output_dir=output_dir, + playlist_items=str(item_idx), + mode="audio" if format_spec == "m4b" else "auto" # Infer mode if possible + ) + + res = download_media(opts) + if res and res.path and res.path.exists(): + target_path = res.path + log(f"✓ Downloaded: 
{target_path.name}", flush=True) + except Exception as e: + log(f"Failed to download playlist item: {e}", file=sys.stderr) + + if target_path and target_path.exists(): + source_files.append(target_path) + + # Track the .tags file for this source + tags_file = target_path.with_suffix(target_path.suffix + '.tags') + if tags_file.exists(): + source_tags_files.append(tags_file) + + # Try to read hash, tags, urls, and relationships from .tags sidecar file + try: + tags_content = tags_file.read_text(encoding='utf-8') + for line in tags_content.split('\n'): + line = line.strip() + if not line: + continue + if line.startswith('hash:'): + hash_value = line[5:].strip() + if hash_value: + source_hashes.append(hash_value) + elif line.startswith('known_url:') or line.startswith('url:'): + # Extract URLs from tags file + url_value = line.split(':', 1)[1].strip() if ':' in line else '' + if url_value and url_value not in source_urls: + source_urls.append(url_value) + elif line.startswith('relationship:'): + # Extract relationships from tags file + rel_value = line.split(':', 1)[1].strip() if ':' in line else '' + if rel_value and rel_value not in source_relationships: + source_relationships.append(rel_value) + else: + # Collect actual tags (not metadata like hash: or known_url:) + source_tags.append(line) + except Exception: + pass + + # Extract hash if available in item (as fallback) + hash_value = get_pipe_object_hash(item) + if hash_value and hash_value not in source_hashes: + source_hashes.append(str(hash_value)) + + # Extract known URLs if available + known_urls = _get_item_value(item, 'known_urls', []) + if isinstance(known_urls, str): + source_urls.append(known_urls) + elif isinstance(known_urls, list): + source_urls.extend(known_urls) + else: + title = _get_item_value(item, 'title', 'unknown') or _get_item_value(item, 'id', 'unknown') + log(f"Warning: Could not locate file for item: {title}", file=sys.stderr) + + if len(source_files) < 2: + log("At least 2 valid files required to merge", file=sys.stderr) + return 1 + + # Detect file types + file_types = set() + for f in source_files: + suffix = f.suffix.lower() + if suffix in {'.mp3', '.flac', '.wav', '.m4a', '.aac', '.ogg', '.opus', '.mka'}: + file_types.add('audio') + elif suffix in {'.mp4', '.mkv', '.webm', '.mov', '.avi', '.flv', '.mpg', '.mpeg', '.ts', '.m4v', '.wmv'}: + file_types.add('video') + elif suffix in {'.pdf'}: + file_types.add('pdf') + elif suffix in {'.txt', '.srt', '.vtt', '.md', '.log'}: + file_types.add('text') + else: + file_types.add('other') + + if len(file_types) > 1 and 'other' not in file_types: + log(f"Mixed file types detected: {', '.join(sorted(file_types))}", file=sys.stderr) + log(f"Can only merge files of the same type", file=sys.stderr) + return 1 + + file_kind = list(file_types)[0] if file_types else 'other' + + # Determine output format + output_format = format_spec or 'auto' + if output_format == 'auto': + if file_kind == 'audio': + output_format = 'mka' # Default audio codec - mka supports chapters and stream copy + elif file_kind == 'video': + output_format = 'mp4' # Default video codec + elif file_kind == 'pdf': + output_format = 'pdf' + else: + output_format = 'txt' + + # Determine output path + if output_override: + if output_override.is_dir(): + base_name = _sanitize_name(getattr(files_to_merge[0], 'title', 'merged')) + output_path = output_override / f"{base_name} (merged).{_ext_for_format(output_format)}" + else: + output_path = output_override + else: + first_file = source_files[0] + output_path = 
first_file.parent / f"{first_file.stem} (merged).{_ext_for_format(output_format)}" + + # Ensure output directory exists + output_path.parent.mkdir(parents=True, exist_ok=True) + + # Perform merge based on file type + if file_kind == 'audio': + success = _merge_audio(source_files, output_path, output_format) + elif file_kind == 'video': + success = _merge_video(source_files, output_path, output_format) + elif file_kind == 'pdf': + success = _merge_pdf(source_files, output_path) + elif file_kind == 'text': + success = _merge_text(source_files, output_path) + else: + log(f"Unsupported file type: {file_kind}", file=sys.stderr) + return 1 + + if not success: + log("Merge failed", file=sys.stderr) + return 1 + + log(f"Merged {len(source_files)} files into: {output_path}", file=sys.stderr) + + # Create .tags sidecar file for the merged output using unified API + tags_path = output_path.with_suffix(output_path.suffix + '.tags') + try: + # Start with title tag + merged_tags = [f"title:{output_path.stem}"] + + # Merge tags from source files using metadata API + if source_tags and HAS_METADATA_API: + # Use dedup function to normalize and deduplicate + merged_source_tags = dedup_tags_by_namespace(source_tags) + merged_tags.extend(merged_source_tags) + log(f"Merged {len(merged_source_tags)} unique tags from source files", file=sys.stderr) + elif source_tags: + # Fallback: simple deduplication if metadata API unavailable + merged_tags.extend(list(dict.fromkeys(source_tags))) # Preserve order, remove duplicates + + # Write merged tags to sidecar file + if HAS_METADATA_API and write_tags_to_file: + # Use unified API for file writing + source_hashes_list = source_hashes if source_hashes else None + source_urls_list = source_urls if source_urls else None + write_tags_to_file(tags_path, merged_tags, source_hashes_list, source_urls_list) + else: + # Fallback: manual file writing + tags_lines = [] + + # Add hash first (if available) + if source_hashes: + tags_lines.append(f"hash:{source_hashes[0]}") + + # Add regular tags + tags_lines.extend(merged_tags) + + # Add known URLs + if source_urls: + for url in source_urls: + tags_lines.append(f"known_url:{url}") + + # Add relationships (if available) + if source_relationships: + for rel in source_relationships: + tags_lines.append(f"relationship:{rel}") + + with open(tags_path, 'w', encoding='utf-8') as f: + f.write('\n'.join(tags_lines) + '\n') + + log(f"Created sidecar: {tags_path.name}", file=sys.stderr) + + # Also create .metadata file using centralized function + try: + write_metadata(output_path, source_hashes[0] if source_hashes else None, source_urls, source_relationships) + log(f"Created metadata: {output_path.name}.metadata", file=sys.stderr) + except Exception as e: + log(f"Warning: Could not create metadata file: {e}", file=sys.stderr) + + except Exception as e: + log(f"Warning: Could not create sidecar: {e}", file=sys.stderr) + + # Emit PipelineItem so the merged file can be piped to next command + try: + # Try to import PipelineItem from downlow module + try: + from downlow import PipelineItem + except ImportError: + # Fallback: create a simple object with the required attributes + class SimpleItem: + def __init__(self, target, title, media_kind, tags=None, known_urls=None): + self.target = target + self.title = title + self.media_kind = media_kind + self.tags = tags or [] + self.known_urls = known_urls or [] + PipelineItem = SimpleItem + + merged_item = PipelineItem( + target=str(output_path), + title=output_path.stem, + media_kind=file_kind, + 
tags=merged_tags, # Include merged tags + known_urls=source_urls # Include known URLs + ) + ctx.emit(merged_item) + except Exception as e: + log(f"Warning: Could not emit pipeline item: {e}", file=sys.stderr) + # Still emit a string representation for feedback + ctx.emit(f"Merged: {output_path}") + + # Delete source files if requested + if delete_after: + # First delete all .tags files + for tags_file in source_tags_files: + try: + tags_file.unlink() + log(f"Deleted: {tags_file.name}", file=sys.stderr) + except Exception as e: + log(f"Warning: Could not delete {tags_file.name}: {e}", file=sys.stderr) + + # Then delete all source files + for f in source_files: + try: + f.unlink() + log(f"Deleted: {f.name}", file=sys.stderr) + except Exception as e: + log(f"Warning: Could not delete {f.name}: {e}", file=sys.stderr) + + return 0 + + +def _sanitize_name(text: str) -> str: + """Sanitize filename.""" + allowed = [] + for ch in text: + allowed.append(ch if (ch.isalnum() or ch in {"-", "_", " ", "."}) else " ") + return (" ".join("".join(allowed).split()) or "merged").strip() + + +def _ext_for_format(fmt: str) -> str: + """Get file extension for format.""" + format_map = { + 'mp3': 'mp3', + 'm4a': 'm4a', + 'aac': 'aac', + 'opus': 'opus', + 'mka': 'mka', # Matroska Audio - EXCELLENT chapter support (recommended) + 'mkv': 'mkv', + 'mp4': 'mp4', + 'webm': 'webm', + 'pdf': 'pdf', + 'txt': 'txt', + 'auto': 'mka', # Default - MKA for chapters + } + return format_map.get(fmt.lower(), 'mka') + + +def _add_chapters_to_m4a(file_path: Path, chapters: List[Dict]) -> bool: + """Add chapters to an M4A file using mutagen. + + Args: + file_path: Path to M4A file + chapters: List of chapter dicts with 'title', 'start_ms', 'end_ms' + + Returns: + True if successful, False otherwise + """ + import logging + logger = logging.getLogger(__name__) + + if not chapters: + return True + + try: + from mutagen.mp4 import MP4, Atom + from mutagen.mp4._util import Atom as MP4Atom + except ImportError: + logger.warning("[merge-file] mutagen not available for chapter writing") + return False + + try: + # Load the MP4 file + audio = MP4(str(file_path)) + + # Build the chapter atom + # MP4 chapters are stored in a 'chap' atom with specific structure + chapter_data = b'' + + for i, chapter in enumerate(chapters, 1): + # Each chapter entry: 10-byte header + title + title = chapter.get('title', f'Chapter {i}').encode('utf-8') + start_time_ms = int(chapter.get('start_ms', 0)) + + # Chapter atom format for M4A: + # (uint32: size)(uint32: 'chap')(uint8: reserved)(uint24: atom type) + more... 
+ # This is complex, so we'll use a simpler atom approach + pass + + # Unfortunately, mutagen doesn't have built-in chapter writing for MP4 + # Chapter writing requires low-level atom manipulation + # For now, we'll just return and note this limitation + logger.info("[merge-file] MP4 chapter writing via mutagen not fully supported") + return False + + except Exception as e: + logger.warning(f"[merge-file] Error writing chapters: {e}") + return False + + +def _merge_audio(files: List[Path], output: Path, output_format: str) -> bool: + """Merge audio files with chapters based on file boundaries.""" + import logging + logger = logging.getLogger(__name__) + + ffmpeg_path = _shutil.which('ffmpeg') + if not ffmpeg_path: + log("ffmpeg not found in PATH", file=sys.stderr) + return False + + try: + # Step 1: Get duration of each file to calculate chapter timestamps + chapters = [] + current_time_ms = 0 + + log(f"Analyzing {len(files)} files for chapter information...", file=sys.stderr) + logger.info(f"[merge-file] Analyzing files for chapters") + + for file_path in files: + # Get duration using ffprobe + try: + ffprobe_cmd = [ + 'ffprobe', '-v', 'error', '-show_entries', + 'format=duration', '-print_format', + 'default=noprint_wrappers=1:nokey=1', str(file_path) + ] + + probe_result = _subprocess.run(ffprobe_cmd, capture_output=True, text=True, timeout=10) + if probe_result.returncode == 0 and probe_result.stdout.strip(): + try: + duration_sec = float(probe_result.stdout.strip()) + except ValueError: + logger.warning(f"[merge-file] Could not parse duration from ffprobe output: {probe_result.stdout}") + duration_sec = 0 + else: + logger.warning(f"[merge-file] ffprobe failed for {file_path.name}: {probe_result.stderr}") + duration_sec = 0 + except Exception as e: + logger.warning(f"[merge-file] Could not get duration for {file_path.name}: {e}") + duration_sec = 0 + + # Create chapter entry - use title: tag from metadata if available + title = file_path.stem # Default to filename without extension + if HAS_METADATA_API: + try: + # Try to read tags from .tags sidecar file + tags_file = file_path.with_suffix(file_path.suffix + '.tags') + if tags_file.exists(): + tags = read_tags_from_file(tags_file) + if tags: + # Look for title: tag + for tag in tags: + if isinstance(tag, str) and tag.lower().startswith('title:'): + # Extract the title value after the colon + title = tag.split(':', 1)[1].strip() + break + except Exception as e: + logger.debug(f"[merge-file] Could not read metadata for {file_path.name}: {e}") + pass # Fall back to filename + + # Convert seconds to HH:MM:SS.mmm format + hours = int(current_time_ms // 3600000) + minutes = int((current_time_ms % 3600000) // 60000) + seconds = int((current_time_ms % 60000) // 1000) + millis = int(current_time_ms % 1000) + + chapters.append({ + 'time_ms': current_time_ms, + 'time_str': f"{hours:02d}:{minutes:02d}:{seconds:02d}.{millis:03d}", + 'title': title, + 'duration_sec': duration_sec + }) + + logger.info(f"[merge-file] Chapter: {title} @ {chapters[-1]['time_str']} (duration: {duration_sec:.2f}s)") + current_time_ms += int(duration_sec * 1000) + + # Step 2: Create concat demuxer file + concat_file = output.parent / f".concat_{output.stem}.txt" + concat_lines = [] + for f in files: + # Escape quotes in path + safe_path = str(f).replace("'", "'\\''") + concat_lines.append(f"file '{safe_path}'") + + concat_file.write_text('\n'.join(concat_lines), encoding='utf-8') + + # Step 3: Create FFmpeg metadata file with chapters + metadata_file = output.parent / 
f".metadata_{output.stem}.txt" + metadata_lines = [';FFMETADATA1'] + + for i, chapter in enumerate(chapters): + # FFMetadata format for chapters (note: [CHAPTER] not [CHAPTER01]) + metadata_lines.append('[CHAPTER]') + metadata_lines.append('TIMEBASE=1/1000') + metadata_lines.append(f'START={chapter["time_ms"]}') + # Calculate end time (start of next chapter or end of file) + if i < len(chapters) - 1: + metadata_lines.append(f'END={chapters[i+1]["time_ms"]}') + else: + metadata_lines.append(f'END={current_time_ms}') + metadata_lines.append(f'title={chapter["title"]}') + + metadata_file.write_text('\n'.join(metadata_lines), encoding='utf-8') + log(f"Created chapters metadata file with {len(chapters)} chapters", file=sys.stderr) + logger.info(f"[merge-file] Created {len(chapters)} chapters") + + # Step 4: Build FFmpeg command to merge and embed chapters + # Strategy: First merge audio, then add metadata in separate pass + cmd = [ffmpeg_path, '-y', '-f', 'concat', '-safe', '0', '-i', str(concat_file)] + + # Add threading options for speed + cmd.extend(['-threads', '0']) # Use all available threads + + # Audio codec selection for first input + if output_format == 'mp3': + cmd.extend(['-c:a', 'libmp3lame', '-q:a', '2']) + elif output_format == 'm4a': + # Use copy if possible (much faster), otherwise re-encode + # Check if inputs are already AAC/M4A to avoid re-encoding + # For now, default to copy if format matches, otherwise re-encode + # But since we are merging potentially different codecs, re-encoding is safer + # To speed up re-encoding, we can use a faster preset or hardware accel if available + cmd.extend(['-c:a', 'aac', '-b:a', '256k']) # M4A with better quality + elif output_format == 'aac': + cmd.extend(['-c:a', 'aac', '-b:a', '192k']) + elif output_format == 'opus': + cmd.extend(['-c:a', 'libopus', '-b:a', '128k']) + elif output_format == 'mka': + # FLAC is fast to encode but large. Copy is fastest if inputs are compatible. + # If we want speed, copy is best. If we want compatibility, re-encode. + # Let's try copy first if inputs are same format, but that's hard to detect here. 
+ # Defaulting to copy for MKA as it's a container that supports many codecs + cmd.extend(['-c:a', 'copy']) + else: + cmd.extend(['-c:a', 'copy']) # Copy without re-encoding + + # Add the output file + cmd.append(str(output)) + + log(f"Merging {len(files)} audio files to {output_format}...", file=sys.stderr) + logger.info(f"[merge-file] Running ffmpeg merge: {' '.join(cmd)}") + + # Run ffmpeg with progress monitoring + try: + from helper.progress import print_progress, print_final_progress + import re + + process = _subprocess.Popen( + cmd, + stdout=_subprocess.PIPE, + stderr=_subprocess.PIPE, + text=True, + encoding='utf-8', + errors='replace' + ) + + # Monitor progress + duration_re = re.compile(r"time=(\d{2}):(\d{2}):(\d{2})\.(\d{2})") + total_duration_sec = current_time_ms / 1000.0 + + while True: + # Read stderr line by line (ffmpeg writes progress to stderr) + if process.stderr: + line = process.stderr.readline() + if not line and process.poll() is not None: + break + + if line: + # Parse time=HH:MM:SS.mm + match = duration_re.search(line) + if match and total_duration_sec > 0: + h, m, s, cs = map(int, match.groups()) + current_sec = h * 3600 + m * 60 + s + cs / 100.0 + + # Calculate speed/bitrate if available (optional) + # For now just show percentage + print_progress( + output.name, + int(current_sec * 1000), # Use ms as "bytes" for progress bar + int(total_duration_sec * 1000), + speed=0 + ) + else: + break + + # Wait for completion + stdout, stderr = process.communicate() + + if process.returncode != 0: + log(f"FFmpeg error: {stderr}", file=sys.stderr) + raise _subprocess.CalledProcessError(process.returncode, cmd, output=stdout, stderr=stderr) + + print_final_progress(output.name, int(total_duration_sec * 1000), 0) + + except Exception as e: + logger.exception(f"[merge-file] ffmpeg process error: {e}") + raise + + log(f"Merge successful, adding chapters metadata...", file=sys.stderr) + + # Step 5: Embed chapters into container (MKA, MP4/M4A, or note limitation) + if output_format == 'mka' or output.suffix.lower() == '.mka': + # MKA/MKV format has native chapter support via FFMetadata + # Re-mux the file with chapters embedded (copy streams, no re-encode) + log(f"Embedding chapters into Matroska container...", file=sys.stderr) + logger.info(f"[merge-file] Adding chapters to MKA file via FFMetadata") + + temp_output = output.parent / f".temp_{output.stem}.mka" + + # Use mkvmerge if available (best for MKA chapters), otherwise fall back to ffmpeg + mkvmerge_path = _shutil.which('mkvmerge') + + if mkvmerge_path: + # mkvmerge is the best tool for embedding chapters in Matroska files + log(f"Using mkvmerge for optimal chapter embedding...", file=sys.stderr) + cmd2 = [ + mkvmerge_path, '-o', str(temp_output), + '--chapters', str(metadata_file), + str(output) + ] + else: + # Fallback to ffmpeg with proper chapter embedding for Matroska + log(f"Using ffmpeg for chapter embedding (install mkvtoolnix for better quality)...", file=sys.stderr) + # For Matroska files, the metadata must be provided via -f ffmetadata input + cmd2 = [ + ffmpeg_path, '-y', + '-i', str(output), # Input: merged audio + '-i', str(metadata_file), # Input: FFMetadata file + '-c:a', 'copy', # Copy audio without re-encoding + '-threads', '0', # Use all threads + '-map', '0', # Map all from first input + '-map_chapters', '1', # Map CHAPTERS from second input (FFMetadata) + str(temp_output) # Output + ] + + logger.info(f"[merge-file] Running chapter embedding: {' '.join(cmd2)}") + + try: + # Run chapter embedding 
silently (progress handled by worker thread) + _subprocess.run( + cmd2, + capture_output=True, + text=True, + stdin=_subprocess.DEVNULL, + timeout=600, + check=False + ) + + # Replace original with temp if successful + if temp_output.exists() and temp_output.stat().st_size > 0: + try: + import shutil + if output.exists(): + output.unlink() + shutil.move(str(temp_output), str(output)) + log(f"✓ Chapters successfully embedded!", file=sys.stderr) + logger.info(f"[merge-file] Chapters embedded successfully") + except Exception as e: + logger.warning(f"[merge-file] Could not replace file: {e}") + log(f"Warning: Could not embed chapters, using merge without chapters", file=sys.stderr) + try: + temp_output.unlink() + except Exception: + pass + else: + logger.warning(f"[merge-file] Chapter embedding did not create output") + except Exception as e: + logger.exception(f"[merge-file] Chapter embedding failed: {e}") + log(f"Warning: Chapter embedding failed, using merge without chapters", file=sys.stderr) + elif output_format == 'm4a' or output.suffix.lower() in ['.m4a', '.mp4']: + # MP4/M4A format has native chapter support via iTunes metadata atoms + log(f"Embedding chapters into MP4 container...", file=sys.stderr) + logger.info(f"[merge-file] Adding chapters to M4A/MP4 file via iTunes metadata") + + temp_output = output.parent / f".temp_{output.stem}{output.suffix}" + + # ffmpeg embeds chapters in MP4 using -map_metadata and -map_chapters + log(f"Using ffmpeg for MP4 chapter embedding...", file=sys.stderr) + cmd2 = [ + ffmpeg_path, '-y', + '-i', str(output), # Input: merged audio + '-i', str(metadata_file), # Input: FFMetadata file + '-c:a', 'copy', # Copy audio without re-encoding + '-threads', '0', # Use all threads + '-map', '0', # Map all from first input + '-map_metadata', '1', # Map metadata from second input (FFMetadata) + '-map_chapters', '1', # Map CHAPTERS from second input (FFMetadata) + str(temp_output) # Output + ] + + logger.info(f"[merge-file] Running MP4 chapter embedding: {' '.join(cmd2)}") + + try: + # Run MP4 chapter embedding silently (progress handled by worker thread) + _subprocess.run( + cmd2, + capture_output=True, + text=True, + stdin=_subprocess.DEVNULL, + timeout=600, + check=False + ) + + # Replace original with temp if successful + if temp_output.exists() and temp_output.stat().st_size > 0: + try: + import shutil + if output.exists(): + output.unlink() + shutil.move(str(temp_output), str(output)) + log(f"✓ Chapters successfully embedded in MP4!", file=sys.stderr) + logger.info(f"[merge-file] MP4 chapters embedded successfully") + except Exception as e: + logger.warning(f"[merge-file] Could not replace file: {e}") + log(f"Warning: Could not embed chapters, using merge without chapters", file=sys.stderr) + try: + temp_output.unlink() + except Exception: + pass + else: + logger.warning(f"[merge-file] MP4 chapter embedding did not create output") + except Exception as e: + logger.exception(f"[merge-file] MP4 chapter embedding failed: {e}") + log(f"Warning: MP4 chapter embedding failed, using merge without chapters", file=sys.stderr) + else: + # For other formats, chapters would require external tools + logger.info(f"[merge-file] Format {output_format} does not have native chapter support") + log(f"Note: For chapter support, use MKA or M4A format", file=sys.stderr) + + # Clean up temp files + try: + concat_file.unlink() + except Exception: + pass + try: + metadata_file.unlink() + except Exception: + pass + + return True + + except Exception as e: + log(f"Audio merge error: 
{e}", file=sys.stderr) + logger.error(f"[merge-file] Audio merge error: {e}", exc_info=True) + return False + + +def _merge_video(files: List[Path], output: Path, output_format: str) -> bool: + """Merge video files.""" + ffmpeg_path = _shutil.which('ffmpeg') + if not ffmpeg_path: + log("ffmpeg not found in PATH", file=sys.stderr) + return False + + try: + # Create concat demuxer file + concat_file = output.parent / f".concat_{output.stem}.txt" + concat_lines = [] + for f in files: + safe_path = str(f).replace("'", "'\\''") + concat_lines.append(f"file '{safe_path}'") + + concat_file.write_text('\n'.join(concat_lines), encoding='utf-8') + + # Build FFmpeg command for video merge + cmd = [ffmpeg_path, '-y', '-f', 'concat', '-safe', '0', '-i', str(concat_file)] + + # Video codec selection + if output_format == 'mp4': + cmd.extend(['-c:v', 'libx265', '-preset', 'fast', '-tag:v', 'hvc1', '-c:a', 'aac', '-b:a', '192k']) + elif output_format == 'mkv': + cmd.extend(['-c:v', 'libx265', '-preset', 'fast', '-c:a', 'aac', '-b:a', '192k']) + else: + cmd.extend(['-c', 'copy']) # Copy without re-encoding + + cmd.append(str(output)) + + log(f"Merging {len(files)} video files...", file=sys.stderr) + result = _subprocess.run(cmd, capture_output=True, text=True) + + # Clean up concat file + try: + concat_file.unlink() + except Exception: + pass + + if result.returncode != 0: + stderr = (result.stderr or '').strip() + log(f"FFmpeg error: {stderr}", file=sys.stderr) + return False + + return True + + except Exception as e: + log(f"Video merge error: {e}", file=sys.stderr) + return False + + +def _merge_text(files: List[Path], output: Path) -> bool: + """Merge text files.""" + try: + with open(output, 'w', encoding='utf-8') as outf: + for i, f in enumerate(files): + if i > 0: + outf.write('\n---\n') # Separator between files + try: + content = f.read_text(encoding='utf-8', errors='replace') + outf.write(content) + except Exception as e: + log(f"Warning reading {f.name}: {e}", file=sys.stderr) + + return True + + except Exception as e: + log(f"Text merge error: {e}", file=sys.stderr) + return False + + +def _merge_pdf(files: List[Path], output: Path) -> bool: + """Merge PDF files.""" + if not HAS_PYPDF2: + log("PyPDF2 is required for PDF merging. Install with: pip install PyPDF2", file=sys.stderr) + return False + + try: + if HAS_PYPDF2: + writer = PdfWriter() + else: + log("PyPDF2 is required for PDF merging. Install with: pip install PyPDF2", file=sys.stderr) + return False + + for f in files: + try: + reader = PdfReader(f) + for page in reader.pages: + writer.add_page(page) + log(f"Added {len(reader.pages)} pages from {f.name}", file=sys.stderr) + except Exception as e: + log(f"Error reading PDF {f.name}: {e}", file=sys.stderr) + return False + + with open(output, 'wb') as outf: + writer.write(outf) + + return True + + except Exception as e: + log(f"PDF merge error: {e}", file=sys.stderr) + return False + +CMDLET = Cmdlet( + name="merge-file", + summary="Merge multiple files into a single output file. Supports audio, video, PDF, and text merging with optional cleanup.", + usage="merge-file [-delete] [-output ] [-format ]", + args=[ + CmdletArg("-delete", type="flag", description="Delete source files after successful merge."), + CmdletArg("-output", description="Override output file path."), + CmdletArg("-format", description="Output format (auto/mp3/aac/opus/mp4/mkv/pdf/txt). 
Default: auto-detect from first file."), + ], + details=[ + "- Pipe multiple files: search-file query | [1,2,3] | merge-file", + "- Audio files merge with minimal quality loss using specified codec.", + "- Video files merge into MP4 or MKV containers.", + "- PDF files merge into a single PDF document.", + "- Text/document files are concatenated.", + "- Output name derived from first file with ' (merged)' suffix.", + "- -delete flag removes all source files after successful merge.", + ], +) diff --git a/cmdlets/pipe.py b/cmdlets/pipe.py new file mode 100644 index 0000000..e178caa --- /dev/null +++ b/cmdlets/pipe.py @@ -0,0 +1,335 @@ +from typing import Any, Dict, Sequence, List, Optional +import sys +import json +import platform +import socket +import re +import subprocess +from ._shared import Cmdlet, CmdletArg, parse_cmdlet_args +from helper.logger import log +from result_table import ResultTable +from .get_file import _get_fixed_ipc_pipe +import pipeline as ctx + +def _send_ipc_command(command: Dict[str, Any]) -> Optional[Any]: + """Send a command to the MPV IPC pipe and return the response.""" + ipc_pipe = _get_fixed_ipc_pipe() + request = json.dumps(command) + "\n" + + try: + if platform.system() == 'Windows': + # Windows named pipe + # Opening in r+b mode to read response + try: + with open(ipc_pipe, 'r+b', buffering=0) as pipe: + pipe.write(request.encode('utf-8')) + pipe.flush() + + # Read response + # We'll try to read a line. This might block if MPV is unresponsive. + response_line = pipe.readline() + if response_line: + return json.loads(response_line.decode('utf-8')) + except FileNotFoundError: + return None # MPV not running + except Exception as e: + log(f"Windows IPC Error: {e}", file=sys.stderr) + return None + else: + # Unix socket + af_unix = getattr(socket, 'AF_UNIX', None) + if af_unix is None: + log("Unix sockets not supported on this platform", file=sys.stderr) + return None + + try: + sock = socket.socket(af_unix, socket.SOCK_STREAM) + sock.settimeout(2.0) + sock.connect(ipc_pipe) + sock.sendall(request.encode('utf-8')) + + # Read response + response_data = b"" + while True: + try: + chunk = sock.recv(4096) + if not chunk: + break + response_data += chunk + if b"\n" in chunk: + break + except socket.timeout: + break + + sock.close() + + if response_data: + # Parse lines, look for response to our request + lines = response_data.decode('utf-8').strip().split('\n') + for line in lines: + try: + resp = json.loads(line) + # If it has 'error' field, it's a response + if 'error' in resp: + return resp + except: + pass + except (FileNotFoundError, ConnectionRefusedError): + return None # MPV not running + except Exception as e: + log(f"Unix IPC Error: {e}", file=sys.stderr) + return None + + except Exception as e: + log(f"IPC Error: {e}", file=sys.stderr) + return None + + return None + +def _get_playlist() -> List[Dict[str, Any]]: + """Get the current playlist from MPV.""" + cmd = {"command": ["get_property", "playlist"], "request_id": 100} + resp = _send_ipc_command(cmd) + if resp and resp.get("error") == "success": + return resp.get("data", []) + return [] + +def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: + """Manage and play items in the MPV playlist via IPC.""" + + parsed = parse_cmdlet_args(args, CMDLET) + + # Handle positional index argument if provided + index_arg = parsed.get("index") + + clear_mode = parsed.get("clear") + list_mode = parsed.get("list") + + # Handle piped input (add to playlist) + if result: + # If result is a list of 
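Both transports above speak MPV's JSON IPC protocol: one JSON object per line, answered with a line carrying an "error" field (and optionally "data") plus the echoed request_id. A small usage sketch of the helpers just defined; the property name and values are illustrative:

# Ask MPV how many entries are queued, then pause playback.
resp = _send_ipc_command({"command": ["get_property", "playlist-count"], "request_id": 1})
if resp and resp.get("error") == "success":
    log(f"MPV has {resp.get('data')} playlist entries")
    _send_ipc_command({"command": ["set_property", "pause", True], "request_id": 2})
else:
    log("MPV is not running or did not answer")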
items, add them to playlist + items_to_add = [] + if isinstance(result, list): + items_to_add = result + elif isinstance(result, dict): + items_to_add = [result] + + added_count = 0 + for i, item in enumerate(items_to_add): + # Extract URL/Path + target = None + title = None + + if isinstance(item, dict): + target = item.get("target") or item.get("url") or item.get("path") + title = item.get("title") or item.get("name") + elif hasattr(item, "target"): + target = item.target + title = getattr(item, "title", None) + elif isinstance(item, str): + target = item + + if target: + # Add to MPV playlist + # We use loadfile with append flag + # Configure 1080p limit for streams (bestvideo<=1080p + bestaudio) + options = { + "ytdl-format": "bestvideo[height<=?1080]+bestaudio/best[height<=?1080]" + } + + if title: + options["force-media-title"] = title + + cmd = {"command": ["loadfile", target, "append", options], "request_id": 200} + resp = _send_ipc_command(cmd) + + if resp is None: + # MPV not running (or died) + # Start MPV with remaining items + _start_mpv(items_to_add[i:]) + return 0 + elif resp.get("error") == "success": + added_count += 1 + if title: + log(f"Queued: {title}") + else: + log(f"Queued: {target}") + + if added_count > 0: + # If we added items, we might want to play the first one if nothing is playing? + # For now, just list the playlist + pass + + # Get playlist from MPV + items = _get_playlist() + + if not items: + log("MPV playlist is empty or MPV is not running.") + return 0 + + # If index is provided, perform action (Play or Clear) + if index_arg is not None: + try: + # Handle 1-based index + idx = int(index_arg) - 1 + + if idx < 0 or idx >= len(items): + log(f"Index {index_arg} out of range (1-{len(items)}).") + return 1 + + item = items[idx] + title = item.get("title") or item.get("filename") or "Unknown" + + if clear_mode: + # Remove item + cmd = {"command": ["playlist-remove", idx], "request_id": 101} + resp = _send_ipc_command(cmd) + if resp and resp.get("error") == "success": + log(f"Removed: {title}") + # Refresh items for listing + items = _get_playlist() + list_mode = True + index_arg = None + else: + log(f"Failed to remove item: {resp.get('error') if resp else 'No response'}") + return 1 + else: + # Play item + cmd = {"command": ["playlist-play-index", idx], "request_id": 102} + resp = _send_ipc_command(cmd) + if resp and resp.get("error") == "success": + log(f"Playing: {title}") + return 0 + else: + log(f"Failed to play item: {resp.get('error') if resp else 'No response'}") + return 1 + + except ValueError: + log(f"Invalid index: {index_arg}") + return 1 + + # List items (Default action or after clear) + if list_mode or index_arg is None: + if not items: + log("MPV playlist is empty.") + return 0 + + table = ResultTable("MPV Playlist") + + for i, item in enumerate(items): + is_current = item.get("current", False) + title = item.get("title") or "" + filename = item.get("filename") or "" + + # Special handling for memory:// M3U playlists (used to pass titles via IPC) + if "memory://" in filename and "#EXTINF:" in filename: + try: + # Extract title from #EXTINF:-1,Title + # Use regex to find title between #EXTINF:-1, and newline + match = re.search(r"#EXTINF:-1,(.*?)(?:\n|\r|$)", filename) + if match: + extracted_title = match.group(1).strip() + if not title or title == "memory://": + title = extracted_title + + # Extract actual URL + # Find the first line that looks like a URL and not a directive + lines = filename.splitlines() + for line in lines: + line = 
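For reference, the memory:// branch in the listing code is unpicking playlist entries that were queued as tiny in-memory M3U documents so a display title could ride along with the URL. A hedged example of the filename value MPV reports for such an entry and what the regex plus line scan recover from it (the URL is a placeholder):

filename = "memory://#EXTM3U\n#EXTINF:-1,My Video Title\nhttps://example.com/video.mp4"
m = re.search(r"#EXTINF:-1,(.*?)(?:\n|\r|$)", filename)
title = m.group(1).strip() if m else ""          # -> "My Video Title"
url = next(line for line in filename.splitlines()
           if line and not line.startswith("#") and not line.startswith("memory://"))
# url -> "https://example.com/video.mp4"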
line.strip() + if line and not line.startswith('#') and not line.startswith('memory://'): + filename = line + break + except Exception: + pass + + # Truncate if too long + if len(title) > 57: + title = title[:57] + "..." + if len(filename) > 27: + filename = filename[:27] + "..." + + row = table.add_row() + row.add_column("#", str(i + 1)) + row.add_column("Current", "*" if is_current else "") + row.add_column("Title", title) + row.add_column("Filename", filename) + + table.set_row_selection_args(i, [str(i + 1)]) + + table.set_source_command(".pipe") + + # Register results with pipeline context so @N selection works + ctx.set_last_result_table_overlay(table, items) + ctx.set_current_stage_table(table) + + print(table) + + return 0 + +def _start_mpv(items: List[Any]) -> None: + """Start MPV with a list of items.""" + ipc_pipe = _get_fixed_ipc_pipe() + + cmd = ['mpv', f'--input-ipc-server={ipc_pipe}'] + cmd.append('--ytdl-format=bestvideo[height<=?1080]+bestaudio/best[height<=?1080]') + + # Add items + first_title_set = False + + for item in items: + target = None + title = None + + if isinstance(item, dict): + target = item.get("target") or item.get("url") or item.get("path") + title = item.get("title") or item.get("name") + elif hasattr(item, "target"): + target = item.target + title = getattr(item, "title", None) + elif isinstance(item, str): + target = item + + if target: + if not first_title_set and title: + cmd.append(f'--force-media-title={title}') + first_title_set = True + cmd.append(target) + + if len(cmd) > 3: # mpv + ipc + format + at least one file + try: + kwargs = {} + if platform.system() == 'Windows': + kwargs['creationflags'] = 0x00000008 # DETACHED_PROCESS + + subprocess.Popen(cmd, stdin=subprocess.DEVNULL, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, **kwargs) + log(f"Started MPV with {len(cmd)-3} items") + except Exception as e: + log(f"Error starting MPV: {e}", file=sys.stderr) + +CMDLET = Cmdlet( + name=".pipe", + aliases=["pipe", "playlist", "queue", "ls-pipe"], + summary="Manage and play items in the MPV playlist via IPC", + usage=".pipe [index] [-clear]", + args=[ + CmdletArg( + name="index", + type="int", + description="Index of item to play or clear", + required=False + ), + CmdletArg( + name="clear", + type="flag", + description="Remove the selected item from the playlist" + ), + CmdletArg( + name="list", + type="flag", + description="List items (default)" + ), + ], + exec=_run +) + diff --git a/cmdlets/screen_shot.py b/cmdlets/screen_shot.py new file mode 100644 index 0000000..856de49 --- /dev/null +++ b/cmdlets/screen_shot.py @@ -0,0 +1,739 @@ +"""Screen-shot cmdlet for capturing screenshots of URLs in a pipeline. + +This cmdlet processes files through the pipeline and creates screenshots using +Playwright, marking them as temporary artifacts for cleanup. +""" + +from __future__ import annotations + +import contextlib +import hashlib +import importlib +import sys +import time +import httpx +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, List, Optional, Sequence, Tuple +from urllib.parse import urlsplit, quote, urljoin + +from helper.logger import log +from helper.http_client import HTTPClient + +from . 
import register +from ._shared import Cmdlet, CmdletArg, SharedArgs, create_pipe_object_result, normalize_result_input +import models +import pipeline as pipeline_context + +# ============================================================================ +# CMDLET Metadata Declaration +# ============================================================================ + + + +# ============================================================================ +# Playwright & Screenshot Dependencies +# ============================================================================ + +try: + from playwright.sync_api import ( + TimeoutError as PlaywrightTimeoutError, + ViewportSize, + sync_playwright, + ) +except Exception as exc: + raise RuntimeError( + "playwright is required for screenshot capture; install with 'pip install playwright'" + ) from exc + +try: + from config import resolve_output_dir +except ImportError: + try: + _parent_dir = str(Path(__file__).parent.parent) + if _parent_dir not in sys.path: + sys.path.insert(0, _parent_dir) + from config import resolve_output_dir + except ImportError: + resolve_output_dir = None + +# ============================================================================ +# Screenshot Constants & Configuration +# ============================================================================ + +USER_AGENT = ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/120.0.0.0 Safari/537.36" +) + +DEFAULT_VIEWPORT: ViewportSize = {"width": 1280, "height": 1200} +ARCHIVE_TIMEOUT = 30.0 + + +class ScreenshotError(RuntimeError): + """Raised when screenshot capture or upload fails.""" + + +@dataclass(slots=True) +class ScreenshotOptions: + """Options controlling screenshot capture and post-processing.""" + + url: str + output_dir: Path + output_path: Optional[Path] = None + full_page: bool = True + headless: bool = True + wait_after_load: float = 2.0 + wait_for_article: bool = False + replace_video_posters: bool = True + tags: Sequence[str] = () + archive: bool = False + archive_timeout: float = ARCHIVE_TIMEOUT + known_urls: Sequence[str] = () + output_format: Optional[str] = None + prefer_platform_target: bool = False + target_selectors: Optional[Sequence[str]] = None + selector_timeout_ms: int = 10_000 + + +@dataclass(slots=True) +class ScreenshotResult: + """Details about the captured screenshot.""" + + path: Path + url: str + tags_applied: List[str] + archive_urls: List[str] + known_urls: List[str] + warnings: List[str] = field(default_factory=list) + + +# ============================================================================ +# Helper Functions +# ============================================================================ + +def _ensure_directory(path: Path) -> None: + """Ensure directory exists.""" + if not isinstance(path, Path): + path = Path(path) + path.mkdir(parents=True, exist_ok=True) + + +def _unique_path(path: Path) -> Path: + """Get unique path by appending numbers if file exists.""" + if not path.exists(): + return path + stem = path.stem + suffix = path.suffix + parent = path.parent + counter = 1 + while True: + new_path = parent / f"{stem}_{counter}{suffix}" + if not new_path.exists(): + return new_path + counter += 1 + + +def _unique_preserve_order(items: Sequence[str]) -> List[str]: + """Remove duplicates while preserving order.""" + seen = set() + result = [] + for item in items: + if item not in seen: + seen.add(item) + result.append(item) + return result + + +def _slugify_url(url: str) -> str: + 
"""Convert URL to filesystem-safe slug.""" + parsed = urlsplit(url) + candidate = f"{parsed.netloc}{parsed.path}" + if parsed.query: + candidate += f"?{parsed.query}" + slug = "".join(char if char.isalnum() else "-" for char in candidate.lower()) + slug = slug.strip("-") or "screenshot" + return slug[:100] + + +def _normalise_format(fmt: Optional[str]) -> str: + """Normalize output format to valid values.""" + if not fmt: + return "png" + value = fmt.strip().lower() + if value in {"jpg", "jpeg"}: + return "jpeg" + if value in {"png", "pdf"}: + return value + return "png" + + +def _format_suffix(fmt: str) -> str: + """Get file suffix for format.""" + if fmt == "jpeg": + return ".jpg" + return f".{fmt}" + + +def _selectors_for_url(url: str) -> List[str]: + """Return a list of likely content selectors for known platforms.""" + u = url.lower() + sels: List[str] = [] + # Twitter/X + if "twitter.com" in u or "x.com" in u: + sels.extend([ + "article[role='article']", + "div[data-testid='tweet']", + "div[data-testid='cellInnerDiv'] article", + ]) + # Instagram + if "instagram.com" in u: + sels.extend([ + "article[role='presentation']", + "article[role='article']", + "div[role='dialog'] article", + "section main article", + ]) + # Reddit + if "reddit.com" in u: + sels.extend([ + "shreddit-post", + "div[data-testid='post-container']", + "div[data-click-id='background']", + "article", + ]) + # Rumble (video post) + if "rumble.com" in u: + sels.extend([ + "rumble-player, iframe.rumble", + "div.video-item--main", + "main article", + ]) + return sels or ["article"] + + +def _platform_preprocess(url: str, page: Any, warnings: List[str], timeout_ms: int = 10_000) -> None: + """Best-effort page tweaks for popular platforms before capture.""" + u = url.lower() + + def _try_click_texts(texts: List[str], passes: int = 2, per_timeout: int = 700) -> int: + clicks = 0 + for _ in range(max(1, passes)): + for t in texts: + try: + page.locator(f"text=/{t}/i").first.click(timeout=per_timeout) + clicks += 1 + except PlaywrightTimeoutError: + pass + except Exception: + pass + time.sleep(0.1) + return clicks + + # Dismiss common cookie/consent prompts + _try_click_texts(["accept", "i agree", "agree", "got it", "allow all", "consent"]) + + # Platform-specific expansions + if "reddit.com" in u: + _try_click_texts(["see more", "read more", "show more", "more"]) + if ("twitter.com" in u) or ("x.com" in u): + _try_click_texts(["show more", "more"]) + if "instagram.com" in u: + _try_click_texts(["more", "see more"]) + if "tiktok.com" in u: + _try_click_texts(["more", "see more"]) + if ("facebook.com" in u) or ("fb.watch" in u): + _try_click_texts(["see more", "show more", "more"]) + if "rumble.com" in u: + _try_click_texts(["accept", "agree", "close"]) + + +def _submit_wayback(url: str, timeout: float) -> Optional[str]: + """Submit URL to Internet Archive Wayback Machine.""" + encoded = quote(url, safe="/:?=&") + with HTTPClient() as client: + response = client.get(f"https://web.archive.org/save/{encoded}") + response.raise_for_status() + content_location = response.headers.get("Content-Location") + if content_location: + return urljoin("https://web.archive.org", content_location) + return str(response.url) + + +def _submit_archive_today(url: str, timeout: float) -> Optional[str]: + """Submit URL to Archive.today.""" + encoded = quote(url, safe=":/?#[]@!$&'()*+,;=") + with HTTPClient(headers={"User-Agent": USER_AGENT}) as client: + response = client.get(f"https://archive.today/submit/?url={encoded}") + 
response.raise_for_status() + final = str(response.url) + if final and ("archive.today" in final or "archive.ph" in final): + return final + return None + + +def _submit_archive_ph(url: str, timeout: float) -> Optional[str]: + """Submit URL to Archive.ph.""" + encoded = quote(url, safe=":/?#[]@!$&'()*+,;=") + with HTTPClient(headers={"User-Agent": USER_AGENT}) as client: + response = client.get(f"https://archive.ph/submit/?url={encoded}") + response.raise_for_status() + final = str(response.url) + if final and "archive.ph" in final: + return final + return None + + +def _archive_url(url: str, timeout: float) -> Tuple[List[str], List[str]]: + """Submit URL to all available archive services.""" + archives: List[str] = [] + warnings: List[str] = [] + for submitter, label in ( + (_submit_wayback, "wayback"), + (_submit_archive_today, "archive.today"), + (_submit_archive_ph, "archive.ph"), + ): + try: + log(f"Archiving to {label}...", flush=True) + archived = submitter(url, timeout) + except httpx.HTTPStatusError as exc: + if exc.response.status_code == 429: + warnings.append(f"archive {label} rate limited (HTTP 429)") + log(f"{label}: Rate limited (HTTP 429)", flush=True) + else: + warnings.append(f"archive {label} failed: HTTP {exc.response.status_code}") + log(f"{label}: HTTP {exc.response.status_code}", flush=True) + except httpx.RequestError as exc: + warnings.append(f"archive {label} failed: {exc}") + log(f"{label}: Connection error: {exc}", flush=True) + except Exception as exc: + warnings.append(f"archive {label} failed: {exc}") + log(f"{label}: {exc}", flush=True) + else: + if archived: + archives.append(archived) + log(f"{label}: Success - {archived}", flush=True) + else: + log(f"{label}: No archive link returned", flush=True) + return archives, warnings + + +def _prepare_output_path(options: ScreenshotOptions) -> Path: + """Prepare and validate output path for screenshot.""" + _ensure_directory(options.output_dir) + explicit_format = _normalise_format(options.output_format) if options.output_format else None + inferred_format: Optional[str] = None + if options.output_path is not None: + path = options.output_path + if not path.is_absolute(): + path = options.output_dir / path + suffix = path.suffix.lower() + if suffix: + inferred_format = _normalise_format(suffix[1:]) + else: + stamp = time.strftime("%Y%m%d_%H%M%S") + filename = f"{_slugify_url(options.url)}_{stamp}" + path = options.output_dir / filename + final_format = explicit_format or inferred_format or "png" + if not path.suffix: + path = path.with_suffix(_format_suffix(final_format)) + else: + current_suffix = path.suffix.lower() + expected = _format_suffix(final_format) + if current_suffix != expected: + path = path.with_suffix(expected) + options.output_format = final_format + return _unique_path(path) + + +def _capture_with_playwright(options: ScreenshotOptions, destination: Path, warnings: List[str]) -> None: + """Capture screenshot using Playwright.""" + playwright = None + browser = None + context = None + try: + log("Starting Playwright...", flush=True) + playwright = sync_playwright().start() + log("Launching Chromium browser...", flush=True) + format_name = _normalise_format(options.output_format) + headless = options.headless or format_name == "pdf" + if format_name == "pdf" and not options.headless: + warnings.append("pdf output requires headless Chromium; overriding headless mode") + browser = playwright.chromium.launch( + headless=headless, + args=["--disable-blink-features=AutomationControlled"], + ) + 
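The three submitters are fanned out by _archive_url, which never raises: failures and rate limits come back as human-readable notes instead. A short usage sketch (the URL is a placeholder):

archive_links, notes = _archive_url("https://example.com/article", ARCHIVE_TIMEOUT)
# archive_links -> zero or more snapshot URLs (Wayback, archive.today, archive.ph)
# notes         -> warnings for any service that failed or was rate limited (HTTP 429)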
log("Creating browser context...", flush=True) + context = browser.new_context( + user_agent=USER_AGENT, + viewport=DEFAULT_VIEWPORT, + ignore_https_errors=True, + ) + page = context.new_page() + log(f"Navigating to {options.url}...", flush=True) + try: + page.goto(options.url, timeout=90_000, wait_until="domcontentloaded") + log("Page loaded successfully", flush=True) + except PlaywrightTimeoutError: + warnings.append("navigation timeout; capturing current page state") + log("Navigation timeout; proceeding with current state", flush=True) + + # Skip article lookup by default (wait_for_article defaults to False) + if options.wait_for_article: + try: + log("Waiting for article element...", flush=True) + page.wait_for_selector("article", timeout=10_000) + log("Article element found", flush=True) + except PlaywrightTimeoutError: + warnings.append("
selector not found; capturing fallback") + log("Article element not found; using fallback", flush=True) + + if options.wait_after_load > 0: + log(f"Waiting {options.wait_after_load}s for page stabilization...", flush=True) + time.sleep(min(10.0, max(0.0, options.wait_after_load))) + if options.replace_video_posters: + log("Replacing video elements with posters...", flush=True) + page.evaluate( + """ + document.querySelectorAll('video').forEach(v => { + if (v.poster) { + const img = document.createElement('img'); + img.src = v.poster; + img.style.maxWidth = '100%'; + img.style.borderRadius = '12px'; + v.replaceWith(img); + } + }); + """ + ) + # Attempt platform-specific target capture if requested (and not PDF) + element_captured = False + if options.prefer_platform_target and format_name != "pdf": + log("Attempting platform-specific content capture...", flush=True) + try: + _platform_preprocess(options.url, page, warnings) + except Exception: + pass + selectors = list(options.target_selectors or []) + if not selectors: + selectors = _selectors_for_url(options.url) + for sel in selectors: + try: + log(f"Trying selector: {sel}", flush=True) + el = page.wait_for_selector(sel, timeout=max(0, int(options.selector_timeout_ms))) + except PlaywrightTimeoutError: + log(f"Selector not found: {sel}", flush=True) + continue + try: + if el is not None: + log(f"Found element with selector: {sel}", flush=True) + try: + el.scroll_into_view_if_needed(timeout=1000) + except Exception: + pass + log(f"Capturing element to {destination}...", flush=True) + el.screenshot(path=str(destination), type=("jpeg" if format_name == "jpeg" else None)) + element_captured = True + log("Element captured successfully", flush=True) + break + except Exception as exc: + warnings.append(f"element capture failed for '{sel}': {exc}") + log(f"Failed to capture element: {exc}", flush=True) + # Fallback to default capture paths + if element_captured: + pass + elif format_name == "pdf": + log("Generating PDF...", flush=True) + page.emulate_media(media="print") + page.pdf(path=str(destination), print_background=True) + log(f"PDF saved to {destination}", flush=True) + else: + log(f"Capturing full page to {destination}...", flush=True) + screenshot_kwargs: Dict[str, Any] = {"path": str(destination)} + if format_name == "jpeg": + screenshot_kwargs["type"] = "jpeg" + screenshot_kwargs["quality"] = 90 + if options.full_page: + page.screenshot(full_page=True, **screenshot_kwargs) + else: + article = page.query_selector("article") + if article is not None: + article_kwargs = dict(screenshot_kwargs) + article_kwargs.pop("full_page", None) + article.screenshot(**article_kwargs) + else: + page.screenshot(**screenshot_kwargs) + log(f"Screenshot saved to {destination}", flush=True) + except Exception as exc: + raise ScreenshotError(f"Failed to capture screenshot: {exc}") from exc + finally: + log("Cleaning up browser resources...", flush=True) + with contextlib.suppress(Exception): + if context is not None: + context.close() + with contextlib.suppress(Exception): + if browser is not None: + browser.close() + with contextlib.suppress(Exception): + if playwright is not None: + playwright.stop() + log("Cleanup complete", flush=True) + + +def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult: + """Capture a screenshot for the given options.""" + destination = _prepare_output_path(options) + warnings: List[str] = [] + _capture_with_playwright(options, destination, warnings) + + known_urls = _unique_preserve_order([options.url, 
*options.known_urls]) + archive_urls: List[str] = [] + if options.archive: + archives, archive_warnings = _archive_url(options.url, options.archive_timeout) + archive_urls.extend(archives) + warnings.extend(archive_warnings) + if archives: + known_urls = _unique_preserve_order([*known_urls, *archives]) + + applied_tags = _unique_preserve_order(list(tag for tag in options.tags if tag.strip())) + + return ScreenshotResult( + path=destination, + url=options.url, + tags_applied=applied_tags, + archive_urls=archive_urls, + known_urls=known_urls, + warnings=warnings, + ) + + +# ============================================================================ +# Main Cmdlet Function +# ============================================================================ + +def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: + """Take screenshots of URLs in the pipeline. + + Accepts: + - Single result object (dict or PipeObject) with 'file_path' field + - List of result objects to screenshot each + - Direct URL as string + + Emits PipeObject-formatted results for each screenshot with: + - action: 'cmdlet:screen-shot' + - is_temp: True (screenshots are temporary artifacts) + - parent_id: hash of the original file/URL + + Screenshots are created using Playwright and marked as temporary + so they can be cleaned up later with the cleanup cmdlet. + """ + from ._shared import parse_cmdlet_args + + # Help check + try: + if any(str(a).lower() in {"-?", "/?", "--help", "-h", "help", "--cmdlet"} for a in args): + log(json.dumps(CMDLET, ensure_ascii=False, indent=2)) + return 0 + except Exception: + pass + + # ======================================================================== + # ARGUMENT PARSING + # ======================================================================== + + parsed = parse_cmdlet_args(args, CMDLET) + + format_value = parsed.get("format") + storage_value = parsed.get("storage") + selector_arg = parsed.get("selector") + selectors = [selector_arg] if selector_arg else [] + archive_enabled = parsed.get("archive", False) + + # Positional URL argument (if provided) + url_arg = parsed.get("url") + positional_urls = [str(url_arg)] if url_arg else [] + + # ======================================================================== + # INPUT PROCESSING - Extract URLs from pipeline or command arguments + # ======================================================================== + + piped_results = normalize_result_input(result) + urls_to_process = [] + + # Extract URLs from piped results + if piped_results: + for item in piped_results: + url = None + if isinstance(item, dict): + url = item.get('file_path') or item.get('path') or item.get('url') or item.get('target') + else: + url = getattr(item, 'file_path', None) or getattr(item, 'path', None) or getattr(item, 'url', None) or getattr(item, 'target', None) + + if url: + urls_to_process.append(str(url)) + + # Use positional arguments if no pipeline input + if not urls_to_process and positional_urls: + urls_to_process = positional_urls + + if not urls_to_process: + log(f"No URLs to process for screen-shot cmdlet", file=sys.stderr) + return 1 + + # ======================================================================== + # OUTPUT DIRECTORY RESOLUTION - Priority chain + # ======================================================================== + + screenshot_dir: Optional[Path] = None + + # Primary: Use --storage if provided (highest priority) + if storage_value: + try: + screenshot_dir = SharedArgs.resolve_storage(storage_value) + 
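Everything above can also be driven directly, without the cmdlet plumbing that follows. A minimal sketch, assuming the module's helpers are in scope and using placeholder URL/paths:

from pathlib import Path

opts = ScreenshotOptions(
    url="https://example.com/article",        # placeholder URL
    output_dir=Path.home() / "Videos",
    output_format="png",
    archive=False,                             # set True to also submit to the archive services
)
shot = _capture_screenshot(opts)
print(shot.path)            # final .png path, uniquified if it already existed
for note in shot.warnings:  # navigation timeouts, archive failures, etc.
    print("warning:", note)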
log(f"[screen_shot] Using --storage {storage_value}: {screenshot_dir}", flush=True) + except ValueError as e: + log(str(e), file=sys.stderr) + return 1 + + # Secondary: Use config-based resolver ONLY if --storage not provided + if screenshot_dir is None and resolve_output_dir is not None: + try: + screenshot_dir = resolve_output_dir(config) + log(f"[screen_shot] Using config resolver: {screenshot_dir}", flush=True) + except Exception: + pass + + # Tertiary: Use config outfile ONLY if neither --storage nor resolver worked + if screenshot_dir is None and config and config.get("outfile"): + try: + screenshot_dir = Path(config["outfile"]).expanduser() + log(f"[screen_shot] Using config outfile: {screenshot_dir}", flush=True) + except Exception: + pass + + # Default: User's Videos directory + if screenshot_dir is None: + screenshot_dir = Path.home() / "Videos" + log(f"[screen_shot] Using default directory: {screenshot_dir}", flush=True) + + _ensure_directory(screenshot_dir) + + # ======================================================================== + # PREPARE SCREENSHOT OPTIONS + # ======================================================================== + + format_name = _normalise_format(format_value) + filtered_selectors = [str(s).strip() for s in selectors if str(s).strip()] + target_selectors = filtered_selectors if filtered_selectors else None + + all_emitted = [] + exit_code = 0 + # ======================================================================== + # PROCESS URLs AND CAPTURE SCREENSHOTS + # ======================================================================== + + for url in urls_to_process: + # Validate URL format + if not url.lower().startswith(("http://", "https://", "file://")): + log(f"[screen_shot] Skipping non-URL input: {url}", file=sys.stderr) + continue + + try: + # Create screenshot with provided options + options = ScreenshotOptions( + url=url, + output_dir=screenshot_dir, + output_format=format_name, + archive=archive_enabled, + target_selectors=target_selectors, + prefer_platform_target=False, + wait_for_article=False, + full_page=True, + ) + + screenshot_result = _capture_screenshot(options) + + # Log results and warnings + log(f"Screenshot captured to {screenshot_result.path}", flush=True) + if screenshot_result.archive_urls: + log(f"Archives: {', '.join(screenshot_result.archive_urls)}", flush=True) + for warning in screenshot_result.warnings: + log(f"Warning: {warning}", flush=True) + + # Compute hash of screenshot file + screenshot_hash = None + try: + with open(screenshot_result.path, 'rb') as f: + screenshot_hash = hashlib.sha256(f.read()).hexdigest() + except Exception: + pass + + # Create PipeObject result - marked as TEMP since derivative artifact + pipe_obj = create_pipe_object_result( + source='screenshot', + identifier=Path(screenshot_result.path).stem, + file_path=str(screenshot_result.path), + cmdlet_name='screen-shot', + title=f"Screenshot: {Path(screenshot_result.path).name}", + file_hash=screenshot_hash, + is_temp=True, + parent_hash=hashlib.sha256(url.encode()).hexdigest(), + extra={ + 'source_url': url, + 'archive_urls': screenshot_result.archive_urls, + 'known_urls': screenshot_result.known_urls, + 'target': str(screenshot_result.path), # Explicit target for add-file + } + ) + + # Emit the result so downstream cmdlets (like add-file) can use it + pipeline_context.emit(pipe_obj) + all_emitted.append(pipe_obj) + + except ScreenshotError as exc: + log(f"Error taking screenshot of {url}: {exc}", file=sys.stderr) + exit_code = 1 + except 
Exception as exc: + log(f"Unexpected error taking screenshot of {url}: {exc}", file=sys.stderr) + import traceback + traceback.print_exc(file=sys.stderr) + exit_code = 1 + + if not all_emitted: + log(f"No screenshots were successfully captured", file=sys.stderr) + return 1 + + # Log completion message + log(f"✓ Successfully captured {len(all_emitted)} screenshot(s)", flush=True) + + return exit_code +CMDLET = Cmdlet( + name="screen-shot", + summary="Capture a screenshot of a URL or file and mark as temporary artifact", + usage="screen-shot [options] or download-data | screen-shot [options]", + aliases=["screenshot", "ss"], + args=[ + CmdletArg(name="url", type="string", required=False, description="URL to screenshot (or from pipeline)"), + CmdletArg(name="format", type="string", description="Output format: png, jpeg, or pdf"), + CmdletArg(name="selector", type="string", description="CSS selector for element capture"), + SharedArgs.ARCHIVE, # Use shared archive argument + SharedArgs.STORAGE, # Use shared storage argument + ], + details=[ + "Take screenshots of URLs with optional archiving and element targeting.", + "Screenshots are marked as temporary artifacts for cleanup by the cleanup cmdlet.", + "", + "Arguments:", + " url URL to capture (optional if piped from pipeline)", + " --format FORMAT Output format: png (default), jpeg, or pdf", + " --selector SEL CSS selector for capturing specific element", + " --archive, -arch Archive URL to Wayback/Archive.today/Archive.ph", + " --storage LOCATION Storage destination: hydrus, local, 0x0, debrid, or ftp", + "", + "Examples:", + " download-data https://example.com | screen-shot --storage local", + " download-data https://twitter.com/user/status/123 | screen-shot --selector 'article[role=article]' --storage hydrus --archive", + " screen-shot https://example.com --format jpeg --storage 0x0 --archive", + ] +) diff --git a/cmdlets/search_file.py b/cmdlets/search_file.py new file mode 100644 index 0000000..54f03a5 --- /dev/null +++ b/cmdlets/search_file.py @@ -0,0 +1,351 @@ +"""Search-file cmdlet: Search for files by query, tags, size, type, duration, etc.""" +from __future__ import annotations + +from typing import Any, Dict, Sequence, List, Optional, Tuple, Callable +from fnmatch import fnmatchcase +from pathlib import Path +from dataclasses import dataclass, field +import json +import os +import sys + +from helper.logger import log, debug +import shutil +import subprocess + +from helper.file_storage import FileStorage +from helper.search_provider import get_provider, list_providers, SearchResult +from metadata import import_pending_sidecars + +from . 
import register +from ._shared import Cmdlet, CmdletArg +import models +import pipeline as ctx + +# Optional dependencies +try: + import mutagen # type: ignore +except ImportError: # pragma: no cover + mutagen = None # type: ignore + +try: + from config import get_hydrus_url, resolve_output_dir +except Exception: # pragma: no cover + get_hydrus_url = None # type: ignore + resolve_output_dir = None # type: ignore + +try: + from helper.hydrus import HydrusClient, HydrusRequestError +except ImportError: # pragma: no cover + HydrusClient = None # type: ignore + HydrusRequestError = RuntimeError # type: ignore + +try: + from helper.utils import sha256_file +except ImportError: # pragma: no cover + sha256_file = None # type: ignore + +try: + from helper.utils_constant import mime_maps +except ImportError: # pragma: no cover + mime_maps = {} # type: ignore + + +# ============================================================================ +# Data Classes (from helper/search.py) +# ============================================================================ + +@dataclass(slots=True) +class SearchRecord: + path: str + size_bytes: int | None = None + duration_seconds: str | None = None + tags: str | None = None + hash_hex: str | None = None + + def as_dict(self) -> dict[str, str]: + payload: dict[str, str] = {"path": self.path} + if self.size_bytes is not None: + payload["size"] = str(self.size_bytes) + if self.duration_seconds: + payload["duration"] = self.duration_seconds + if self.tags: + payload["tags"] = self.tags + if self.hash_hex: + payload["hash"] = self.hash_hex + return payload + + +@dataclass +class ResultItem: + origin: str + title: str + detail: str + annotations: List[str] + target: str + media_kind: str = "other" + hash_hex: Optional[str] = None + columns: List[tuple[str, str]] = field(default_factory=list) + tag_summary: Optional[str] = None + duration_seconds: Optional[float] = None + size_bytes: Optional[int] = None + full_metadata: Optional[Dict[str, Any]] = None + tags: Optional[set[str]] = field(default_factory=set) + relationships: Optional[List[str]] = field(default_factory=list) + known_urls: Optional[List[str]] = field(default_factory=list) + + def to_dict(self) -> Dict[str, Any]: + payload: Dict[str, Any] = { + "title": self.title, + } + + # Always include these core fields for downstream cmdlets (get-file, download-data, etc) + payload["origin"] = self.origin + payload["target"] = self.target + payload["media_kind"] = self.media_kind + + # Always include full_metadata if present (needed by download-data, etc) + # This is NOT for display, but for downstream processing + if self.full_metadata: + payload["full_metadata"] = self.full_metadata + + # Include columns if defined (result renderer will use these for display) + if self.columns: + payload["columns"] = list(self.columns) + else: + # If no columns, include the detail for backwards compatibility + payload["detail"] = self.detail + payload["annotations"] = list(self.annotations) + + # Include optional fields + if self.hash_hex: + payload["hash"] = self.hash_hex + if self.tag_summary: + payload["tags"] = self.tag_summary + if self.tags: + payload["tags_set"] = list(self.tags) + if self.relationships: + payload["relationships"] = self.relationships + if self.known_urls: + payload["known_urls"] = self.known_urls + return payload + + +STORAGE_ORIGINS = {"local", "hydrus", "debrid"} + + +def _ensure_storage_columns(payload: Dict[str, Any]) -> Dict[str, Any]: + """Attach Title/Store columns for storage-origin results to keep 
CLI display compact.""" + origin_value = str(payload.get("origin") or payload.get("source") or "").lower() + if origin_value not in STORAGE_ORIGINS: + return payload + title = payload.get("title") or payload.get("name") or payload.get("target") or payload.get("path") or "Result" + store_label = payload.get("origin") or payload.get("source") or origin_value + normalized = dict(payload) + normalized["columns"] = [("Title", str(title)), ("Store", str(store_label))] + return normalized + + +CMDLET = Cmdlet( + name="search-file", + summary="Unified search cmdlet for searchable backends (Hydrus, Local, Debrid, LibGen, OpenLibrary, Soulseek).", + usage="search-file [query] [-tag TAG] [-size >100MB|<50MB] [-type audio|video|image] [-duration >10:00] [-storage BACKEND] [-provider PROVIDER]", + args=[ + CmdletArg("query", description="Search query string"), + CmdletArg("tag", description="Filter by tag (can be used multiple times)"), + CmdletArg("size", description="Filter by size: >100MB, <50MB, =10MB"), + CmdletArg("type", description="Filter by type: audio, video, image, document"), + CmdletArg("duration", description="Filter by duration: >10:00, <1:30:00"), + CmdletArg("limit", type="integer", description="Limit results (default: 100)"), + CmdletArg("storage", description="Search storage backend: hydrus, local, debrid (default: all searchable)"), + CmdletArg("provider", description="Search provider: libgen, openlibrary, soulseek, debrid, local (overrides -storage)"), + ], + details=[ + "Search across multiple providers: File storage (Hydrus, Local, Debrid), Books (LibGen, OpenLibrary), Music (Soulseek)", + "Use -provider to search a specific source, or -storage to search file backends", + "Filter results by: tag, size, type, duration", + "Results can be piped to other commands", + "Examples:", + "search-file foo # Search all file backends", + "search-file -provider libgen 'python programming' # Search LibGen books", + "search-file -provider debrid 'movie' # Search AllDebrid magnets", + "search-file 'music' -provider soulseek # Search Soulseek P2P", + "search-file -provider openlibrary 'tolkien' # Search OpenLibrary", + "search-file song -storage hydrus -type audio # Search only Hydrus audio", + "search-file movie -tag action -provider debrid # Debrid with filters", + ], +) + + +@register(["search-file", "search"]) +def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: + """Search across multiple providers: Hydrus, Local, Debrid, LibGen, etc.""" + args_list = [str(arg) for arg in (args or [])] + + # Parse arguments + query = "" + tag_filters: List[str] = [] + size_filter: Optional[Tuple[str, int]] = None + duration_filter: Optional[Tuple[str, float]] = None + type_filter: Optional[str] = None + storage_backend: Optional[str] = None + provider_name: Optional[str] = None + limit = 100 + + # Simple argument parsing + i = 0 + while i < len(args_list): + arg = args_list[i] + low = arg.lower() + + if low in {"-provider", "--provider"} and i + 1 < len(args_list): + provider_name = args_list[i + 1].lower() + i += 2 + elif low in {"-storage", "--storage"} and i + 1 < len(args_list): + storage_backend = args_list[i + 1].lower() + i += 2 + elif low in {"-tag", "--tag"} and i + 1 < len(args_list): + tag_filters.append(args_list[i + 1]) + i += 2 + elif low in {"-limit", "--limit"} and i + 1 < len(args_list): + try: + limit = int(args_list[i + 1]) + except ValueError: + limit = 100 + i += 2 + elif low in {"-type", "--type"} and i + 1 < len(args_list): + type_filter = args_list[i + 
1].lower() + i += 2 + elif not query and not arg.startswith("-"): + query = arg + i += 1 + else: + i += 1 + + if not query: + log("Provide a search query", file=sys.stderr) + return 1 + + # Initialize worker for this search command + from helper.local_library import LocalLibraryDB + from config import get_local_storage_path + import uuid + worker_id = str(uuid.uuid4()) + library_root = get_local_storage_path(config or {}) + if not library_root: + log("No library root configured", file=sys.stderr) + return 1 + db = LocalLibraryDB(library_root) + db.insert_worker( + worker_id, + "search", + title=f"Search: {query}", + description=f"Query: {query}", + pipe=ctx.get_current_command_text() + ) + + try: + results_list = [] + + # Try to search using provider (libgen, soulseek, debrid, openlibrary) + if provider_name: + debug(f"[search_file] Attempting provider search with: {provider_name}") + provider = get_provider(provider_name, config) + if not provider: + log(f"Provider '{provider_name}' not available", file=sys.stderr) + db.update_worker_status(worker_id, 'error') + return 1 + + debug(f"[search_file] Provider loaded, calling search with query: {query}") + search_result = provider.search(query, limit=limit) + debug(f"[search_file] Provider search returned {len(search_result)} results") + + for item in search_result: + item_dict = item.to_dict() + results_list.append(item_dict) + ctx.emit(item_dict) + + debug(f"[search_file] Emitted {len(results_list)} results") + + # Write results to worker stdout + db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2)) + db.update_worker_status(worker_id, 'completed') + return 0 + + # Otherwise search using FileStorage (Hydrus, Local, Debrid backends) + from helper.file_storage import FileStorage + storage = FileStorage(config=config or {}) + + backend_to_search = storage_backend or None + if backend_to_search: + # Check if requested backend is available + if backend_to_search == "hydrus": + from helper.hydrus import is_hydrus_available + if not is_hydrus_available(config or {}): + log(f"Backend 'hydrus' is not available (Hydrus service not running)", file=sys.stderr) + db.update_worker_status(worker_id, 'error') + return 1 + if not storage.supports_search(backend_to_search): + log(f"Backend '{backend_to_search}' does not support searching", file=sys.stderr) + db.update_worker_status(worker_id, 'error') + return 1 + results = storage[backend_to_search].search(query, limit=limit) + else: + # Search all searchable backends, but skip hydrus if unavailable + from helper.hydrus import is_hydrus_available + hydrus_available = is_hydrus_available(config or {}) + + all_results = [] + for backend_name in storage.list_searchable_backends(): + # Skip hydrus if not available + if backend_name == "hydrus" and not hydrus_available: + continue + try: + backend_results = storage[backend_name].search(query, limit=limit - len(all_results)) + if backend_results: + all_results.extend(backend_results) + if len(all_results) >= limit: + break + except Exception as exc: + log(f"Backend {backend_name} search failed: {exc}", file=sys.stderr) + results = all_results[:limit] + + # Emit results and collect for workers table + if results: + for item in results: + if isinstance(item, dict): + normalized = _ensure_storage_columns(item) + results_list.append(normalized) + ctx.emit(normalized) + elif isinstance(item, ResultItem): + item_dict = item.to_dict() + results_list.append(item_dict) + ctx.emit(item_dict) + else: + item_dict = {"title": str(item)} + 
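The emit loop below normalises three shapes: ready-made dicts, ResultItem objects, and bare strings. A small sketch of what the first two become (values are illustrative):

item = ResultItem(
    origin="local",
    title="Example Song",
    detail="Local file",
    annotations=[],
    target="/music/example.mp3",   # placeholder path
    media_kind="audio",
)
item.to_dict()
# -> {"title": "Example Song", "origin": "local", "target": "/music/example.mp3",
#     "media_kind": "audio", "detail": "Local file", "annotations": []}

_ensure_storage_columns({"origin": "hydrus", "title": "Example Song"})
# -> same dict plus columns [("Title", "Example Song"), ("Store", "hydrus")] for compact display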
results_list.append(item_dict) + ctx.emit(item_dict) + + # Write results to worker stdout + db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2)) + else: + log("No results found", file=sys.stderr) + db.append_worker_stdout(worker_id, json.dumps([], indent=2)) + + db.update_worker_status(worker_id, 'completed') + return 0 + + except Exception as exc: + log(f"Search failed: {exc}", file=sys.stderr) + import traceback + traceback.print_exc(file=sys.stderr) + db.update_worker_status(worker_id, 'error') + return 1 + + finally: + # Always close the database connection + try: + db.close() + except Exception: + pass + diff --git a/cmdlets/worker.py b/cmdlets/worker.py new file mode 100644 index 0000000..83653e6 --- /dev/null +++ b/cmdlets/worker.py @@ -0,0 +1,325 @@ +"""Worker cmdlet: Display workers table in ResultTable format.""" +from __future__ import annotations + +from typing import Any, Dict, Sequence, List +import json +import sys +from datetime import datetime, timezone + +from . import register +from ._shared import Cmdlet, CmdletArg +import pipeline as ctx +from helper.logger import log +from config import get_local_storage_path + + +CMDLET = Cmdlet( + name=".worker", + summary="Display workers table in result table format.", + usage=".worker [status] [-limit N] [@N]", + args=[ + CmdletArg("status", description="Filter by status: running, completed, error (default: all)"), + CmdletArg("limit", type="integer", description="Limit results (default: 100)"), + CmdletArg("@N", description="Select worker by index (1-based) and display full logs"), + ], + details=[ + "- Shows all background worker tasks and their output", + "- Can filter by status: running, completed, error", + "- Search result stdout is captured from each worker", + "- Use @N to select a specific worker by index and display its full logs", + "Examples:", + ".worker # Show all workers", + ".worker running # Show running workers only", + ".worker completed -limit 50 # Show 50 most recent completed workers", + ".worker @3 # Show full logs for the 3rd worker", + ".worker running @2 # Show full logs for the 2nd running worker", + ], +) + + +@register([".worker", "worker", "workers"]) +def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: + """Display workers table or show detailed logs for a specific worker.""" + args_list = [str(arg) for arg in (args or [])] + selection_indices = ctx.get_last_selection() + selection_requested = bool(selection_indices) and isinstance(result, list) and len(result) > 0 + + # Parse arguments for list view + status_filter: str | None = None + limit = 100 + clear_requested = False + worker_id_arg: str | None = None + i = 0 + while i < len(args_list): + arg = args_list[i] + low = arg.lower() + if low in {"-limit", "--limit"} and i + 1 < len(args_list): + try: + limit = max(1, int(args_list[i + 1])) + except ValueError: + limit = 100 + i += 2 + elif low in {"-id", "--id"} and i + 1 < len(args_list): + worker_id_arg = args_list[i + 1] + i += 2 + elif low in {"-clear", "--clear"}: + clear_requested = True + i += 1 + elif low in {"running", "completed", "error", "cancelled"}: + status_filter = low + i += 1 + elif not arg.startswith("-"): + status_filter = low + i += 1 + else: + i += 1 + + try: + if any(str(a).lower() in {"-?", "/?", "--help", "-h", "help", "--cmdlet"} for a in args): + log(json.dumps(CMDLET, ensure_ascii=False, indent=2)) + return 0 + except Exception: + pass + + library_root = get_local_storage_path(config or {}) + if not library_root: + log("No library 
root configured", file=sys.stderr) + return 1 + + try: + from helper.local_library import LocalLibraryDB + db: LocalLibraryDB | None = None + try: + db = LocalLibraryDB(library_root) + if clear_requested: + count = db.clear_finished_workers() + log(f"Cleared {count} finished workers.") + return 0 + + if worker_id_arg: + worker = db.get_worker(worker_id_arg) + if worker: + events = [] + try: + wid = worker.get("worker_id") + if wid and hasattr(db, "get_worker_events"): + events = db.get_worker_events(wid) + except Exception: + pass + _emit_worker_detail(worker, events) + return 0 + else: + log(f"Worker not found: {worker_id_arg}", file=sys.stderr) + return 1 + + if selection_requested: + return _render_worker_selection(db, result) + return _render_worker_list(db, status_filter, limit) + finally: + if db: + db.close() + except Exception as exc: + log(f"Workers query failed: {exc}", file=sys.stderr) + import traceback + traceback.print_exc(file=sys.stderr) + return 1 + + +def _render_worker_list(db, status_filter: str | None, limit: int) -> int: + workers = db.get_all_workers(limit=limit) + if status_filter: + workers = [w for w in workers if str(w.get("status", "")).lower() == status_filter] + + if not workers: + log("No workers found", file=sys.stderr) + return 0 + + for worker in workers: + started = worker.get("started_at", "") + ended = worker.get("completed_at", worker.get("last_updated", "")) + + date_str = _extract_date(started) + start_time = _format_event_timestamp(started) + end_time = _format_event_timestamp(ended) + + item = { + "columns": [ + ("Status", worker.get("status", "")), + ("Pipe", _summarize_pipe(worker.get("pipe"))), + ("Date", date_str), + ("Start Time", start_time), + ("End Time", end_time), + ], + "__worker_metadata": worker, + "_selection_args": ["-id", worker.get("worker_id")] + } + ctx.emit(item) + return 0 + + +def _render_worker_selection(db, selected_items: Any) -> int: + if not isinstance(selected_items, list): + log("Selection payload missing", file=sys.stderr) + return 1 + + emitted = False + for item in selected_items: + worker = _resolve_worker_record(db, item) + if not worker: + continue + events = [] + try: + events = db.get_worker_events(worker.get("worker_id")) if hasattr(db, "get_worker_events") else [] + except Exception: + events = [] + _emit_worker_detail(worker, events) + emitted = True + if not emitted: + log("Selected rows no longer exist", file=sys.stderr) + return 1 + return 0 + + +def _resolve_worker_record(db, payload: Any) -> Dict[str, Any] | None: + if not isinstance(payload, dict): + return None + worker_data = payload.get("__worker_metadata") + worker_id = None + if isinstance(worker_data, dict): + worker_id = worker_data.get("worker_id") + else: + worker_id = payload.get("worker_id") + worker_data = None + if worker_id: + fresh = db.get_worker(worker_id) + if fresh: + return fresh + return worker_data if isinstance(worker_data, dict) else None + + +def _emit_worker_detail(worker: Dict[str, Any], events: List[Dict[str, Any]]) -> None: + # Parse stdout logs into rows + stdout_content = worker.get("stdout", "") or "" + + # Try to parse lines if they follow the standard log format + # Format: YYYY-MM-DD HH:MM:SS - name - level - message + lines = stdout_content.splitlines() + + for line in lines: + line = line.strip() + if not line: + continue + + # Default values + timestamp = "" + level = "INFO" + message = line + + # Try to parse standard format + try: + parts = line.split(" - ", 3) + if len(parts) >= 4: + # Full format + ts_str, _, 
lvl, msg = parts + timestamp = _format_event_timestamp(ts_str) + level = lvl + message = msg + elif len(parts) == 3: + # Missing name or level + ts_str, lvl, msg = parts + timestamp = _format_event_timestamp(ts_str) + level = lvl + message = msg + except Exception: + pass + + item = { + "columns": [ + ("Time", timestamp), + ("Level", level), + ("Message", message) + ] + } + ctx.emit(item) + + # Also emit events if available and not redundant + # (For now, just focusing on stdout logs as requested) + + +def _summarize_pipe(pipe_value: Any, limit: int = 60) -> str: + text = str(pipe_value or "").strip() + if not text: + return "(none)" + return text if len(text) <= limit else text[: limit - 3] + "..." + + +def _format_event_timestamp(raw_timestamp: Any) -> str: + dt = _parse_to_local(raw_timestamp) + if dt: + return dt.strftime("%H:%M:%S") + + if not raw_timestamp: + return "--:--:--" + text = str(raw_timestamp) + if "T" in text: + time_part = text.split("T", 1)[1] + elif " " in text: + time_part = text.split(" ", 1)[1] + else: + time_part = text + return time_part[:8] if len(time_part) >= 8 else time_part + + +def _parse_to_local(timestamp_str: Any) -> datetime | None: + if not timestamp_str: + return None + text = str(timestamp_str).strip() + if not text: + return None + + try: + # Check for T separator (Python isoformat - Local time) + if 'T' in text: + return datetime.fromisoformat(text) + + # Check for space separator (SQLite CURRENT_TIMESTAMP - UTC) + # Format: YYYY-MM-DD HH:MM:SS + if ' ' in text: + # Assume UTC + dt = datetime.strptime(text, "%Y-%m-%d %H:%M:%S") + dt = dt.replace(tzinfo=timezone.utc) + return dt.astimezone() # Convert to local + + except Exception: + pass + + return None + + +def _extract_date(raw_timestamp: Any) -> str: + dt = _parse_to_local(raw_timestamp) + if dt: + return dt.strftime("%m-%d-%y") + + # Fallback + if not raw_timestamp: + return "" + text = str(raw_timestamp) + # Extract YYYY-MM-DD part + date_part = "" + if "T" in text: + date_part = text.split("T", 1)[0] + elif " " in text: + date_part = text.split(" ", 1)[0] + else: + date_part = text + + # Convert YYYY-MM-DD to MM-DD-YY + try: + parts = date_part.split("-") + if len(parts) == 3: + year, month, day = parts + return f"{month}-{day}-{year[2:]}" + except Exception: + pass + return date_part \ No newline at end of file diff --git a/config.py b/config.py new file mode 100644 index 0000000..27f649c --- /dev/null +++ b/config.py @@ -0,0 +1,360 @@ + +"""Unified configuration helpers for downlow.""" +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any, Dict, Optional +from pathlib import Path +from helper.logger import log + +DEFAULT_CONFIG_FILENAME = "config.json" +SCRIPT_DIR = Path(__file__).resolve().parent + +_CONFIG_CACHE: Dict[str, Dict[str, Any]] = {} + + +def _make_cache_key(config_dir: Optional[Path], filename: str, actual_path: Optional[Path]) -> str: + if actual_path: + return str(actual_path.resolve()) + base_dir = (config_dir or SCRIPT_DIR) + return str((base_dir / filename).resolve()) + + + +def get_hydrus_instance(config: Dict[str, Any], instance_name: str = "home") -> Optional[Dict[str, Any]]: + """Get a specific Hydrus instance config by name. 
+ + Supports both formats: + - New: config["storage"]["hydrus"][instance_name] = {"key": "...", "url": "..."} + - Old: config["HydrusNetwork"][instance_name] = {"key": "...", "url": "..."} + + Args: + config: Configuration dict + instance_name: Name of the Hydrus instance (default: "home") + + Returns: + Dict with "key" and "url" keys, or None if not found + """ + # Try new format first + storage = config.get("storage", {}) + if isinstance(storage, dict): + hydrus_config = storage.get("hydrus", {}) + if isinstance(hydrus_config, dict): + instance = hydrus_config.get(instance_name) + if isinstance(instance, dict): + return instance + + # Fall back to old format + hydrus_network = config.get("HydrusNetwork") + if not isinstance(hydrus_network, dict): + return None + + instance = hydrus_network.get(instance_name) + if isinstance(instance, dict): + return instance + + return None + + +def get_hydrus_access_key(config: Dict[str, Any], instance_name: str = "home") -> Optional[str]: + """Get Hydrus access key for an instance. + + Supports both old flat format and new nested format: + - Old: config["HydrusNetwork_Access_Key"] + - New: config["HydrusNetwork"][instance_name]["key"] + + Args: + config: Configuration dict + instance_name: Name of the Hydrus instance (default: "home") + + Returns: + Access key string, or None if not found + """ + instance = get_hydrus_instance(config, instance_name) + key = instance.get("key") if instance else config.get("HydrusNetwork_Access_Key") + return str(key).strip() if key else None + + +def get_hydrus_url(config: Dict[str, Any], instance_name: str = "home") -> Optional[str]: + """Get Hydrus URL for an instance. + + Supports both old flat format and new nested format: + - Old: config["HydrusNetwork_URL"] or constructed from IP/Port/HTTPS + - New: config["HydrusNetwork"][instance_name]["url"] + + Args: + config: Configuration dict + instance_name: Name of the Hydrus instance (default: "home") + + Returns: + URL string, or None if not found + """ + instance = get_hydrus_instance(config, instance_name) + url = instance.get("url") if instance else config.get("HydrusNetwork_URL") + if url: # Check if not None and not empty + return str(url).strip() + # Build from IP/Port/HTTPS if not found + host = str(config.get("HydrusNetwork_IP") or "localhost").strip() or "localhost" + port = str(config.get("HydrusNetwork_Port") or "45869").strip() + scheme = "https" if str(config.get("HydrusNetwork_Use_HTTPS") or "").strip().lower() in {"1", "true", "yes", "on"} else "http" + authority = host if not (":" in host and not host.startswith("[")) else f"[{host}]" + return f"{scheme}://{authority}:{port}" + + + +def resolve_output_dir(config: Dict[str, Any]) -> Path: + """Resolve output directory from config with single source of truth. + + Priority: + 1. config["temp"] - explicitly set temp/output directory + 2. config["outfile"] - fallback to outfile setting + 3. 
Home/Videos - safe user directory fallback + + Returns: + Path to output directory + """ + # First try explicit temp setting from config + temp_value = config.get("temp") + if temp_value: + try: + path = Path(str(temp_value)).expanduser() + # Verify we can access it (not a system directory with permission issues) + if path.exists() or path.parent.exists(): + return path + except Exception: + pass + + # Then try outfile setting + outfile_value = config.get("outfile") + if outfile_value: + try: + return Path(str(outfile_value)).expanduser() + except Exception: + pass + + # Fallback to user's Videos directory + return Path.home() / "Videos" + + +def get_local_storage_path(config: Dict[str, Any]) -> Optional[Path]: + """Get local storage path from config. + + Supports both formats: + - New: config["storage"]["local"]["path"] + - Old: config["Local"]["path"] + + Args: + config: Configuration dict + + Returns: + Path object if found, None otherwise + """ + # Try new format first + storage = config.get("storage", {}) + if isinstance(storage, dict): + local_config = storage.get("local", {}) + if isinstance(local_config, dict): + path_str = local_config.get("path") + if path_str: + return Path(str(path_str)).expanduser() + + # Fall back to old format + local_config = config.get("Local", {}) + if isinstance(local_config, dict): + path_str = local_config.get("path") + if path_str: + return Path(str(path_str)).expanduser() + + return None + + +def get_debrid_api_key(config: Dict[str, Any], service: str = "All-debrid") -> Optional[str]: + """Get Debrid API key from config. + + Supports both formats: + - New: config["storage"]["debrid"]["All-debrid"] + - Old: config["Debrid"]["All-debrid"] + + Args: + config: Configuration dict + service: Service name (default: "All-debrid") + + Returns: + API key string if found, None otherwise + """ + # Try new format first + storage = config.get("storage", {}) + if isinstance(storage, dict): + debrid_config = storage.get("debrid", {}) + if isinstance(debrid_config, dict): + api_key = debrid_config.get(service) + if api_key: # Check if not None and not empty + return str(api_key).strip() if api_key else None + + # Fall back to old format + debrid_config = config.get("Debrid", {}) + if isinstance(debrid_config, dict): + api_key = debrid_config.get(service) + if api_key: # Check if not None and not empty + return str(api_key).strip() if api_key else None + + return None + + +def get_provider_credentials(config: Dict[str, Any], provider: str) -> Optional[Dict[str, str]]: + """Get provider credentials (email/password) from config. 
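+
+    A hedged usage sketch (the provider name is just an example):
+
+        creds = get_provider_credentials(load_config(), "openlibrary") or {}
+        email, password = creds.get("email"), creds.get("password")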
+ + Supports both formats: + - New: config["provider"][provider] = {"email": "...", "password": "..."} + - Old: config[provider.capitalize()] = {"email": "...", "password": "..."} + + Args: + config: Configuration dict + provider: Provider name (e.g., "openlibrary", "soulseek") + + Returns: + Dict with credentials if found, None otherwise + """ + # Try new format first + provider_config = config.get("provider", {}) + if isinstance(provider_config, dict): + creds = provider_config.get(provider.lower(), {}) + if isinstance(creds, dict) and creds: + return creds + + # Fall back to old format (capitalized key) + old_key_map = { + "openlibrary": "OpenLibrary", + "archive": "Archive", + "soulseek": "Soulseek", + } + old_key = old_key_map.get(provider.lower()) + if old_key: + creds = config.get(old_key, {}) + if isinstance(creds, dict) and creds: + return creds + + return None + + +def resolve_cookies_path(config: Dict[str, Any], script_dir: Optional[Path] = None) -> Optional[Path]: + value = config.get("cookies") or config.get("Cookies_Path") + if value: + candidate = Path(str(value)).expanduser() + if candidate.is_file(): + return candidate + base_dir = script_dir or SCRIPT_DIR + default_path = base_dir / "cookies.txt" + if default_path.is_file(): + return default_path + return None + +def resolve_debug_log(config: Dict[str, Any]) -> Optional[Path]: + value = config.get("download_debug_log") + if not value: + return None + path = Path(str(value)).expanduser() + if not path.is_absolute(): + path = Path.cwd() / path + return path + +def load_config(config_dir: Optional[Path] = None, filename: str = DEFAULT_CONFIG_FILENAME) -> Dict[str, Any]: + base_dir = config_dir or SCRIPT_DIR + config_path = base_dir / filename + cache_key = _make_cache_key(config_dir, filename, config_path) + if cache_key in _CONFIG_CACHE: + return _CONFIG_CACHE[cache_key] + + try: + raw = config_path.read_text(encoding="utf-8") + except FileNotFoundError: + # Try alternate filename if default not found + if filename == DEFAULT_CONFIG_FILENAME: + alt_path = base_dir / "downlow.json" + try: + raw = alt_path.read_text(encoding="utf-8") + config_path = alt_path + cache_key = _make_cache_key(config_dir, filename, alt_path) + except FileNotFoundError: + _CONFIG_CACHE[cache_key] = {} + return {} + except OSError as exc: + log(f"Failed to read {alt_path}: {exc}") + _CONFIG_CACHE[cache_key] = {} + return {} + else: + _CONFIG_CACHE[cache_key] = {} + return {} + except OSError as exc: + log(f"Failed to read {config_path}: {exc}") + _CONFIG_CACHE[cache_key] = {} + return {} + + raw = raw.strip() + if not raw: + _CONFIG_CACHE[cache_key] = {} + return {} + try: + data = json.loads(raw) + except json.JSONDecodeError as exc: + log(f"Invalid JSON in {config_path}: {exc}") + _CONFIG_CACHE[cache_key] = {} + return {} + if not isinstance(data, dict): + log(f"Expected object in {config_path}, got {type(data).__name__}") + _CONFIG_CACHE[cache_key] = {} + return {} + + _CONFIG_CACHE[cache_key] = data + return data + + +def reload_config(config_dir: Optional[Path] = None, filename: str = DEFAULT_CONFIG_FILENAME) -> Dict[str, Any]: + cache_key = _make_cache_key(config_dir, filename, None) + _CONFIG_CACHE.pop(cache_key, None) + return load_config(config_dir=config_dir, filename=filename) + + +def clear_config_cache() -> None: + _CONFIG_CACHE.clear() + +def save_config( + config: Dict[str, Any], + config_dir: Optional[Path] = None, + filename: str = DEFAULT_CONFIG_FILENAME, +) -> None: + base_dir = config_dir or SCRIPT_DIR + config_path = base_dir / 
filename + + # Load existing config to preserve keys that aren't being changed + try: + existing_raw = config_path.read_text(encoding="utf-8") + existing_data = json.loads(existing_raw.strip()) + if isinstance(existing_data, dict): + # Merge: existing config as base, then overlay with new config + merged = existing_data.copy() + merged.update(config) + config = merged + except (FileNotFoundError, OSError, json.JSONDecodeError): + # File doesn't exist or is invalid, use provided config as-is + pass + + try: + config_path.write_text( + json.dumps(config, ensure_ascii=False, indent=2, sort_keys=True) + "\n", + encoding="utf-8", + ) + except OSError as exc: + raise RuntimeError(f"Failed to write config to {config_path}: {exc}") from exc + + cache_key = _make_cache_key(config_dir, filename, config_path) + _CONFIG_CACHE[cache_key] = config + +def load() -> Dict[str, Any]: + """Return the parsed downlow configuration.""" + return load_config() + +def save(config: Dict[str, Any]) -> None: + """Persist *config* back to disk.""" + save_config(config) diff --git a/helper/__init__.py b/helper/__init__.py new file mode 100644 index 0000000..ffe9ab4 --- /dev/null +++ b/helper/__init__.py @@ -0,0 +1,92 @@ +"""Helper modules for the downlow mpv integration.""" +from . import hydrus as _hydrus +from . import download as _download +from . import tasks as _tasks +from . import utils as _utils + +try: # Optional dependency on Playwright + from . import webshot as _webshot +except Exception as exc: # pragma: no cover - surfaced when Playwright is missing + _webshot = None # type: ignore + ScreenshotError = None # type: ignore[assignment] + ScreenshotOptions = None # type: ignore[assignment] + ScreenshotResult = None # type: ignore[assignment] + capture_screenshot = None # type: ignore[assignment] + ScreenshotImportError = exc # type: ignore[assignment] +else: + ScreenshotError = _webshot.ScreenshotError + ScreenshotOptions = _webshot.ScreenshotOptions + ScreenshotResult = _webshot.ScreenshotResult + capture_screenshot = _webshot.capture_screenshot + ScreenshotImportError = None +# CBOR utilities +decode_cbor = _utils.decode_cbor +jsonify = _utils.jsonify +# General utilities +CHUNK_SIZE = _utils.CHUNK_SIZE +ensure_directory = _utils.ensure_directory +unique_path = _utils.unique_path +download_hydrus_file = _hydrus.download_hydrus_file +sanitize_metadata_value = _utils.sanitize_metadata_value +unique_preserve_order = _utils.unique_preserve_order +sha256_file = _utils.sha256_file +create_metadata_sidecar = _utils.create_metadata_sidecar +create_tags_sidecar = _utils.create_tags_sidecar +# Format utilities +format_bytes = _utils.format_bytes +format_duration = _utils.format_duration +format_timestamp = _utils.format_timestamp +format_metadata_value = _utils.format_metadata_value +# Link utilities +extract_link = _utils.extract_link +extract_link_from_args = _utils.extract_link_from_args +extract_link_from_result = _utils.extract_link_from_result +get_api_key = _utils.get_api_key +add_direct_link_to_result = _utils.add_direct_link_to_result +# URL policy utilities +resolve_url_policy = _utils.resolve_url_policy +UrlPolicy = _utils.UrlPolicy +# Download utilities +DownloadOptions = _download.DownloadOptions +DownloadError = _download.DownloadError +DownloadMediaResult = _download.DownloadMediaResult +download_media = _download.download_media +is_url_supported_by_ytdlp = _download.is_url_supported_by_ytdlp +probe_url = _download.probe_url +# Hydrus utilities +hydrus_request = _hydrus.hydrus_request +hydrus_export = 
_hydrus.hydrus_export +HydrusClient = _hydrus.HydrusClient +HydrusRequestError = _hydrus.HydrusRequestError +connect_ipc = _tasks.connect_ipc +ipc_sender = _tasks.ipc_sender +__all__ = [ + 'decode_cbor', + 'jsonify', + 'CHUNK_SIZE', + 'ensure_directory', + 'unique_path', + 'download_hydrus_file', + 'sanitize_metadata_value', + 'unique_preserve_order', + 'sha256_file', + 'resolve_url_policy', + 'UrlPolicy', + 'ScreenshotError', + 'ScreenshotOptions', + 'ScreenshotResult', + 'capture_screenshot', + 'ScreenshotImportError', + 'DownloadOptions', + 'DownloadError', + 'DownloadMediaResult', + 'download_media', + 'is_url_supported_by_ytdlp', + 'probe_url', + 'HydrusClient', + 'HydrusRequestError', + 'hydrus_request', + 'hydrus_export', + 'connect_ipc', + 'ipc_sender', +] diff --git a/helper/adjective.json b/helper/adjective.json new file mode 100644 index 0000000..60f9413 --- /dev/null +++ b/helper/adjective.json @@ -0,0 +1,130 @@ +{ + "Occult": [ + "esoterica", + "ritual", + "alchemy", + "magic", + "hermetic", + "divination", + "grimoires", + "symbolism", + "ceremony" + ], + "Philosophy": [ + "ethics", + "metaphysics", + "epistemology", + "logic", + "existentialism", + "stoicism", + "phenomenology", + "dialectic", + "aesthetics" + ], + "Mystery": [ + "investigation", + "crime", + "detective", + "noir", + "thriller", + "suspense", + "conspiracy", + "whodunit", + "clues" + ], + "Religion": [ + "scripture", + "theology", + "worship", + "ritual", + "doctrine", + "faith", + "tradition", + "liturgy", + "sacred" + ], + "Mythology": [ + "gods", + "creation", + "heroes", + "legends", + "folklore", + "pantheon", + "epic", + "mythic", + "archetype" + ], + "Science": [ + "research", + "experiment", + "theory", + "biology", + "physics", + "chemistry", + "data", + "method", + "innovation" + ], + "Art": [ + "visual", + "painting", + "sculpture", + "modernism", + "technique", + "studio", + "curation", + "expression", + "composition" + ], + "Literature": [ + "fiction", + "poetry", + "novel", + "criticism", + "narrative", + "prose", + "drama", + "canonical", + "translation" + ], + "History": [ + "archaeology", + "chronicle", + "period", + "empire", + "revolution", + "archive", + "heritage", + "historiography", + "timeline" + ], + "Psychology": [ + "cognition", + "behavior", + "therapy", + "development", + "neuroscience", + "personality", + "perception", + "emotion", + "motivation" + ], + "gnostic": [ + "religion", + "scripture", + "gnostic", + "gospel", + "wisdom", + "spirituality", + "ancient", + "philosophy", + "esoteric", + "mysticism", + "mythology", + "theology", + "sacred", + "divine", + "apocrapha", + "gnosticism" + ] +} \ No newline at end of file diff --git a/helper/alldebrid.py b/helper/alldebrid.py new file mode 100644 index 0000000..653e0ed --- /dev/null +++ b/helper/alldebrid.py @@ -0,0 +1,829 @@ +"""AllDebrid API integration for converting free links to direct downloads. + +AllDebrid is a debrid service that unlocks free file hosters and provides direct download links. 
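+
+Typical unlock flow (a sketch; the hoster URL is a placeholder and the API key is
+expected to come from config, see get_debrid_api_key in config.py):
+
+    client = AllDebridClient(api_key)
+    direct_url = client.unlock_link("https://nitroflare.com/view/FILE123")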
+API docs: https://docs.alldebrid.com/#general-informations +""" + +from __future__ import annotations + +import json +import sys + +from helper.logger import log, debug +import time +import logging +from pathlib import Path +from typing import Any, Dict, Optional, Set, List, Sequence +from urllib.parse import urlencode, urlparse +from .http_client import HTTPClient + +logger = logging.getLogger(__name__) + + +class AllDebridError(Exception): + """Raised when AllDebrid API request fails.""" + pass + + +# Cache for supported hosters (domain -> host info) +_SUPPORTED_HOSTERS_CACHE: Optional[Dict[str, Dict[str, Any]]] = None +_CACHE_TIMESTAMP: float = 0 +_CACHE_DURATION: float = 3600 # 1 hour + + +class AllDebridClient: + """Client for AllDebrid API.""" + + # Try both v4 and v3 APIs + BASE_URLS = [ + "https://api.alldebrid.com/v4", + "https://api.alldebrid.com/v3", + ] + + def __init__(self, api_key: str): + """Initialize AllDebrid client with API key. + + Args: + api_key: AllDebrid API key from config + """ + self.api_key = api_key.strip() + if not self.api_key: + raise AllDebridError("AllDebrid API key is empty") + self.base_url = self.BASE_URLS[0] # Start with v4 + + def _request(self, endpoint: str, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]: + """Make a request to AllDebrid API. + + Args: + endpoint: API endpoint (e.g., "user/profile", "link/unlock") + params: Query parameters + + Returns: + Parsed JSON response + + Raises: + AllDebridError: If request fails or API returns error + """ + if params is None: + params = {} + + # Add API key to params + params['apikey'] = self.api_key + + url = f"{self.base_url}/{endpoint}" + query_string = urlencode(params) + full_url = f"{url}?{query_string}" + + logger.debug(f"[AllDebrid] {endpoint} request to {full_url[:80]}...") + + try: + # Pass timeout to HTTPClient init, not to get() + with HTTPClient(timeout=30.0, headers={'User-Agent': 'downlow/1.0'}) as client: + try: + response = client.get(full_url) + response.raise_for_status() + except Exception as req_err: + # Log detailed error info + logger.error(f"[AllDebrid] Request error to {full_url[:80]}: {req_err}", exc_info=True) + if hasattr(req_err, 'response') and req_err.response is not None: # type: ignore + try: + error_body = req_err.response.content.decode('utf-8') # type: ignore + logger.error(f"[AllDebrid] Response body: {error_body[:200]}") + except: + pass + raise + + data = json.loads(response.content.decode('utf-8')) + logger.debug(f"[AllDebrid] Response status: {response.status_code}") + + # Check for API errors + if data.get('status') == 'error': + error_msg = data.get('error', {}).get('message', 'Unknown error') + logger.error(f"[AllDebrid] API error: {error_msg}") + raise AllDebridError(f"AllDebrid API error: {error_msg}") + + return data + except AllDebridError: + raise + except Exception as exc: + error_msg = f"AllDebrid request failed: {exc}" + logger.error(f"[AllDebrid] {error_msg}", exc_info=True) + raise AllDebridError(error_msg) + + def unlock_link(self, link: str) -> Optional[str]: + """Unlock a restricted link and get direct download URL. 
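+
+        Example (sketch; the link is a placeholder):
+            direct = client.unlock_link("https://1fichier.com/?abc123")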
+ + Args: + link: Restricted link to unlock + + Returns: + Direct download URL, or None if already unrestricted + + Raises: + AllDebridError: If unlock fails + """ + if not link.startswith(('http://', 'https://')): + raise AllDebridError(f"Invalid URL: {link}") + + try: + response = self._request('link/unlock', {'link': link}) + + # Check if unlock was successful + if response.get('status') == 'success': + data = response.get('data', {}) + + # AllDebrid returns the download info in 'link' field + if 'link' in data: + return data['link'] + + # Alternative: check for 'file' field + if 'file' in data: + return data['file'] + + # If no direct link, return the input link + return link + + return None + except AllDebridError: + raise + except Exception as exc: + raise AllDebridError(f"Failed to unlock link: {exc}") + + def check_host(self, hostname: str) -> Dict[str, Any]: + """Check if a host is supported by AllDebrid. + + Args: + hostname: Hostname to check (e.g., "uploadhaven.com") + + Returns: + Host information dict with support status + + Raises: + AllDebridError: If request fails + """ + try: + response = self._request('host', {'name': hostname}) + + if response.get('status') == 'success': + return response.get('data', {}) + + return {} + except AllDebridError: + raise + except Exception as exc: + raise AllDebridError(f"Failed to check host: {exc}") + + def get_user_info(self) -> Dict[str, Any]: + """Get current user account information. + + Returns: + User information dict + + Raises: + AllDebridError: If request fails + """ + try: + response = self._request('user/profile') + + if response.get('status') == 'success': + return response.get('data', {}) + + return {} + except AllDebridError: + raise + except Exception as exc: + raise AllDebridError(f"Failed to get user info: {exc}") + + def get_supported_hosters(self) -> Dict[str, Dict[str, Any]]: + """Get list of all supported hosters from AllDebrid API. + + Returns: + Dict mapping domain to host info (status, name, etc) + + Raises: + AllDebridError: If request fails + """ + try: + response = self._request('hosts/domains') + + if response.get('status') == 'success': + data = response.get('data', {}) + # The API returns hosts keyed by domain + return data if isinstance(data, dict) else {} + + return {} + except AllDebridError: + raise + except Exception as exc: + raise AllDebridError(f"Failed to get supported hosters: {exc}") + + def magnet_add(self, magnet_uri: str) -> Dict[str, Any]: + """Submit a magnet link or torrent hash to AllDebrid for processing. + + AllDebrid will download the torrent content and store it in the account. + Processing time varies based on torrent size and availability. + + Args: + magnet_uri: Magnet URI (magnet:?xt=urn:btih:...) or torrent hash + + Returns: + Dict with magnet info: + - id: Magnet ID (int) - needed for status checks + - name: Torrent name + - hash: Torrent hash + - size: Total file size (bytes) + - ready: Boolean - True if already available + + Raises: + AllDebridError: If submit fails (requires premium, invalid magnet, etc) + """ + if not magnet_uri: + raise AllDebridError("Magnet URI is empty") + + try: + # API endpoint: POST /v4/magnet/upload + # Format: /magnet/upload?apikey=key&magnets[]=magnet:?xt=... 
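+            # Illustrative parameter shape (not executed here); a bare 40/64-char
+            # info hash is also accepted, see parse_magnet_or_hash() below:
+            #   {'magnets[]': 'magnet:?xt=urn:btih:<info-hash>'}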
+ response = self._request('magnet/upload', {'magnets[]': magnet_uri}) + + if response.get('status') == 'success': + data = response.get('data', {}) + magnets = data.get('magnets', []) + + if magnets and len(magnets) > 0: + magnet_info = magnets[0] + + # Check for errors in the magnet response + if 'error' in magnet_info: + error = magnet_info['error'] + error_msg = error.get('message', 'Unknown error') + raise AllDebridError(f"Magnet error: {error_msg}") + + return magnet_info + + raise AllDebridError("No magnet data in response") + + raise AllDebridError(f"API error: {response.get('error', 'Unknown')}") + except AllDebridError: + raise + except Exception as exc: + raise AllDebridError(f"Failed to submit magnet: {exc}") + + def magnet_status(self, magnet_id: int, include_files: bool = False) -> Dict[str, Any]: + """Get status of a magnet currently being processed or stored. + + Status codes: + 0-3: Processing (in queue, downloading, compressing, uploading) + 4: Ready (files available for download) + 5-15: Error (upload failed, not downloaded in 20min, too big, etc) + + Args: + magnet_id: Magnet ID from magnet_add() + include_files: If True, includes file list in response + + Returns: + Dict with status info: + - id: Magnet ID + - filename: Torrent name + - size: Total size (bytes) + - status: Human-readable status + - statusCode: Numeric code (0-15) + - downloaded: Bytes downloaded so far + - uploaded: Bytes uploaded so far + - seeders: Number of seeders + - downloadSpeed: Current speed (bytes/sec) + - uploadSpeed: Current speed (bytes/sec) + - files: (optional) Array of file objects when include_files=True + Each file: {n: name, s: size, l: download_link} + + Raises: + AllDebridError: If status check fails + """ + if not isinstance(magnet_id, int) or magnet_id <= 0: + raise AllDebridError(f"Invalid magnet ID: {magnet_id}") + + try: + # Use v4.1 endpoint for better response format + # Temporarily override base_url for this request + old_base = self.base_url + self.base_url = "https://api.alldebrid.com/v4.1" + + try: + response = self._request('magnet/status', {'id': str(magnet_id)}) + finally: + self.base_url = old_base + + if response.get('status') == 'success': + data = response.get('data', {}) + magnets = data.get('magnets', {}) + + # Handle both list and dict responses + if isinstance(magnets, list) and len(magnets) > 0: + return magnets[0] + elif isinstance(magnets, dict) and magnets: + return magnets + + raise AllDebridError(f"No magnet found with ID {magnet_id}") + + raise AllDebridError(f"API error: {response.get('error', 'Unknown')}") + except AllDebridError: + raise + except Exception as exc: + raise AllDebridError(f"Failed to get magnet status: {exc}") + + def magnet_status_live(self, magnet_id: int, session: int = None, counter: int = 0) -> Dict[str, Any]: + """Get live status of a magnet using delta sync mode. + + The live mode endpoint provides real-time progress by only sending + deltas (changed fields) instead of full status on each call. This + reduces bandwidth and server load compared to regular polling. + + Note: The "live" designation refers to the delta-sync mode where you + maintain state locally and apply diffs from the API, not a streaming + endpoint. Regular magnet_status() polling is simpler for single magnets. + + Docs: https://docs.alldebrid.com/#get-status-live-mode + + Args: + magnet_id: Magnet ID from magnet_add() + session: Session ID (use same ID across multiple calls). 
If None, will query current status + counter: Counter value from previous response (starts at 0) + + Returns: + Dict with magnet status. May contain only changed fields if counter > 0. + For single-magnet tracking, use magnet_status() instead. + + Raises: + AllDebridError: If request fails + """ + if not isinstance(magnet_id, int) or magnet_id <= 0: + raise AllDebridError(f"Invalid magnet ID: {magnet_id}") + + try: + # For single magnet queries, just use regular endpoint with ID + # The "live mode" with session/counter is for multi-magnet dashboards + # where bandwidth savings from diffs matter + response = self._request('magnet/status', {'id': magnet_id}) + + if response.get('status') == 'success': + data = response.get('data', {}) + magnets = data.get('magnets', []) + + # Handle list response + if isinstance(magnets, list) and len(magnets) > 0: + return magnets[0] + + raise AllDebridError(f"No magnet found with ID {magnet_id}") + + raise AllDebridError(f"API error: {response.get('error', 'Unknown')}") + except AllDebridError: + raise + except Exception as exc: + raise AllDebridError(f"Failed to get magnet live status: {exc}") + + def magnet_links(self, magnet_ids: list) -> Dict[str, Any]: + """Get files and download links for one or more magnets. + + Use this after magnet_status shows statusCode == 4 (Ready). + Returns the file tree structure with direct download links. + + Args: + magnet_ids: List of magnet IDs to get files for + + Returns: + Dict mapping magnet_id (as string) -> magnet_info: + - id: Magnet ID + - files: Array of file/folder objects + File: {n: name, s: size, l: direct_download_link} + Folder: {n: name, e: [sub_items]} + + Raises: + AllDebridError: If request fails + """ + if not magnet_ids: + raise AllDebridError("No magnet IDs provided") + + try: + # Build parameter: id[]=123&id[]=456 style + params = {} + for i, magnet_id in enumerate(magnet_ids): + params[f'id[{i}]'] = str(magnet_id) + + response = self._request('magnet/files', params) + + if response.get('status') == 'success': + data = response.get('data', {}) + magnets = data.get('magnets', []) + + # Convert list to dict keyed by ID (as string) for easier access + result = {} + for magnet_info in magnets: + magnet_id = magnet_info.get('id') + if magnet_id: + result[str(magnet_id)] = magnet_info + + return result + + raise AllDebridError(f"API error: {response.get('error', 'Unknown')}") + except AllDebridError: + raise + except Exception as exc: + raise AllDebridError(f"Failed to get magnet files: {exc}") + + def instant_available(self, magnet_hash: str) -> Optional[List[Dict[str, Any]]]: + """Check if magnet is available for instant streaming without downloading. + + AllDebrid's "instant" feature checks if a magnet can be streamed directly + without downloading all the data. Returns available video/audio files. 
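+
+        Example (sketch; the hash is a placeholder):
+            files = client.instant_available("magnet:?xt=urn:btih:<hash>") or []
+            names = [f.get("n") for f in files]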
+ + Args: + magnet_hash: Torrent hash (with or without magnet: prefix) + + Returns: + List of available files for streaming, or None if not available + Each file: {n: name, s: size, e: extension, t: type} + Returns empty list if torrent not found or not available + + Raises: + AllDebridError: If API request fails + """ + try: + # Parse magnet hash if needed + if magnet_hash.startswith('magnet:'): + # Extract hash from magnet URI + import re + match = re.search(r'xt=urn:btih:([a-fA-F0-9]+)', magnet_hash) + if not match: + return None + hash_value = match.group(1) + else: + hash_value = magnet_hash.strip() + + if not hash_value or len(hash_value) < 32: + return None + + response = self._request('magnet/instant', {'magnet': hash_value}) + + if response.get('status') == 'success': + data = response.get('data', {}) + # Returns 'files' array if available, or empty + return data.get('files', []) + + # Not available is not an error, just return empty list + return [] + + except AllDebridError: + raise + except Exception as exc: + logger.debug(f"[AllDebrid] instant_available check failed: {exc}") + return None + + def magnet_delete(self, magnet_id: int) -> bool: + """Delete a magnet from the AllDebrid account. + + Args: + magnet_id: Magnet ID to delete + + Returns: + True if deletion was successful + + Raises: + AllDebridError: If deletion fails + """ + if not isinstance(magnet_id, int) or magnet_id <= 0: + raise AllDebridError(f"Invalid magnet ID: {magnet_id}") + + try: + response = self._request('magnet/delete', {'id': str(magnet_id)}) + + if response.get('status') == 'success': + return True + + raise AllDebridError(f"API error: {response.get('error', 'Unknown')}") + except AllDebridError: + raise + except Exception as exc: + raise AllDebridError(f"Failed to delete magnet: {exc}") + + +def _get_cached_supported_hosters(api_key: str) -> Set[str]: + """Get cached list of supported hoster domains. + + Uses AllDebrid API to fetch the list once per hour, + caching the result to avoid repeated API calls. 
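+
+    Entries are refreshed at most once per _CACHE_DURATION seconds (3600 above).
+    An empty set usually means the API lookup failed; is_link_restrictable_hoster()
+    then falls back to a small hard-coded hoster list.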
+ + Args: + api_key: AllDebrid API key + + Returns: + Set of supported domain names (lowercased) + """ + global _SUPPORTED_HOSTERS_CACHE, _CACHE_TIMESTAMP + + now = time.time() + + # Return cached result if still valid + if _SUPPORTED_HOSTERS_CACHE is not None and (now - _CACHE_TIMESTAMP) < _CACHE_DURATION: + return set(_SUPPORTED_HOSTERS_CACHE.keys()) + + # Fetch fresh list from API + try: + client = AllDebridClient(api_key) + hosters_dict = client.get_supported_hosters() + + if hosters_dict: + # API returns: hosts (list), streams (list), redirectors (list) + # Combine all into a single set + all_domains: Set[str] = set() + + # Add hosts + if 'hosts' in hosters_dict and isinstance(hosters_dict['hosts'], list): + all_domains.update(hosters_dict['hosts']) + + # Add streams + if 'streams' in hosters_dict and isinstance(hosters_dict['streams'], list): + all_domains.update(hosters_dict['streams']) + + # Add redirectors + if 'redirectors' in hosters_dict and isinstance(hosters_dict['redirectors'], list): + all_domains.update(hosters_dict['redirectors']) + + # Cache as dict for consistency + _SUPPORTED_HOSTERS_CACHE = {domain: {} for domain in all_domains} + _CACHE_TIMESTAMP = now + + if all_domains: + debug(f"✓ Cached {len(all_domains)} supported hosters") + + return all_domains + except Exception as exc: + log(f"⚠ Failed to fetch supported hosters: {exc}", file=sys.stderr) + # Return any cached hosters even if expired + if _SUPPORTED_HOSTERS_CACHE: + return set(_SUPPORTED_HOSTERS_CACHE.keys()) + + # Fallback: empty set if no cache available + return set() + + +def is_link_restrictable_hoster(url: str, api_key: str) -> bool: + """Check if a URL is from a hoster that AllDebrid can unlock. + + Intelligently queries the AllDebrid API to detect if the URL is + from a supported restricted hoster. + + Args: + url: URL to check + api_key: AllDebrid API key + + Returns: + True if URL is from a supported restrictable hoster + """ + if not url or not api_key: + return False + + try: + # Extract domain from URL + parsed = urlparse(url) + domain = parsed.netloc.lower() + + # Remove www. prefix for comparison + if domain.startswith('www.'): + domain = domain[4:] + + # Get supported hosters (cached) + supported = _get_cached_supported_hosters(api_key) + + if not supported: + # API check failed, fall back to manual detection + # Check for common restricted hosters + common_hosters = { + 'uploadhaven.com', 'uploaded.to', 'uploaded.net', + 'datafile.com', 'rapidfile.io', 'nitroflare.com', + '1fichier.com', 'mega.nz', 'mediafire.com' + } + return any(host in url.lower() for host in common_hosters) + + # Check if domain is in supported list + # Need to check exact match and with/without www + return domain in supported or f"www.{domain}" in supported + except Exception as exc: + log(f"⚠ Hoster detection failed: {exc}", file=sys.stderr) + return False + + +def convert_link_with_debrid(link: str, api_key: str) -> Optional[str]: + """Convert a restricted link to a direct download URL using AllDebrid. + + Args: + link: Restricted link + api_key: AllDebrid API key + + Returns: + Direct download URL, or original link if already unrestricted + """ + if not api_key: + return None + + try: + client = AllDebridClient(api_key) + direct_link = client.unlock_link(link) + + if direct_link and direct_link != link: + debug(f"✓ Converted link: {link[:60]}... 
→ {direct_link[:60]}...") + return direct_link + + return None + except AllDebridError as exc: + log(f"⚠ Failed to convert link: {exc}", file=sys.stderr) + return None + except Exception as exc: + log(f"⚠ Unexpected error: {exc}", file=sys.stderr) + return None + + +def is_magnet_link(uri: str) -> bool: + """Check if a URI is a magnet link. + + Magnet links start with 'magnet:?xt=urn:btih:' or just 'magnet:' + + Args: + uri: URI to check + + Returns: + True if URI is a magnet link + """ + if not uri: + return False + return uri.lower().startswith('magnet:') + + +def is_torrent_hash(text: str) -> bool: + """Check if text looks like a torrent hash (40 or 64 hex characters). + + Common formats: + - Info hash v1: 40 hex chars (SHA-1) + - Info hash v2: 64 hex chars (SHA-256) + + Args: + text: Text to check + + Returns: + True if text matches torrent hash format + """ + if not text or not isinstance(text, str): + return False + + text = text.strip() + + # Check if it's 40 hex chars (SHA-1) or 64 hex chars (SHA-256) + if len(text) not in (40, 64): + return False + + try: + # Try to parse as hex + int(text, 16) + return True + except ValueError: + return False + + +def is_torrent_file(path: str) -> bool: + """Check if a file path is a .torrent file. + + Args: + path: File path to check + + Returns: + True if file has .torrent extension + """ + if not path: + return False + return path.lower().endswith('.torrent') + + +def parse_magnet_or_hash(uri: str) -> Optional[str]: + """Parse a magnet URI or hash into a format for AllDebrid API. + + AllDebrid's magnet/upload endpoint accepts: + - Full magnet URIs: magnet:?xt=urn:btih:... + - Info hashes: 40 or 64 hex characters + + Args: + uri: Magnet URI or hash + + Returns: + Normalized input for AllDebrid API, or None if invalid + """ + if not uri: + return None + + uri = uri.strip() + + # Already a magnet link - just return it + if is_magnet_link(uri): + return uri + + # Check if it's a valid hash + if is_torrent_hash(uri): + return uri + + # Not a recognized format + return None + + +# ============================================================================ +# Cmdlet: unlock_link +# ============================================================================ + +def unlock_link_cmdlet(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: + """Unlock a restricted link using AllDebrid. + + Converts free hosters and restricted links to direct download URLs. 
+ + Usage: + unlock-link + unlock-link # Uses URL from pipeline result + + Requires: + - AllDebrid API key in config under Debrid.All-debrid + + Args: + result: Pipeline result object + args: Command arguments + config: Configuration dictionary + + Returns: + 0 on success, 1 on failure + """ + try: + from .link_utils import ( + extract_link, + get_api_key, + add_direct_link_to_result, + ) + except ImportError as e: + log(f"Required modules unavailable: {e}", file=sys.stderr) + return 1 + + # Get link from args or result + link = extract_link(result, args) + + if not link: + log("No valid URL provided", file=sys.stderr) + return 1 + + # Get AllDebrid API key from config + api_key = get_api_key(config, "AllDebrid", "Debrid.All-debrid") + + if not api_key: + log("AllDebrid API key not configured in Debrid.All-debrid", file=sys.stderr) + return 1 + + # Try to unlock the link + debug(f"Unlocking: {link}") + direct_link = convert_link_with_debrid(link, api_key) + + if direct_link: + debug(f"✓ Direct link: {direct_link}") + + # Update result with direct link + add_direct_link_to_result(result, direct_link, link) + + # Return the updated result via pipeline context + # Note: The cmdlet wrapper will handle emitting to pipeline + return 0 + else: + log(f"❌ Failed to unlock link or already unrestricted", file=sys.stderr) + return 1 + + +# ============================================================================ +# Cmdlet Registration +# ============================================================================ + +def _register_unlock_link(): + """Register unlock-link command with cmdlet registry if available.""" + try: + from cmdlets import register + + @register(["unlock-link"]) + def unlock_link_wrapper(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: + """Wrapper to make unlock_link_cmdlet available as cmdlet.""" + import pipeline as ctx + + ret_code = unlock_link_cmdlet(result, args, config) + + # If successful, emit the result + if ret_code == 0: + ctx.emit(result) + + return ret_code + + return unlock_link_wrapper + except ImportError: + # If cmdlets module not available, just return None + return None + + +# Register when module is imported +_unlock_link_registration = _register_unlock_link() diff --git a/helper/archive_client.py b/helper/archive_client.py new file mode 100644 index 0000000..b6c270d --- /dev/null +++ b/helper/archive_client.py @@ -0,0 +1,567 @@ +"""Archive.org API client for borrowing and downloading books. + +This module provides low-level functions for interacting with Archive.org: +- Authentication (login, credential management) +- Borrowing (loan, return_loan) +- Book metadata extraction (get_book_infos, get_book_metadata) +- Image downloading and deobfuscation +- PDF creation with metadata + +Used by unified_book_downloader.py for the borrowing workflow. 
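+
+Typical borrowing flow (a sketch; credentials, book_id, and the temp directory are
+placeholders):
+
+    session = login(email, password)
+    session = loan(session, book_id)
+    title, links, metadata = get_book_infos(session, f"https://archive.org/borrow/{book_id}")
+    images = download(session, n_threads=4, directory="/tmp/pages",
+                      links=links, scale=3, book_id=book_id)
+    return_loan(session, book_id)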
+""" +from __future__ import annotations + +import base64 +import hashlib +import logging +import os +import re +import sys +import time +from concurrent import futures +from typing import Any, Dict, List, Optional, Sequence, Tuple + +import requests + +from helper.logger import log, debug + +try: + from Crypto.Cipher import AES # type: ignore + from Crypto.Util import Counter # type: ignore +except ImportError: + AES = None # type: ignore + Counter = None # type: ignore + +try: + from tqdm import tqdm # type: ignore +except ImportError: + tqdm = None # type: ignore + + +def credential_openlibrary(config: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]: + """Get OpenLibrary/Archive.org email and password from config. + + Supports both formats: + - New: {"provider": {"openlibrary": {"email": "...", "password": "..."}}} + - Old: {"Archive": {"email": "...", "password": "..."}} + {"archive_org_email": "...", "archive_org_password": "..."} + + Returns: (email, password) tuple, each can be None + """ + if not isinstance(config, dict): + return None, None + + # Try new format first + provider_config = config.get("provider", {}) + if isinstance(provider_config, dict): + openlibrary_config = provider_config.get("openlibrary", {}) + if isinstance(openlibrary_config, dict): + email = openlibrary_config.get("email") + password = openlibrary_config.get("password") + if email or password: + return email, password + + # Try old nested format + archive_config = config.get("Archive") + if isinstance(archive_config, dict): + email = archive_config.get("email") + password = archive_config.get("password") + if email or password: + return email, password + + # Fall back to old flat format + email = config.get("archive_org_email") + password = config.get("archive_org_password") + return email, password + + +def display_error(response: requests.Response, message: str) -> None: + """Display error and exit.""" + log(message, file=sys.stderr) + log(response.text, file=sys.stderr) + sys.exit(1) + + +def login(email: str, password: str) -> requests.Session: + """Login to archive.org. + + Args: + email: Archive.org email + password: Archive.org password + + Returns: + Authenticated requests.Session + + Raises: + SystemExit on login failure + """ + session = requests.Session() + session.get("https://archive.org/account/login", timeout=30) + + data = {"username": email, "password": password} + response = session.post("https://archive.org/account/login", data=data, timeout=30) + + if "bad_login" in response.text: + log("Invalid credentials!", file=sys.stderr) + sys.exit(1) + if "Successful login" in response.text: + debug("Successful login") + return session + display_error(response, "[-] Error while login:") + sys.exit(1) # Unreachable but satisfies type checker + + +def loan(session: requests.Session, book_id: str, verbose: bool = True) -> requests.Session: + """Borrow a book from archive.org (14-day loan). 
+ + Args: + session: Authenticated requests.Session from login() + book_id: Archive.org book identifier (e.g., 'ia_book_id') + verbose: Whether to log messages + + Returns: + Session with active loan + + Raises: + SystemExit on loan failure + """ + data = {"action": "grant_access", "identifier": book_id} + response = session.post("https://archive.org/services/loans/loan/searchInside.php", data=data, timeout=30) + data["action"] = "browse_book" + response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30) + + if response.status_code == 400: + try: + if response.json()["error"] == "This book is not available to borrow at this time. Please try again later.": + debug("This book doesn't need to be borrowed") + return session + display_error(response, "Something went wrong when trying to borrow the book.") + except: + display_error(response, "The book cannot be borrowed") + + data["action"] = "create_token" + response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30) + + if "token" in response.text: + if verbose: + debug("Successful loan") + return session + display_error(response, "Something went wrong when trying to borrow the book.") + sys.exit(1) # Unreachable but satisfies type checker + + +def return_loan(session: requests.Session, book_id: str) -> None: + """Return a borrowed book. + + Args: + session: Authenticated requests.Session with active loan + book_id: Archive.org book identifier + """ + data = {"action": "return_loan", "identifier": book_id} + response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30) + if response.status_code == 200 and response.json()["success"]: + debug("Book returned") + else: + display_error(response, "Something went wrong when trying to return the book") + + +def get_book_infos(session: requests.Session, url: str) -> Tuple[str, List[str], Dict[str, Any]]: + """Extract book information and page links from archive.org viewer. 
+ + Args: + session: Authenticated requests.Session + url: Book URL (e.g., https://archive.org/borrow/book_id or /details/book_id) + + Returns: + Tuple of (title, page_links, metadata) + + Raises: + RuntimeError: If page data cannot be extracted + """ + r = session.get(url, timeout=30).text + + # Try to extract the infos URL from the response + try: + # Look for the "url" field in the response + if '"url":"' not in r: + raise ValueError("No 'url' field found in response") + infos_url = "https:" + r.split('"url":"')[1].split('"')[0].replace("\\u0026", "&") + except (IndexError, ValueError) as e: + # If URL extraction fails, raise with better error message + raise RuntimeError(f"Failed to extract book info URL from response: {e}") + + response = session.get(infos_url, timeout=30) + data = response.json()["data"] + title = data["brOptions"]["bookTitle"].strip().replace(" ", "_") + title = "".join(c for c in title if c not in '<>:"/\\|?*') # Filter forbidden chars + title = title[:150] # Trim to avoid long file names + metadata = data["metadata"] + links = [] + + # Safely extract page links from brOptions data + try: + br_data = data.get("brOptions", {}).get("data", []) + for item in br_data: + if isinstance(item, list): + for page in item: + if isinstance(page, dict) and "uri" in page: + links.append(page["uri"]) + elif isinstance(item, dict) and "uri" in item: + links.append(item["uri"]) + except (KeyError, IndexError, TypeError) as e: + log(f"Warning: Error parsing page links: {e}", file=sys.stderr) + # Continue with whatever links we found + + if len(links) > 1: + debug(f"Found {len(links)} pages") + return title, links, metadata + elif len(links) == 1: + debug(f"Found {len(links)} page") + return title, links, metadata + else: + log("Error while getting image links - no pages found", file=sys.stderr) + raise RuntimeError("No pages found in book data") + + +def image_name(pages: int, page: int, directory: str) -> str: + """Generate image filename for page. + + Args: + pages: Total number of pages + page: Current page number (0-indexed) + directory: Directory to save to + + Returns: + Full path to image file + """ + return f"{directory}/{(len(str(pages)) - len(str(page))) * '0'}{page}.jpg" + + +def deobfuscate_image(image_data: bytes, link: str, obf_header: str) -> bytes: + """Decrypt obfuscated image data using AES-CTR. + + This handles Archive.org's image obfuscation for borrowed books. 
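+    Scheme, as implemented below: the AES-128 key is the first 16 bytes of the
+    SHA-1 of the URL path, the header's base64 payload is a 16-byte counter block
+    (8-byte prefix + 8-byte big-endian initial value), and only the first 1024
+    bytes of the image are actually encrypted.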
+ Based on: https://github.com/justimm + + Args: + image_data: Encrypted image bytes + link: Image URL (used to derive AES key) + obf_header: X-Obfuscate header value (format: "1|BASE64_COUNTER") + + Returns: + Decrypted image bytes + """ + if not AES or not Counter: + raise RuntimeError("Crypto library not available") + + try: + version, counter_b64 = obf_header.split("|") + except Exception as e: + raise ValueError("Invalid X-Obfuscate header format") from e + + if version != "1": + raise ValueError("Unsupported obfuscation version: " + version) + + # Derive AES key from URL + aesKey = re.sub(r"^https?:\/\/.*?\/", "/", link) + sha1_digest = hashlib.sha1(aesKey.encode("utf-8")).digest() + key = sha1_digest[:16] + + # Decode counter + counter_bytes = base64.b64decode(counter_b64) + if len(counter_bytes) != 16: + raise ValueError(f"Expected counter to be 16 bytes, got {len(counter_bytes)}") + + prefix = counter_bytes[:8] + initial_value = int.from_bytes(counter_bytes[8:], byteorder="big") + + # Create AES-CTR cipher + ctr = Counter.new(64, prefix=prefix, initial_value=initial_value, little_endian=False) # type: ignore + cipher = AES.new(key, AES.MODE_CTR, counter=ctr) # type: ignore + + decrypted_part = cipher.decrypt(image_data[:1024]) + new_data = decrypted_part + image_data[1024:] + return new_data + + +def download_one_image( + session: requests.Session, + link: str, + i: int, + directory: str, + book_id: str, + pages: int, +) -> None: + """Download a single book page image. + + Handles obfuscated images and re-borrowing on 403 errors. + + Args: + session: Authenticated requests.Session + link: Direct image URL + i: Page index (0-based) + directory: Directory to save to + book_id: Archive.org book ID (for re-borrowing on 403) + pages: Total number of pages + """ + headers = { + "Referer": "https://archive.org/", + "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8", + "Sec-Fetch-Site": "same-site", + "Sec-Fetch-Mode": "no-cors", + "Sec-Fetch-Dest": "image", + } + retry = True + response = None + while retry: + try: + response = session.get(link, headers=headers, timeout=30) + if response.status_code == 403: + session = loan(session, book_id, verbose=False) + raise Exception("Borrow again") + if response.status_code == 200: + retry = False + except: + time.sleep(1) + + image = image_name(pages, i, directory) + + if response is None: + log(f"Failed to download page {i}", file=sys.stderr) + return + + obf_header = response.headers.get("X-Obfuscate") + image_content = None + if obf_header: + try: + image_content = deobfuscate_image(response.content, link, obf_header) + except Exception as e: + log(f"Deobfuscation failed: {e}", file=sys.stderr) + return + else: + image_content = response.content + + with open(image, "wb") as f: + f.write(image_content) + + +def download( + session: requests.Session, + n_threads: int, + directory: str, + links: List[str], + scale: int, + book_id: str, +) -> List[str]: + """Download all book pages as images. + + Uses thread pool for parallel downloads. 
+ + Args: + session: Authenticated requests.Session + n_threads: Number of download threads + directory: Directory to save images to + links: List of image URLs + scale: Image resolution (0=highest, 10=lowest) + book_id: Archive.org book ID (for re-borrowing) + + Returns: + List of downloaded image file paths + """ + debug("Downloading pages...") + links = [f"{link}&rotate=0&scale={scale}" for link in links] + pages = len(links) + + tasks = [] + with futures.ThreadPoolExecutor(max_workers=n_threads) as executor: + for link in links: + i = links.index(link) + tasks.append( + executor.submit( + download_one_image, + session=session, + link=link, + i=i, + directory=directory, + book_id=book_id, + pages=pages, + ) + ) + if tqdm: + for _ in tqdm(futures.as_completed(tasks), total=len(tasks)): # type: ignore + pass + else: + for _ in futures.as_completed(tasks): + pass + + images = [image_name(pages, i, directory) for i in range(len(links))] + return images + + +def check_direct_download(book_id: str) -> Tuple[bool, str]: + """Check if a book can be downloaded directly without borrowing. + + Searches Archive.org metadata for downloadable PDF files. + + Args: + book_id: Archive.org book identifier + + Returns: + Tuple of (can_download: bool, pdf_url: str) + """ + try: + # First, try to get the metadata to find the actual PDF filename + metadata_url = f"https://archive.org/metadata/{book_id}" + response = requests.get(metadata_url, timeout=10) + response.raise_for_status() + metadata = response.json() + + # Find PDF file in files list + if "files" in metadata: + for file_info in metadata["files"]: + filename = file_info.get("name", "") + if filename.endswith(".pdf") and file_info.get("source") == "original": + # Found the original PDF + pdf_filename = filename + pdf_url = f"https://archive.org/download/{book_id}/{pdf_filename.replace(' ', '%20')}" + + # Verify it's accessible + check_response = requests.head(pdf_url, timeout=5, allow_redirects=True) + if check_response.status_code == 200: + return True, pdf_url + + return False, "" + + except Exception as e: + log(f"Error checking direct download: {e}", file=sys.stderr) + return False, "" + + +def get_openlibrary_by_isbn(isbn: str) -> Dict[str, Any]: + """Fetch book data from OpenLibrary using ISBN. + + Args: + isbn: ISBN-10 or ISBN-13 to search for + + Returns: + Dictionary with book metadata from OpenLibrary + """ + try: + # Try ISBN API first + api_url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn}&jscmd=data&format=json" + response = requests.get(api_url, timeout=10) + response.raise_for_status() + data = response.json() + + if data: + # Get first result + key = list(data.keys())[0] + return data[key] + return {} + except Exception as e: + log(f"Error fetching OpenLibrary data by ISBN: {e}", file=sys.stderr) + return {} + + +def extract_isbn_from_metadata(metadata: Dict[str, Any]) -> str: + """Extract ISBN from archive.org metadata. + + Looks for ISBN in various metadata fields. 
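+
+    Example (sketch):
+        extract_isbn_from_metadata({"isbn": ["978-0-14-044913-6"]})  # -> "9780140449136"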
+ + Args: + metadata: Archive.org metadata dictionary + + Returns: + ISBN string (clean, no hyphens) or empty string if not found + """ + # Try various common metadata fields + isbn_fields = [ + "isbn", "ISBN", "isbn_13", "isbn_10", "isbns", + "isbn-10", "isbn-13", "identifer_isbn" + ] + + for field in isbn_fields: + if field in metadata: + isbn_val = metadata[field] + if isinstance(isbn_val, list): + isbn_val = isbn_val[0] if isbn_val else None + if isbn_val and isinstance(isbn_val, str): + # Clean ISBN (remove hyphens, spaces) + isbn_clean = isbn_val.replace("-", "").replace(" ", "") + if len(isbn_clean) in [10, 13]: + return isbn_clean + + return "" + + +def normalize_url(url: str) -> str: + """Convert openlibrary.org URL to archive.org URL. + + Looks up the actual Archive.org ID from OpenLibrary API. + + Args: + url: Book URL (archive.org or openlibrary.org format) + + Returns: + Normalized archive.org URL + """ + url = url.strip() + + # Already archive.org format + if url.startswith("https://archive.org/details/"): + return url + + # Convert openlibrary.org format by querying the OpenLibrary API + if "openlibrary.org/books/" in url: + try: + # Extract the book ID (e.g., OL6796852M) + parts = url.split("/books/") + if len(parts) > 1: + book_id = parts[1].split("/")[0] + + # Query OpenLibrary API to get the book metadata + api_url = f"https://openlibrary.org/books/{book_id}.json" + response = requests.get(api_url, timeout=10) + response.raise_for_status() + data = response.json() + + # Look for identifiers including internet_archive or ocaid + # First try ocaid (Open Content Alliance ID) - this is most common + if "ocaid" in data: + ocaid = data["ocaid"] + return f"https://archive.org/details/{ocaid}" + + # Check for identifiers object + if "identifiers" in data: + identifiers = data["identifiers"] + + # Look for internet_archive ID + if "internet_archive" in identifiers: + ia_ids = identifiers["internet_archive"] + if isinstance(ia_ids, list) and ia_ids: + ia_id = ia_ids[0] + else: + ia_id = ia_ids + return f"https://archive.org/details/{ia_id}" + + # If no IA identifier found, use the book ID as fallback + log(f"No Internet Archive ID found for {book_id}. Attempting with OpenLibrary ID.", file=sys.stderr) + return f"https://archive.org/details/{book_id}" + + except requests.RequestException as e: + log(f"Could not fetch OpenLibrary metadata: {e}", file=sys.stderr) + # Fallback to using the book ID directly + parts = url.split("/books/") + if len(parts) > 1: + book_id = parts[1].split("/")[0] + return f"https://archive.org/details/{book_id}" + except (KeyError, IndexError) as e: + log(f"Error parsing OpenLibrary response: {e}", file=sys.stderr) + # Fallback to using the book ID directly + parts = url.split("/books/") + if len(parts) > 1: + book_id = parts[1].split("/")[0] + return f"https://archive.org/details/{book_id}" + + # Return original if can't parse + return url diff --git a/helper/download.py b/helper/download.py new file mode 100644 index 0000000..84933ec --- /dev/null +++ b/helper/download.py @@ -0,0 +1,730 @@ +"""Download media files using yt-dlp with support for direct file downloads. + +Lean, focused downloader without event infrastructure overhead. 
+- yt-dlp integration for streaming sites +- Direct file download fallback for PDFs, images, documents +- Tag extraction via metadata.extract_ytdlp_tags() +- Logging via helper.logger.log() +""" +from __future__ import annotations + +import re # noqa: F401 +import sys +import time +import traceback +from pathlib import Path +from typing import Any, Dict, Iterator, List, Optional +from urllib.parse import urljoin + +import httpx + +from helper.logger import log, debug +from .utils import ensure_directory, sha256_file +from .http_client import HTTPClient +from models import DownloadError, DownloadOptions, DownloadMediaResult, DebugLogger, ProgressBar + +try: + import yt_dlp # type: ignore + from yt_dlp.extractor import gen_extractors # type: ignore +except Exception as exc: + yt_dlp = None # type: ignore + YTDLP_IMPORT_ERROR = exc +else: + YTDLP_IMPORT_ERROR = None + +try: + from metadata import extract_ytdlp_tags +except ImportError: + extract_ytdlp_tags = None + +_EXTRACTOR_CACHE: List[Any] | None = None + + +def _ensure_yt_dlp_ready() -> None: + """Verify yt-dlp is available, raise if not.""" + if yt_dlp is not None: + return + detail = str(YTDLP_IMPORT_ERROR or "yt-dlp is not installed") + raise DownloadError(f"yt-dlp module not available: {detail}") + + +def _progress_callback(status: Dict[str, Any]) -> None: + """Simple progress callback using logger.""" + event = status.get("status") + if event == "downloading": + percent = status.get("_percent_str", "?") + speed = status.get("_speed_str", "?") + debug(f"Downloading {percent} at {speed}") + elif event == "finished": + debug(f"✓ Download finished: {status.get('filename')}") + elif event in ("postprocessing", "processing"): + debug(f"Post-processing: {status.get('postprocessor')}") + + +def is_url_supported_by_ytdlp(url: str) -> bool: + """Check if URL is supported by yt-dlp.""" + if yt_dlp is None: + return False + global _EXTRACTOR_CACHE + if _EXTRACTOR_CACHE is None: + try: + _EXTRACTOR_CACHE = [ie for ie in gen_extractors()] # type: ignore[arg-type] + except Exception: + _EXTRACTOR_CACHE = [] + for extractor in _EXTRACTOR_CACHE: + try: + if not extractor.suitable(url): + continue + except Exception: + continue + name = getattr(extractor, "IE_NAME", "") + if name.lower() == "generic": + continue + return True + return False + + +def list_formats(url: str, no_playlist: bool = False, playlist_items: Optional[str] = None) -> Optional[List[Dict[str, Any]]]: + """Get list of available formats for a URL using yt-dlp. + + Args: + url: URL to get formats for + no_playlist: If True, ignore playlists and list formats for single video + playlist_items: If specified, only list formats for these playlist items (e.g., "1,3,5-8") + + Returns: + List of format dictionaries with keys: format_id, format, resolution, fps, vcodec, acodec, filesize, etc. + Returns None if yt-dlp is not available or format listing fails. 
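+
+    Example (sketch; the URL is a placeholder):
+        formats = list_formats("https://www.youtube.com/watch?v=XXXX", no_playlist=True) or []
+        video_only = [f for f in formats if f.get("vcodec") not in (None, "none")]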
+ """ + _ensure_yt_dlp_ready() + + try: + ydl_opts = { + "quiet": False, + "no_warnings": False, + "socket_timeout": 30, + } + + # Add no_playlist option if specified + if no_playlist: + ydl_opts["noplaylist"] = True + + # Add playlist_items filter if specified + if playlist_items: + ydl_opts["playlist_items"] = playlist_items + + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + debug(f"Fetching format list for: {url}") + info = ydl.extract_info(url, download=False) + + formats = info.get("formats", []) + if not formats: + log("No formats available", file=sys.stderr) + return None + + # Parse and extract relevant format info + result_formats = [] + for fmt in formats: + format_info = { + "format_id": fmt.get("format_id", ""), + "format": fmt.get("format", ""), + "ext": fmt.get("ext", ""), + "resolution": fmt.get("resolution", ""), + "width": fmt.get("width"), + "height": fmt.get("height"), + "fps": fmt.get("fps"), + "vcodec": fmt.get("vcodec", "none"), + "acodec": fmt.get("acodec", "none"), + "filesize": fmt.get("filesize"), + "tbr": fmt.get("tbr"), # Total bitrate + } + result_formats.append(format_info) + + debug(f"Found {len(result_formats)} available formats") + return result_formats + + except Exception as e: + log(f"✗ Error fetching formats: {e}", file=sys.stderr) + return None +def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]: + """Build yt-dlp download options.""" + ensure_directory(opts.output_dir) + + outtmpl = str((opts.output_dir / "%(title)s.%(ext)s").resolve()) + + base_options: Dict[str, Any] = { + "outtmpl": outtmpl, + "quiet": False, + "no_warnings": False, + "noprogress": False, + "socket_timeout": 30, + "retries": 10, + "fragment_retries": 10, + "http_chunk_size": 10_485_760, + "restrictfilenames": True, + "progress_hooks": [_progress_callback], + } + + if opts.cookies_path and opts.cookies_path.is_file(): + base_options["cookiefile"] = str(opts.cookies_path) + + # Add no-playlist option if specified (for single video from playlist URLs) + if opts.no_playlist: + base_options["noplaylist"] = True + + # Configure based on mode + if opts.mode == "audio": + base_options["format"] = opts.ytdl_format or "251/140/bestaudio" + base_options["postprocessors"] = [{"key": "FFmpegExtractAudio"}] + else: # video + base_options["format"] = opts.ytdl_format or "bestvideo+bestaudio/best" + base_options["format_sort"] = [ + "res:4320", "res:2880", "res:2160", "res:1440", "res:1080", "res:720", "res" + ] + + # Add clip sections if provided + if opts.clip_sections: + base_options["download_sections"] = opts.clip_sections + + # Add playlist items selection if provided + if opts.playlist_items: + base_options["playlist_items"] = opts.playlist_items + + debug(f"yt-dlp: mode={opts.mode}, format={base_options.get('format')}") + return base_options + + +def _iter_download_entries(info: Dict[str, Any]) -> Iterator[Dict[str, Any]]: + """Iterate through download entries, handling playlists.""" + queue: List[Dict[str, Any]] = [info] + seen: set[int] = set() + while queue: + current = queue.pop(0) + obj_id = id(current) + if obj_id in seen: + continue + seen.add(obj_id) + entries = current.get("entries") + if isinstance(entries, list): + for entry in entries: + if isinstance(entry, dict): + queue.append(entry) + if current.get("requested_downloads") or not entries: + yield current + + +def _candidate_paths(entry: Dict[str, Any], output_dir: Path) -> Iterator[Path]: + """Get candidate file paths for downloaded media.""" + requested = entry.get("requested_downloads") + if isinstance(requested, 
list): + for item in requested: + if isinstance(item, dict): + for key in ("filepath", "_filename", "filename"): + value = item.get(key) + if value: + yield Path(value) + for key in ("filepath", "_filename", "filename"): + value = entry.get(key) + if value: + yield Path(value) + if entry.get("filename"): + yield output_dir / entry["filename"] + + +def _resolve_entry_and_path(info: Dict[str, Any], output_dir: Path) -> tuple[Dict[str, Any], Path]: + """Find downloaded file in yt-dlp metadata.""" + for entry in _iter_download_entries(info): + for candidate in _candidate_paths(entry, output_dir): + if candidate.is_file(): + return entry, candidate + if not candidate.is_absolute(): + resolved = output_dir / candidate + if resolved.is_file(): + return entry, resolved + raise FileNotFoundError("yt-dlp did not report a downloaded media file") + + +def _extract_sha256(info: Dict[str, Any]) -> Optional[str]: + """Extract SHA256 hash from yt-dlp metadata.""" + for payload in [info] + info.get("entries", []): + if not isinstance(payload, dict): + continue + hashes = payload.get("hashes") + if isinstance(hashes, dict): + for key in ("sha256", "sha-256", "sha_256"): + value = hashes.get(key) + if isinstance(value, str) and value.strip(): + return value.strip().lower() + for key in ("sha256", "sha-256", "sha_256"): + value = payload.get(key) + if isinstance(value, str) and value.strip(): + return value.strip().lower() + return None + + +def _get_libgen_download_url(libgen_url: str) -> Optional[str]: + """Extract the actual download link from LibGen redirect URL. + + LibGen URLs like https://libgen.gl/file.php?id=123456 redirect to + actual mirror URLs. This follows the redirect chain to get the real file. + + Args: + libgen_url: LibGen file.php URL + + Returns: + Actual download URL or None if extraction fails + """ + try: + import requests + from urllib.parse import urlparse + + # Check if this is a LibGen URL + parsed = urlparse(libgen_url) + if 'libgen' not in parsed.netloc.lower(): + return None + + if '/file.php' not in parsed.path.lower(): + return None + + # LibGen redirects to actual mirrors, follow redirects to get final URL + session = requests.Session() + session.headers.update({ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' + }) + + debug(f"Following LibGen redirect chain for: {libgen_url}") + + # First, get the page and look for direct download link + try: + response = session.get(libgen_url, timeout=10, allow_redirects=True) + final_url = response.url + + # Try to find actual download link in the page + try: + from bs4 import BeautifulSoup + soup = BeautifulSoup(response.content, 'html.parser') + + # Look for download links - LibGen typically has forms with download buttons + # Look for all links and forms that might lead to download + for link in soup.find_all('a'): + href = link.get('href') + if href and isinstance(href, str): + # Look for direct file links or get.php redirects + if 'get.php' in href.lower() or href.endswith(('.pdf', '.epub', '.djvu', '.mobi')): + download_url = href if href.startswith('http') else urljoin(final_url, href) + debug(f"Found download link: {download_url}") + return download_url + except ImportError: + pass # BeautifulSoup not available + + # If we followed redirects successfully, return the final URL + # This handles cases where libgen redirects to a direct download mirror + if final_url != libgen_url: + debug(f"LibGen resolved to mirror: {final_url}") + return final_url + + except requests.RequestException as e: + 
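+            # requests raised while following the redirect chain (timeout, connection
+            # error, etc.); log it and fall back to a lightweight HEAD request to
+            # resolve the mirror URL instead.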
log(f"Error following LibGen redirects: {e}", file=sys.stderr) + # Try head request as fallback + try: + response = session.head(libgen_url, allow_redirects=True, timeout=10) + if response.url != libgen_url: + debug(f"LibGen HEAD resolved to: {response.url}") + return response.url + except: + pass + + return None + + except Exception as e: + log(f"Error resolving LibGen URL: {e}", file=sys.stderr) + return None + + +def _download_direct_file( + url: str, + output_dir: Path, + debug_logger: Optional[DebugLogger] = None, +) -> DownloadMediaResult: + """Download a direct file (PDF, image, document, etc.) without yt-dlp.""" + ensure_directory(output_dir) + + from urllib.parse import unquote, urlparse, parse_qs + import re + + # Extract filename from URL + parsed_url = urlparse(url) + url_path = parsed_url.path + + # Try to get filename from query parameters first (for LibGen and similar services) + # e.g., ?filename=Book+Title.pdf or &download=filename.pdf + filename = None + if parsed_url.query: + query_params = parse_qs(parsed_url.query) + for param_name in ('filename', 'download', 'file', 'name'): + if param_name in query_params and query_params[param_name]: + filename = query_params[param_name][0] + filename = unquote(filename) + break + + # If not found in query params, extract from URL path + if not filename or not filename.strip(): + filename = url_path.split("/")[-1] if url_path else "" + filename = unquote(filename) + + # Remove query strings from filename if any + if "?" in filename: + filename = filename.split("?")[0] + + # Try to get real filename from Content-Disposition header (HEAD request) + try: + with HTTPClient(timeout=10.0) as client: + response = client._request("HEAD", url, follow_redirects=True) + content_disposition = response.headers.get("content-disposition", "") + if content_disposition: + # Extract filename from Content-Disposition header + # Format: attachment; filename="filename.pdf" or filename=filename.pdf + match = re.search(r'filename\*?=(?:"([^"]*)"|([^;\s]*))', content_disposition) + if match: + extracted_name = match.group(1) or match.group(2) + if extracted_name: + filename = unquote(extracted_name) + debug(f"Filename from Content-Disposition: {filename}") + except Exception as e: + log(f"Could not get filename from headers: {e}", file=sys.stderr) + + # Fallback if we still don't have a good filename + if not filename or "." 
not in filename: + filename = "downloaded_file.bin" + + file_path = output_dir / filename + progress_bar = ProgressBar() + + debug(f"Direct download: {filename}") + + try: + start_time = time.time() + downloaded_bytes = [0] + total_bytes = [0] + last_progress_time = [start_time] + + def progress_callback(bytes_downloaded: int, content_length: int) -> None: + downloaded_bytes[0] = bytes_downloaded + total_bytes[0] = content_length + + now = time.time() + if now - last_progress_time[0] >= 0.5 and total_bytes[0] > 0: + elapsed = now - start_time + percent = (bytes_downloaded / content_length) * 100 if content_length > 0 else 0 + speed = bytes_downloaded / elapsed if elapsed > 0 else 0 + eta_seconds = (content_length - bytes_downloaded) / speed if speed > 0 else 0 + + speed_str = progress_bar.format_bytes(speed) + "/s" + minutes, seconds = divmod(int(eta_seconds), 60) + hours, minutes = divmod(minutes, 60) + eta_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}" + + progress_line = progress_bar.format_progress( + percent_str=f"{percent:.1f}%", + downloaded=bytes_downloaded, + total=content_length, + speed_str=speed_str, + eta_str=eta_str, + ) + debug(progress_line) + last_progress_time[0] = now + + with HTTPClient(timeout=30.0) as client: + client.download(url, str(file_path), progress_callback=progress_callback) + + elapsed = time.time() - start_time + avg_speed_str = progress_bar.format_bytes(downloaded_bytes[0] / elapsed if elapsed > 0 else 0) + "/s" + debug(f"✓ Downloaded in {elapsed:.1f}s at {avg_speed_str}") + + # For direct file downloads, create minimal info dict without filename as title + # This prevents creating duplicate title: tags when filename gets auto-generated + # We'll add title back later only if we couldn't extract meaningful tags + info = { + "id": filename.rsplit(".", 1)[0], + "ext": filename.rsplit(".", 1)[1] if "." 
in filename else "bin", + "webpage_url": url, + } + + hash_value = None + try: + hash_value = sha256_file(file_path) + except Exception: + pass + + tags = [] + if extract_ytdlp_tags: + try: + tags = extract_ytdlp_tags(info) + except Exception as e: + log(f"Error extracting tags: {e}", file=sys.stderr) + + # Only use filename as a title tag if we couldn't extract any meaningful tags + # This prevents duplicate title: tags when the filename could be mistaken for metadata + if not any(t.startswith('title:') for t in tags): + # Re-extract tags with filename as title only if needed + info['title'] = filename + tags = [] + if extract_ytdlp_tags: + try: + tags = extract_ytdlp_tags(info) + except Exception as e: + log(f"Error extracting tags with filename: {e}", file=sys.stderr) + + if debug_logger is not None: + debug_logger.write_record( + "direct-file-downloaded", + {"url": url, "path": str(file_path), "hash": hash_value}, + ) + + return DownloadMediaResult( + path=file_path, + info=info, + tags=tags, + source_url=url, + hash_value=hash_value, + ) + + except (httpx.HTTPError, httpx.RequestError) as exc: + log(f"Download error: {exc}", file=sys.stderr) + if debug_logger is not None: + debug_logger.write_record( + "exception", + {"phase": "direct-file", "url": url, "error": str(exc)}, + ) + raise DownloadError(f"Failed to download {url}: {exc}") from exc + except Exception as exc: + log(f"Error downloading file: {exc}", file=sys.stderr) + if debug_logger is not None: + debug_logger.write_record( + "exception", + { + "phase": "direct-file", + "url": url, + "error": str(exc), + "traceback": traceback.format_exc(), + }, + ) + raise DownloadError(f"Error downloading file: {exc}") from exc + + +def probe_url(url: str, no_playlist: bool = False) -> Optional[Dict[str, Any]]: + """Probe URL to extract metadata WITHOUT downloading. + + Args: + url: URL to probe + no_playlist: If True, ignore playlists and probe only the single video + + Returns: + Dict with keys: extractor, title, entries (if playlist), duration, etc. + Returns None if not supported by yt-dlp. + """ + if not is_url_supported_by_ytdlp(url): + return None + + _ensure_yt_dlp_ready() + + assert yt_dlp is not None + try: + # Extract info without downloading + # Use extract_flat='in_playlist' to get full metadata for playlist items + ydl_opts = { + "quiet": True, # Suppress all output + "no_warnings": True, + "socket_timeout": 10, + "retries": 3, + "skip_download": True, # Don't actually download + "extract_flat": "in_playlist", # Get playlist with metadata for each entry + "noprogress": True, # No progress bars + "quiet": True, + } + + # Add no_playlist option if specified + if no_playlist: + ydl_opts["noplaylist"] = True + + with yt_dlp.YoutubeDL(ydl_opts) as ydl: # type: ignore[arg-type] + info = ydl.extract_info(url, download=False) + + if not isinstance(info, dict): + return None + + # Extract relevant fields + return { + "extractor": info.get("extractor", ""), + "title": info.get("title", ""), + "entries": info.get("entries", []), # Will be populated if playlist + "duration": info.get("duration"), + "uploader": info.get("uploader"), + "description": info.get("description"), + "url": url, + } + except Exception as exc: + log(f"Probe failed for {url}: {exc}") + return None + + +def download_media( + opts: DownloadOptions, + *, + debug_logger: Optional[DebugLogger] = None, +) -> DownloadMediaResult: + """Download media from URL using yt-dlp or direct HTTP download. + + Args: + opts: DownloadOptions with url, mode, output_dir, etc. 
+ debug_logger: Optional debug logger for troubleshooting + + Returns: + DownloadMediaResult with path, info, tags, hash + + Raises: + DownloadError: If download fails + """ + # Handle LibGen URLs specially + # file.php redirects to mirrors, get.php is direct from modern API + if 'libgen' in opts.url.lower(): + if '/get.php' in opts.url.lower(): + # Modern API get.php links are direct downloads from mirrors (not file redirects) + log(f"Detected LibGen get.php URL, downloading directly...") + if debug_logger is not None: + debug_logger.write_record("libgen-direct", {"url": opts.url}) + return _download_direct_file(opts.url, opts.output_dir, debug_logger) + elif '/file.php' in opts.url.lower(): + # Old-style file.php redirects to mirrors, we need to resolve + log(f"Detected LibGen file.php URL, resolving to actual mirror...") + actual_url = _get_libgen_download_url(opts.url) + if actual_url and actual_url != opts.url: + log(f"Resolved LibGen URL to mirror: {actual_url}") + opts.url = actual_url + # After resolution, this will typically be an onion link or direct file + # Skip yt-dlp for this (it won't support onion/mirrors), go direct + if debug_logger is not None: + debug_logger.write_record("libgen-resolved", {"original": opts.url, "resolved": actual_url}) + return _download_direct_file(opts.url, opts.output_dir, debug_logger) + else: + log(f"Could not resolve LibGen URL, trying direct download anyway", file=sys.stderr) + if debug_logger is not None: + debug_logger.write_record("libgen-resolve-failed", {"url": opts.url}) + return _download_direct_file(opts.url, opts.output_dir, debug_logger) + + # Try yt-dlp first if URL is supported + if not is_url_supported_by_ytdlp(opts.url): + log(f"URL not supported by yt-dlp, trying direct download: {opts.url}") + if debug_logger is not None: + debug_logger.write_record("direct-file-attempt", {"url": opts.url}) + return _download_direct_file(opts.url, opts.output_dir, debug_logger) + + _ensure_yt_dlp_ready() + + ytdl_options = _build_ytdlp_options(opts) + log(f"Starting yt-dlp download: {opts.url}") + if debug_logger is not None: + debug_logger.write_record("ytdlp-start", {"url": opts.url}) + + assert yt_dlp is not None + try: + with yt_dlp.YoutubeDL(ytdl_options) as ydl: # type: ignore[arg-type] + info = ydl.extract_info(opts.url, download=True) + except Exception as exc: + log(f"yt-dlp failed: {exc}", file=sys.stderr) + if debug_logger is not None: + debug_logger.write_record( + "exception", + { + "phase": "yt-dlp", + "error": str(exc), + "traceback": traceback.format_exc(), + }, + ) + raise DownloadError("yt-dlp download failed") from exc + + if not isinstance(info, dict): + log(f"Unexpected yt-dlp response: {type(info)}", file=sys.stderr) + raise DownloadError("Unexpected yt-dlp response type") + + info_dict: Dict[str, Any] = info + if debug_logger is not None: + debug_logger.write_record( + "ytdlp-info", + { + "keys": sorted(info_dict.keys()), + "is_playlist": bool(info_dict.get("entries")), + }, + ) + + try: + entry, media_path = _resolve_entry_and_path(info_dict, opts.output_dir) + except FileNotFoundError as exc: + log(f"Error: {exc}", file=sys.stderr) + if debug_logger is not None: + debug_logger.write_record( + "exception", + {"phase": "resolve-path", "error": str(exc)}, + ) + raise DownloadError(str(exc)) from exc + + if debug_logger is not None: + debug_logger.write_record( + "resolved-media", + {"path": str(media_path), "entry_keys": sorted(entry.keys())}, + ) + + # Extract hash from metadata or compute + hash_value = 
_extract_sha256(entry) or _extract_sha256(info_dict) + if not hash_value: + try: + hash_value = sha256_file(media_path) + except OSError as exc: + if debug_logger is not None: + debug_logger.write_record( + "hash-error", + {"path": str(media_path), "error": str(exc)}, + ) + + # Extract tags using metadata.py + tags = [] + if extract_ytdlp_tags: + try: + tags = extract_ytdlp_tags(entry) + except Exception as e: + log(f"Error extracting tags: {e}", file=sys.stderr) + + source_url = ( + entry.get("webpage_url") + or entry.get("original_url") + or entry.get("url") + ) + + log(f"✓ Downloaded: {media_path.name} ({len(tags)} tags)") + if debug_logger is not None: + debug_logger.write_record( + "downloaded", + { + "path": str(media_path), + "tag_count": len(tags), + "source_url": source_url, + "sha256": hash_value, + }, + ) + + return DownloadMediaResult( + path=media_path, + info=entry, + tags=tags, + source_url=source_url, + hash_value=hash_value, + ) + + +__all__ = [ + "download_media", + "is_url_supported_by_ytdlp", + "DownloadError", + "DownloadOptions", + "DownloadMediaResult", +] diff --git a/helper/file_server.py b/helper/file_server.py new file mode 100644 index 0000000..2d7e46f --- /dev/null +++ b/helper/file_server.py @@ -0,0 +1,180 @@ +"""Simple HTTP file server for serving files in web mode.""" + +import threading +import socket +import logging +from http.server import HTTPServer, SimpleHTTPRequestHandler +from pathlib import Path +from typing import Optional +import mimetypes +import urllib.parse + +logger = logging.getLogger(__name__) + +# Global server instance +_file_server: Optional[HTTPServer] = None +_server_thread: Optional[threading.Thread] = None +_server_port: int = 8001 + + +class FileServerHandler(SimpleHTTPRequestHandler): + """HTTP request handler for file serving.""" + + def do_GET(self): + """Handle GET requests.""" + # Parse the path + parsed_path = urllib.parse.urlparse(self.path) + file_path = urllib.parse.unquote(parsed_path.path) + + # Remove leading slash + if file_path.startswith('/'): + file_path = file_path[1:] + + # Decode the file path (it's URL encoded) + try: + full_path = Path(file_path).resolve() + + # Security check: ensure the path is within allowed directories + # For now, allow all paths (can be restricted later) + + if full_path.is_file() and full_path.exists(): + # Serve the file + logger.debug(f"Serving file: {full_path}") + + # Determine content type + content_type, _ = mimetypes.guess_type(str(full_path)) + if content_type is None: + content_type = 'application/octet-stream' + + try: + with open(full_path, 'rb') as f: + file_content = f.read() + + self.send_response(200) + self.send_header('Content-type', content_type) + self.send_header('Content-Length', str(len(file_content))) + self.send_header('Content-Disposition', f'attachment; filename="{full_path.name}"') + self.end_headers() + self.wfile.write(file_content) + logger.info(f"Successfully served file: {full_path.name}") + return + except Exception as e: + logger.error(f"Error serving file: {e}") + self.send_error(500, "Internal server error") + return + else: + logger.warning(f"File not found: {full_path}") + self.send_error(404, "File not found") + return + + except Exception as e: + logger.error(f"Error handling request: {e}") + self.send_error(400, "Bad request") + + def log_message(self, format, *args): + """Override to use our logger instead of stderr.""" + logger.debug(format % args) + + +def get_local_ip() -> Optional[str]: + """Get the local IP address that's accessible from other 
devices.""" + try: + # Connect to a remote server to determine local IP + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + s.connect(("8.8.8.8", 80)) + ip = s.getsockname()[0] + s.close() + return ip + except Exception as e: + logger.warning(f"Failed to determine local IP: {e}") + return None + + +def start_file_server(port: int = 8001) -> Optional[str]: + """Start the HTTP file server. + + Args: + port: Port to serve on + + Returns: + Server URL if successful, None otherwise + """ + global _file_server, _server_thread, _server_port + + if _file_server is not None: + logger.debug(f"File server already running on port {_server_port}") + local_ip = get_local_ip() + if local_ip: + return f"http://{local_ip}:{_server_port}" + return None + + try: + _server_port = port + + # Create server + server_address = ('', port) + _file_server = HTTPServer(server_address, FileServerHandler) + + # Start in daemon thread + _server_thread = threading.Thread(target=_file_server.serve_forever, daemon=True) + _server_thread.start() + + logger.info(f"File server started on port {port}") + + # Get local IP + local_ip = get_local_ip() + if local_ip: + server_url = f"http://{local_ip}:{port}" + logger.info(f"File server accessible at: {server_url}") + return server_url + else: + logger.warning("Could not determine local IP") + return None + + except Exception as e: + logger.error(f"Failed to start file server: {e}") + _file_server = None + _server_thread = None + return None + + +def stop_file_server(): + """Stop the HTTP file server.""" + global _file_server, _server_thread + + if _file_server is not None: + try: + _file_server.shutdown() + _file_server.server_close() + logger.info("File server stopped") + except Exception as e: + logger.error(f"Error stopping file server: {e}") + finally: + _file_server = None + _server_thread = None + + +def get_file_url(file_path: Path, server_url: Optional[str] = None) -> Optional[str]: + """Get the HTTP URL for a file. + + Args: + file_path: Path to the file + server_url: Base server URL (gets determined if None) + + Returns: + HTTP URL to the file, or None if server not running + """ + if not file_path.exists(): + logger.warning(f"File does not exist: {file_path}") + return None + + if server_url is None: + local_ip = get_local_ip() + if not local_ip: + logger.error("Cannot determine local IP for file URL") + return None + server_url = f"http://{local_ip}:{_server_port}" + + # URL encode the file path + encoded_path = urllib.parse.quote(str(file_path.resolve())) + return f"{server_url}/{encoded_path}" diff --git a/helper/file_storage.py b/helper/file_storage.py new file mode 100644 index 0000000..a1eefdb --- /dev/null +++ b/helper/file_storage.py @@ -0,0 +1,1039 @@ +"""File storage abstraction layer for uploading files to different services. + +Supports multiple backend storage services (0x0.st, local directories, Hydrus, etc.) +with a unified interface. + +Example: + storage = FileStorage() + + # Upload to 0x0.st + url = storage["0x0"].upload(Path("file.mp3")) + + # Copy to local directory + path = storage["local"].upload(Path("file.mp3"), location="/home/user/files") + + # Upload to Hydrus + hash_result = storage["hydrus"].upload(file_path, config=config) +""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Any, Dict, Optional +import sys +import shutil +import requests + +from helper.logger import log, debug + + +class StorageBackend(ABC): + """Abstract base class for file storage backends. 
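+
+    Example (minimal sketch of a custom subclass; the class name and its no-op
+    behaviour are illustrative only):
+
+        class NullBackend(StorageBackend):
+            def get_name(self) -> str:
+                return "null"
+
+            def upload(self, file_path: Path, **kwargs: Any) -> str:
+                # A real backend returns a URL, hash or destination path here.
+                return str(file_path)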
+ + Backends can optionally support searching by implementing the search() method. + """ + + @abstractmethod + def upload(self, file_path: Path, **kwargs: Any) -> str: + """Upload a file and return a result identifier (URL, hash, path, etc.). + + Args: + file_path: Path to the file to upload + **kwargs: Backend-specific options + + Returns: + Result identifier (e.g., URL for 0x0.st, hash for Hydrus, path for local) + + Raises: + Exception: If upload fails + """ + + def search(self, query: str, **kwargs: Any) -> list[Dict[str, Any]]: + """Search for files in backends that support it. + + This method is optional and only implemented by searchable backends + (e.g., Hydrus, Debrid, Soulseek). + + Args: + query: Search query string + **kwargs: Backend-specific search options + + Returns: + List of search results, each as a dict with backend-specific fields. + Common fields: 'name', 'size', 'hash', 'url', 'id', etc. + + Raises: + NotImplementedError: If backend doesn't support searching + Exception: If search fails + + Example: + results = storage["hydrus"].search("music artist:john") + for result in results: + print(result['name'], result['hash']) + """ + raise NotImplementedError(f"{self.get_name()} backend does not support searching") + + def supports_search(self) -> bool: + """Check if this backend supports searching. + + Returns: + True if search() is implemented, False otherwise + """ + return self.search.__func__ is not StorageBackend.search + + +class LocalStorageBackend(StorageBackend): + """File storage backend for local file system copy.""" + + def __init__(self, location: Optional[str] = None) -> None: + """Initialize local storage backend. + + Args: + location: Default directory path for storage operations + """ + self._location = location + + def get_name(self) -> str: + return "local" + + def upload(self, file_path: Path, **kwargs: Any) -> str: + """Copy or move file to a local directory. 
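+
+        Example (illustrative paths; assumes a FileStorage instance named storage):
+            storage["local"].upload(Path("song.mp3"), location="~/media")
+            storage["local"].upload(Path("song.mp3"), move=True)  # move instead of copy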
+ + Args: + file_path: Path to the file to upload + location: Destination directory path (uses default if not provided) + move: When True, move the file instead of copying (default: False) + + Returns: + Absolute path to the copied/moved file + + Raises: + ValueError: If location not provided and no default configured + Exception: If copy fails or duplicate detected + """ + from helper.utils import unique_path as utils_unique_path + from helper.utils import sha256_file + from helper.local_library import LocalLibraryDB + + location = kwargs.get("location") or self._location + move_file = bool(kwargs.get("move")) + if not location: + raise ValueError("'location' parameter required for local storage (not configured)") + + try: + # Compute file hash + file_hash = sha256_file(file_path) + log(f"File hash: {file_hash}", file=sys.stderr) + + dest_dir = Path(location).expanduser() + dest_dir.mkdir(parents=True, exist_ok=True) + + # Check for duplicate files using LocalLibraryDB (fast - uses index) + try: + db = LocalLibraryDB(dest_dir) + existing_path = db.search_by_hash(file_hash) + if existing_path and existing_path.exists(): + log( + f"✓ File already in local storage: {existing_path}", + file=sys.stderr, + ) + return str(existing_path) + except Exception as exc: + log(f"⚠️ Could not check for duplicates in DB: {exc}", file=sys.stderr) + + dest_file = dest_dir / file_path.name + dest_file = utils_unique_path(dest_file) + + if move_file: + shutil.move(str(file_path), dest_file) + log(f"✅ Local move: {dest_file}", file=sys.stderr) + else: + shutil.copy2(file_path, dest_file) + log(f"✅ Local copy: {dest_file}", file=sys.stderr) + return str(dest_file) + except Exception as exc: + log(f"❌ Local copy failed: {exc}", file=sys.stderr) + raise + + def search(self, query: str, **kwargs: Any) -> list[Dict[str, Any]]: + """Search local database for files by title tag or filename. 
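+
+        Example (illustrative queries; assumes a configured default location):
+            storage["local"].search("title:thesis*")  # namespace:pattern tag search
+            storage["local"].search("lecture")        # filename / simple-tag substring
+            storage["local"].search("*", limit=50)    # match all indexed files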
+ + Args: + query: Search string supporting: + - Title tag search: "title:document" or just searches DB for matching title tags + - Tag namespace search: "creator:Mac*" matches tags in database + - Filename fallback: if query not in DB, searches filesystem + - "*" means "match all files" + location: Directory to search in (uses default if not provided) + recursive: Search subdirectories (default: True) + + Returns: + List of dicts with 'name', 'path', 'size' fields + """ + from fnmatch import fnmatch + from helper.local_library import LocalLibraryDB + + location = kwargs.get("location") or self._location + if not location: + raise ValueError("'location' parameter required for local search (not configured)") + + limit = kwargs.get("limit") + try: + limit = int(limit) if limit is not None else None + except (TypeError, ValueError): + limit = None + if isinstance(limit, int) and limit <= 0: + limit = None + + query_lower = query.lower() + match_all = query_lower == "*" + results = [] + search_dir = Path(location).expanduser() + debug(f"Searching local storage at: {search_dir}") + + try: + if not search_dir.exists(): + debug(f"Search directory does not exist: {search_dir}") + return results + + # Try database search first (much faster than filesystem scan) + try: + debug(f"Connecting to local library DB at {search_dir}") + db = LocalLibraryDB(search_dir) + cursor = db.connection.cursor() + + # Check if query is a tag namespace search (format: "namespace:pattern") + if ":" in query and not query.startswith(":"): + namespace, pattern = query.split(":", 1) + namespace = namespace.strip().lower() + pattern = pattern.strip().lower() + debug(f"Performing namespace search: {namespace}:{pattern}") + + # Search for tags matching the namespace and pattern + query_pattern = f"{namespace}:%" + + cursor.execute(""" + SELECT DISTINCT f.id, f.file_path, f.file_size + FROM files f + JOIN tags t ON f.id = t.file_id + WHERE LOWER(t.tag) LIKE ? + ORDER BY f.file_path + LIMIT ? + """, (query_pattern, limit or 1000)) + + rows = cursor.fetchall() + debug(f"Found {len(rows)} potential matches in DB") + + # Filter results by pattern match + for file_id, file_path_str, size_bytes in rows: + if not file_path_str: + continue + + # Get the file's tags and check if any match the pattern + cursor.execute(""" + SELECT DISTINCT tag FROM tags + WHERE file_id = ? + AND LOWER(tag) LIKE ? + """, (file_id, query_pattern)) + + tags = [row[0] for row in cursor.fetchall()] + + # Check if any tag matches the pattern (case-insensitive wildcard) + for tag in tags: + tag_lower = tag.lower() + # Extract the value part after "namespace:" + if tag_lower.startswith(f"{namespace}:"): + value = tag_lower[len(namespace)+1:] + # Use fnmatch for wildcard matching + if fnmatch(value, pattern): + file_path = Path(file_path_str) + if file_path.exists(): + path_str = str(file_path) + if size_bytes is None: + size_bytes = file_path.stat().st_size + + # Fetch all tags for this file + cursor.execute(""" + SELECT tag FROM tags WHERE file_id = ? 
+ """, (file_id,)) + all_tags = [row[0] for row in cursor.fetchall()] + + results.append({ + "name": file_path.name, + "title": file_path.name, + "path": path_str, + "target": path_str, + "origin": "local", + "size": size_bytes, + "size_bytes": size_bytes, + "tags": all_tags, + }) + else: + debug(f"File missing on disk: {file_path}") + break # Don't add same file multiple times + + if limit is not None and len(results) >= limit: + return results + + elif not match_all: + # Search by filename or simple tags (namespace-agnostic for plain text) + # For plain text search, match: + # 1. Filenames containing the query + # 2. Simple tags (without namespace) containing the query + # NOTE: Does NOT match namespaced tags (e.g., "joe" won't match "channel:Joe Mullan") + # Use explicit namespace search for that (e.g., "channel:joe*") + query_pattern = f"%{query_lower}%" + debug(f"Performing filename/tag search: {query_pattern}") + + cursor.execute(""" + SELECT DISTINCT f.id, f.file_path, f.file_size + FROM files f + WHERE LOWER(f.file_path) LIKE ? + ORDER BY f.file_path + LIMIT ? + """, (query_pattern, limit or 1000)) + + rows = cursor.fetchall() + debug(f"Found {len(rows)} filename matches in DB") + seen_files = set() + for file_id, file_path_str, size_bytes in rows: + if not file_path_str or file_path_str in seen_files: + continue + seen_files.add(file_path_str) + + file_path = Path(file_path_str) + if file_path.exists(): + path_str = str(file_path) + if size_bytes is None: + size_bytes = file_path.stat().st_size + + # Fetch tags for this file + cursor.execute(""" + SELECT tag FROM tags WHERE file_id = ? + """, (file_id,)) + tags = [row[0] for row in cursor.fetchall()] + + results.append({ + "name": file_path.name, + "title": file_path.name, + "path": path_str, + "target": path_str, + "origin": "local", + "size": size_bytes, + "size_bytes": size_bytes, + "tags": tags, + }) + + # Also search for simple tags (without namespace) containing the query + cursor.execute(""" + SELECT DISTINCT f.id, f.file_path, f.file_size + FROM files f + JOIN tags t ON f.id = t.file_id + WHERE LOWER(t.tag) LIKE ? AND LOWER(t.tag) NOT LIKE '%:%' + ORDER BY f.file_path + LIMIT ? + """, (query_pattern, limit or 1000)) + + tag_rows = cursor.fetchall() + for file_id, file_path_str, size_bytes in tag_rows: + if not file_path_str or file_path_str in seen_files: + continue + seen_files.add(file_path_str) + + file_path = Path(file_path_str) + if file_path.exists(): + path_str = str(file_path) + if size_bytes is None: + size_bytes = file_path.stat().st_size + + # Fetch tags for this file + cursor.execute(""" + SELECT tag FROM tags WHERE file_id = ? + """, (file_id,)) + tags = [row[0] for row in cursor.fetchall()] + + results.append({ + "name": file_path.name, + "title": file_path.name, + "path": path_str, + "target": path_str, + "origin": "local", + "size": size_bytes, + "size_bytes": size_bytes, + "tags": tags, + }) + + if limit is not None and len(results) >= limit: + return results + + else: + # Match all - get all files from database + cursor.execute(""" + SELECT id, file_path, file_size + FROM files + ORDER BY file_path + LIMIT ? + """, (limit or 1000,)) + + rows = cursor.fetchall() + for file_id, file_path_str, size_bytes in rows: + if file_path_str: + file_path = Path(file_path_str) + if file_path.exists(): + path_str = str(file_path) + if size_bytes is None: + size_bytes = file_path.stat().st_size + + # Fetch tags for this file + cursor.execute(""" + SELECT tag FROM tags WHERE file_id = ? 
+ """, (file_id,)) + tags = [row[0] for row in cursor.fetchall()] + + results.append({ + "name": file_path.name, + "title": file_path.name, + "path": path_str, + "target": path_str, + "origin": "local", + "size": size_bytes, + "size_bytes": size_bytes, + "tags": tags, + }) + + if results: + debug(f"Returning {len(results)} results from DB") + return results + else: + debug("No results found in DB, falling back to filesystem scan") + + except Exception as e: + log(f"⚠️ Database search failed: {e}", file=sys.stderr) + debug(f"DB search exception details: {e}") + + # Fallback to filesystem search if database search fails or returns nothing + debug("Starting filesystem scan...") + recursive = kwargs.get("recursive", True) + pattern = "**/*" if recursive else "*" + + count = 0 + for file_path in search_dir.glob(pattern): + if not file_path.is_file(): + continue + lower_name = file_path.name.lower() + if lower_name.endswith('.tags') or lower_name.endswith('.metadata') \ + or lower_name.endswith('.notes') or lower_name.endswith('.tags.txt'): + continue + if not (match_all or query_lower in lower_name): + continue + + size_bytes = file_path.stat().st_size + path_str = str(file_path) + results.append({ + "name": file_path.name, + "title": file_path.name, + "path": path_str, + "target": path_str, + "origin": "local", + "size": size_bytes, + "size_bytes": size_bytes, + }) + count += 1 + + if limit is not None and len(results) >= limit: + break + + debug(f"Filesystem scan found {count} matches") + + except Exception as exc: + log(f"❌ Local search failed: {exc}", file=sys.stderr) + raise + + return results + + +class HydrusStorageBackend(StorageBackend): + """File storage backend for Hydrus client.""" + + def __init__(self, config: Optional[Dict[str, Any]] = None) -> None: + """Initialize Hydrus storage backend. + + Args: + config: Configuration dict with Hydrus settings (HydrusNetwork section) + """ + self._config = config or {} + + def get_name(self) -> str: + return "hydrus" + + def upload(self, file_path: Path, **kwargs: Any) -> str: + """Upload file to Hydrus. 
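+
+        Example (sketch; assumes a reachable Hydrus client configured under HydrusNetwork):
+            file_hash = storage["hydrus"].upload(Path("file.mp3"), tags=["title:My Song"])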
+ + Args: + file_path: Path to the file to upload + tags: Optional list of tags to add (uses default config if not provided) + config: Optional override for config (uses default if not provided) + + Returns: + File hash from Hydrus + + Raises: + Exception: If upload fails + """ + from helper import hydrus as hydrus_wrapper + from helper.utils import sha256_file + + config = kwargs.get("config") or self._config + if not config: + raise ValueError("'config' parameter required for Hydrus storage (not configured)") + + tags = kwargs.get("tags", []) + + try: + # Compute file hash + file_hash = sha256_file(file_path) + debug(f"File hash: {file_hash}") + + # Build Hydrus client + client = hydrus_wrapper.get_client(config) + if client is None: + raise Exception("Hydrus client unavailable") + + # Check if file already exists in Hydrus + try: + metadata = client.fetch_file_metadata(hashes=[file_hash]) + if metadata and isinstance(metadata, dict): + files = metadata.get("file_metadata", []) + if files: + log( + f"ℹ️ Duplicate detected - file already in Hydrus with hash: {file_hash}", + file=sys.stderr, + ) + # Even if duplicate, we should add tags if provided + if tags: + try: + service_name = hydrus_wrapper.get_tag_service_name(config) + except Exception: + service_name = "my tags" + + try: + debug(f"Adding {len(tags)} tag(s) to existing file in Hydrus: {tags}") + client.add_tags(file_hash, tags, service_name) + log(f"✅ Tags added to existing file via '{service_name}'", file=sys.stderr) + except Exception as exc: + log(f"⚠️ Failed to add tags to existing file: {exc}", file=sys.stderr) + + return file_hash + except Exception: + pass + + # Upload file to Hydrus + log(f"Uploading to Hydrus: {file_path.name}", file=sys.stderr) + response = client.add_file(file_path) + + # Extract hash from response + hydrus_hash: Optional[str] = None + if isinstance(response, dict): + hydrus_hash = response.get("hash") or response.get("file_hash") + if not hydrus_hash: + hashes = response.get("hashes") + if isinstance(hashes, list) and hashes: + hydrus_hash = hashes[0] + + if not hydrus_hash: + raise Exception(f"Hydrus response missing file hash: {response}") + + file_hash = hydrus_hash + log(f"✅ File uploaded to Hydrus: {file_hash}", file=sys.stderr) + + # Add tags if provided + if tags: + try: + service_name = hydrus_wrapper.get_tag_service_name(config) + except Exception: + service_name = "my tags" + + try: + debug(f"Adding {len(tags)} tag(s) to Hydrus: {tags}") + client.add_tags(file_hash, tags, service_name) + log(f"✅ Tags added via '{service_name}'", file=sys.stderr) + except Exception as exc: + log(f"⚠️ Failed to add tags: {exc}", file=sys.stderr) + + return file_hash + + except Exception as exc: + log(f"❌ Hydrus upload failed: {exc}", file=sys.stderr) + raise + + def search(self, query: str, **kwargs: Any) -> list[Dict[str, Any]]: + """Search Hydrus database for files matching query. + + Args: + query: Search query (tags, filenames, hashes, etc.) 
+ limit: Maximum number of results to return (default: 100) + config: Optional override for config (uses default if not provided) + + Returns: + List of dicts with 'name', 'hash', 'size', 'tags' fields + + Example: + results = storage["hydrus"].search("artist:john_doe music") + results = storage["hydrus"].search("Simple Man") + """ + from helper import hydrus as hydrus_wrapper + + config = kwargs.get("config") or self._config + if not config: + raise ValueError("'config' parameter required for Hydrus search (not configured)") + + limit = kwargs.get("limit", 100) + + try: + client = hydrus_wrapper.get_client(config) + if client is None: + raise Exception("Hydrus client unavailable") + + debug(f"Searching Hydrus for: {query}") + + # Parse the query into tags + # Handle both simple tags and complex queries + # "*" means "match all" - use system:everything tag in Hydrus + if query.strip() == "*": + # Use system:everything to match all files in Hydrus + tags = ["system:everything"] + else: + query_lower = query.lower().strip() + # If query doesn't have a namespace (no ':'), search all files and filter by title/tags + # If query has explicit namespace, use it as a tag search + if ':' not in query_lower: + # No namespace provided: search all files, then filter by title/tags containing the query + tags = ["system:everything"] + else: + # User provided explicit namespace (e.g., "creator:john" or "system:has_audio") + # Use it as a tag search + tags = [query_lower] + + if not tags: + debug(f"Found 0 result(s)") + return [] + + # Search files with the tags + search_result = client.search_files( + tags=tags, + return_hashes=True, + return_file_ids=True + ) + + # Extract file IDs from search result + file_ids = search_result.get("file_ids", []) + hashes = search_result.get("hashes", []) + + if not file_ids and not hashes: + debug(f"Found 0 result(s)") + return [] + + # Fetch metadata for the found files + results = [] + query_lower = query.lower().strip() + search_terms = set(query_lower.split()) # For substring matching + + if file_ids: + metadata = client.fetch_file_metadata(file_ids=file_ids) + metadata_list = metadata.get("metadata", []) + + for meta in metadata_list: + if len(results) >= limit: + break + + file_id = meta.get("file_id") + hash_hex = meta.get("hash") + size = meta.get("size", 0) + + # Get tags for this file and extract title + tags_set = meta.get("tags", {}) + all_tags = [] + title = f"Hydrus File {file_id}" # Default fallback + all_tags_str = "" # For substring matching + + # debug(f"[HydrusBackend.search] Processing file_id={file_id}, tags type={type(tags_set)}") + + if isinstance(tags_set, dict): + # debug(f"[HydrusBackend.search] Tags payload keys: {list(tags_set.keys())}") + for service_name, service_tags in tags_set.items(): + # debug(f"[HydrusBackend.search] Processing service: {service_name}") + if isinstance(service_tags, dict): + storage_tags = service_tags.get("storage_tags", {}) + if isinstance(storage_tags, dict): + for tag_type, tag_list in storage_tags.items(): + # debug(f"[HydrusBackend.search] Tag type: {tag_type}, count: {len(tag_list) if isinstance(tag_list, list) else 0}") + if isinstance(tag_list, list): + for tag in tag_list: + tag_text = str(tag) if tag else "" + if tag_text: + # debug(f"[HydrusBackend.search] Tag: {tag_text}") + all_tags.append(tag_text) + all_tags_str += " " + tag_text.lower() + # Extract title: namespace + if tag_text.startswith("title:"): + title = tag_text[6:].strip() # Remove "title:" prefix + # debug(f"[HydrusBackend.search] ✓ Extracted 
title: {title}") + break + if title != f"Hydrus File {file_id}": + break + + # Filter results based on query type + # If user provided explicit namespace (has ':'), don't do substring filtering + # Just include what the tag search returned + has_namespace = ':' in query_lower + + if has_namespace: + # Explicit namespace search - already filtered by Hydrus tag search + # Include this result as-is + results.append({ + "hash": hash_hex, + "hash_hex": hash_hex, + "target": hash_hex, + "name": title, + "title": title, + "size": size, + "size_bytes": size, + "origin": "hydrus", + "tags": all_tags, + "file_id": file_id, + "mime": meta.get("mime"), + }) + else: + # Free-form search: check if search terms match the title or tags + # Match if ANY search term is found in title or tags (OR logic) + if query_lower == "*" or any(term in all_tags_str or term in title.lower() for term in search_terms): + results.append({ + "hash": hash_hex, + "hash_hex": hash_hex, + "target": hash_hex, + "name": title, + "title": title, + "size": size, + "size_bytes": size, + "origin": "hydrus", + "tags": all_tags, + "file_id": file_id, + "mime": meta.get("mime"), + }) + + debug(f"Found {len(results)} result(s)") + return results[:limit] + + except Exception as exc: + log(f"❌ Hydrus search failed: {exc}", file=sys.stderr) + import traceback + traceback.print_exc(file=sys.stderr) + raise + + +class DebridStorageBackend(StorageBackend): + """File storage backend for Debrid services (AllDebrid, RealDebrid, etc.).""" + + def __init__(self, api_key: Optional[str] = None) -> None: + """Initialize Debrid storage backend. + + Args: + api_key: API key for Debrid service (e.g., from config["Debrid"]["All-debrid"]) + """ + self._api_key = api_key + + def get_name(self) -> str: + return "debrid" + + def upload(self, file_path: Path, **kwargs: Any) -> str: + """Upload file to Debrid service. + + Args: + file_path: Path to the file to upload + **kwargs: Debrid-specific options + + Returns: + Debrid link/URL + + Raises: + NotImplementedError: Debrid upload not yet implemented + """ + raise NotImplementedError("Debrid upload not yet implemented") + + def search(self, query: str, **kwargs: Any) -> list[Dict[str, Any]]: + """Search Debrid for files matching query. + + Searches through available magnets in AllDebrid storage and returns + matching results with download links. 
+ + Args: + query: Search query string (filename or magnet name pattern) + limit: Maximum number of results to return (default: 50) + api_key: Optional override for API key (uses default if not provided) + + Returns: + List of dicts with keys: + - 'name': File/magnet name + - 'title': Same as name (for compatibility) + - 'url': AllDebrid download link + - 'size': File size in bytes + - 'magnet_id': AllDebrid magnet ID + - 'origin': 'debrid' + - 'annotations': Status and seeders info + + Example: + results = storage["debrid"].search("movie.mkv") + for result in results: + print(f"{result['name']} - {result['size']} bytes") + """ + api_key = kwargs.get("api_key") or self._api_key + if not api_key: + raise ValueError("'api_key' parameter required for Debrid search (not configured)") + + limit = kwargs.get("limit", 50) + + try: + from helper.alldebrid import AllDebridClient + + debug(f"Searching AllDebrid for: {query}") + + client = AllDebridClient(api_key=api_key) + + # STEP 1: Get magnet status list + try: + response = client._request('magnet/status') + magnets_data = response.get('data', {}) + magnets = magnets_data.get('magnets', []) + if not isinstance(magnets, list): + magnets = [magnets] if magnets else [] + debug(f"[debrid_search] Got {len(magnets)} total magnets") + except Exception as e: + log(f"⚠ Failed to get magnets list: {e}", file=sys.stderr) + magnets = [] + + # Filter by query for relevant magnets + query_lower = query.lower() + matching_magnet_ids = [] + magnet_info_map = {} # Store status info for later + + # "*" means "match all" - include all magnets + match_all = query_lower == "*" + + for magnet in magnets: + filename = magnet.get('filename', '').lower() + status_code = magnet.get('statusCode', 0) + magnet_id = magnet.get('id') + + # Only include ready or nearly-ready magnets (skip error states 5+) + if status_code not in [0, 1, 2, 3, 4]: + continue + + # Match query against filename (or match all if query is "*") + if not match_all and query_lower not in filename: + continue + + matching_magnet_ids.append(magnet_id) + magnet_info_map[magnet_id] = magnet + debug(f"[debrid_search] ✓ Matched magnet {magnet_id}: {filename}") + + debug(f"[debrid_search] Found {len(matching_magnet_ids)} matching magnets") + + results = [] + + # Return one result per magnet (not per file) + # This keeps search results clean and allows user to download entire magnet at once + for magnet_id in matching_magnet_ids: + magnet_status = magnet_info_map.get(magnet_id, {}) + filename = magnet_status.get('filename', 'Unknown') + status = magnet_status.get('status', 'Unknown') + status_code = magnet_status.get('statusCode', 0) + size = magnet_status.get('size', 0) + seeders = magnet_status.get('seeders', 0) + + # Format size nicely + size_label = f"{size / (1024**3):.2f}GB" if size > 0 else "Unknown" + + # Create one result per magnet with aggregated info + results.append({ + 'name': filename, + 'title': filename, + 'url': '', # No direct file link for the magnet itself + 'size': size, + 'size_bytes': size, + 'magnet_id': magnet_id, + 'origin': 'debrid', + 'annotations': [ + status, + f"{seeders} seeders", + size_label, + ], + 'target': '', # Magnet ID is stored, user can then download it + }) + + debug(f"Found {len(results)} result(s) on AllDebrid") + return results[:limit] + + except Exception as exc: + log(f"❌ Debrid search failed: {exc}", file=sys.stderr) + raise + + def _flatten_file_tree(self, files: list[Any], prefix: str = '') -> list[Dict[str, Any]]: + """Flatten AllDebrid's nested file tree 
structure. + + AllDebrid returns files in a tree structure with folders ('e' key). + This flattens it to a list of individual files. + + Args: + files: AllDebrid file tree structure + prefix: Current path prefix (used recursively) + + Returns: + List of flattened file entries with 'name', 'size', 'link' keys + """ + result = [] + + if not isinstance(files, list): + return result + + for item in files: + if not isinstance(item, dict): + continue + + name = item.get('n', '') + + # Check if it's a folder (has 'e' key with entries) + if 'e' in item: + # Recursively flatten subfolder + subfolder_path = f"{prefix}/{name}" if prefix else name + subitems = item.get('e', []) + result.extend(self._flatten_file_tree(subitems, subfolder_path)) + else: + # It's a file - add it to results + file_path = f"{prefix}/{name}" if prefix else name + result.append({ + 'name': file_path, + 'size': item.get('s', 0), + 'link': item.get('l', ''), + }) + + return result + + +class FileStorage: + """Unified file storage interface supporting multiple backend services. + + Example: + storage = FileStorage(config) + + # Upload to different backends (uses configured locations) + url = storage["0x0"].upload(Path("file.mp3")) + local_path = storage["local"].upload(Path("file.mp3")) # Uses config["Local"]["path"] + hydrus_hash = storage["hydrus"].upload(Path("file.mp3"), tags=["music"]) + + # Search with searchable backends (uses configured locations) + results = storage["hydrus"].search("music") + results = storage["local"].search("song") # Uses config["Local"]["path"] + results = storage["debrid"].search("movie") + """ + + def __init__(self, config: Optional[Dict[str, Any]] = None) -> None: + """Initialize the file storage system with available backends. + + Args: + config: Configuration dict with backend settings (Local.path, HydrusNetwork, Debrid, etc.) + """ + config = config or {} + + # Extract backend-specific settings from config + from config import get_local_storage_path, get_debrid_api_key + + local_path = get_local_storage_path(config) + local_path_str = str(local_path) if local_path else None + + debrid_api_key = get_debrid_api_key(config) + + self._backends: Dict[str, StorageBackend] = {} + + # Always include local backend (even if no default path configured) + # The location can be specified at upload time if not configured globally + self._backends["local"] = LocalStorageBackend(location=local_path_str) + + # Include Hydrus backend (configuration optional) + self._backends["hydrus"] = HydrusStorageBackend(config=config) + + # Include Debrid backend (API key optional - will raise on use if not provided) + if debrid_api_key: + self._backends["debrid"] = DebridStorageBackend(api_key=debrid_api_key) + + def __getitem__(self, backend_name: str) -> StorageBackend: + """Get a storage backend by name. + + Args: + backend_name: Name of the backend ('0x0', 'local', 'hydrus') + + Returns: + StorageBackend instance + + Raises: + KeyError: If backend not found + """ + if backend_name not in self._backends: + raise KeyError( + f"Unknown storage backend: {backend_name}. " + f"Available: {list(self._backends.keys())}" + ) + return self._backends[backend_name] + + def register(self, backend: StorageBackend) -> None: + """Register a custom storage backend. + + Args: + backend: StorageBackend instance to register + """ + name = backend.get_name() + self._backends[name] = backend + log(f"Registered storage backend: {name}", file=sys.stderr) + + def list_backends(self) -> list[str]: + """Get list of available backend names. 
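+
+        Example:
+            storage.list_backends()  # e.g. ["local", "hydrus", "debrid"], depending on config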
+ + Returns: + List of backend names + """ + return list(self._backends.keys()) + + def is_available(self, backend_name: str) -> bool: + """Check if a backend is available. + + Args: + backend_name: Name of the backend + + Returns: + True if backend is registered + """ + return backend_name in self._backends + + def list_searchable_backends(self) -> list[str]: + """Get list of backends that support searching. + + Returns: + List of searchable backend names + """ + return [ + name for name, backend in self._backends.items() + if backend.supports_search() + ] + + def supports_search(self, backend_name: str) -> bool: + """Check if a backend supports searching. + + Args: + backend_name: Name of the backend + + Returns: + True if backend supports search(), False otherwise + """ + if backend_name not in self._backends: + return False + return self._backends[backend_name].supports_search() diff --git a/helper/http_client.py b/helper/http_client.py new file mode 100644 index 0000000..650db19 --- /dev/null +++ b/helper/http_client.py @@ -0,0 +1,579 @@ +""" +Unified HTTP client for downlow using httpx. + +Provides synchronous and asynchronous HTTP operations with: +- Automatic retries on transient failures +- Configurable timeouts and headers +- Built-in progress tracking for downloads +- Request/response logging support +""" + +import httpx +import asyncio +from typing import Optional, Dict, Any, Callable, BinaryIO +from pathlib import Path +import logging + +logger = logging.getLogger(__name__) + +# Default configuration +DEFAULT_TIMEOUT = 30.0 +DEFAULT_RETRIES = 3 +DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + + +class HTTPClient: + """Unified HTTP client with sync support.""" + + def __init__( + self, + timeout: float = DEFAULT_TIMEOUT, + retries: int = DEFAULT_RETRIES, + user_agent: str = DEFAULT_USER_AGENT, + verify_ssl: bool = True, + headers: Optional[Dict[str, str]] = None, + ): + """ + Initialize HTTP client. + + Args: + timeout: Request timeout in seconds + retries: Number of retries on transient failures + user_agent: User-Agent header value + verify_ssl: Whether to verify SSL certificates + headers: Additional headers to include in all requests + """ + self.timeout = timeout + self.retries = retries + self.user_agent = user_agent + self.verify_ssl = verify_ssl + self.base_headers = headers or {} + self._client: Optional[httpx.Client] = None + + def __enter__(self): + """Context manager entry.""" + self._client = httpx.Client( + timeout=self.timeout, + verify=self.verify_ssl, + headers=self._get_headers(), + ) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit.""" + if self._client: + self._client.close() + self._client = None + + def _get_headers(self) -> Dict[str, str]: + """Get request headers with user-agent.""" + headers = {"User-Agent": self.user_agent} + headers.update(self.base_headers) + return headers + + def get( + self, + url: str, + params: Optional[Dict[str, Any]] = None, + headers: Optional[Dict[str, str]] = None, + allow_redirects: bool = True, + ) -> httpx.Response: + """ + Make a GET request. 
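+
+        Example (minimal sketch; the client must be used as a context manager):
+            with HTTPClient(timeout=10.0) as client:
+                resp = client.get("https://example.com/api", params={"q": "test"})
+                data = resp.json()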
+ + Args: + url: Request URL + params: Query parameters + headers: Additional headers + allow_redirects: Follow redirects + + Returns: + httpx.Response object + """ + return self._request( + "GET", + url, + params=params, + headers=headers, + follow_redirects=allow_redirects, + ) + + def post( + self, + url: str, + data: Optional[Any] = None, + json: Optional[Dict] = None, + files: Optional[Dict] = None, + headers: Optional[Dict[str, str]] = None, + ) -> httpx.Response: + """ + Make a POST request. + + Args: + url: Request URL + data: Form data + json: JSON data + files: Files to upload + headers: Additional headers + + Returns: + httpx.Response object + """ + return self._request( + "POST", + url, + data=data, + json=json, + files=files, + headers=headers, + ) + + def put( + self, + url: str, + data: Optional[Any] = None, + json: Optional[Dict] = None, + content: Optional[Any] = None, + files: Optional[Dict] = None, + headers: Optional[Dict[str, str]] = None, + ) -> httpx.Response: + """ + Make a PUT request. + + Args: + url: Request URL + data: Form data + json: JSON data + content: Raw content + files: Files to upload + headers: Additional headers + + Returns: + httpx.Response object + """ + return self._request( + "PUT", + url, + data=data, + json=json, + content=content, + files=files, + headers=headers, + ) + + def delete( + self, + url: str, + headers: Optional[Dict[str, str]] = None, + ) -> httpx.Response: + """ + Make a DELETE request. + + Args: + url: Request URL + headers: Additional headers + + Returns: + httpx.Response object + """ + return self._request( + "DELETE", + url, + headers=headers, + ) + + def request( + self, + method: str, + url: str, + **kwargs + ) -> httpx.Response: + """ + Make a generic HTTP request. + + Args: + method: HTTP method + url: Request URL + **kwargs: Additional arguments + + Returns: + httpx.Response object + """ + return self._request(method, url, **kwargs) + + def download( + self, + url: str, + file_path: str, + chunk_size: int = 8192, + progress_callback: Optional[Callable[[int, int], None]] = None, + headers: Optional[Dict[str, str]] = None, + ) -> Path: + """ + Download a file from URL with optional progress tracking. + + Args: + url: File URL + file_path: Local file path to save to + chunk_size: Download chunk size + progress_callback: Callback(bytes_downloaded, total_bytes) + headers: Additional headers + + Returns: + Path object of downloaded file + """ + path = Path(file_path) + path.parent.mkdir(parents=True, exist_ok=True) + + with self._request_stream("GET", url, headers=headers, follow_redirects=True) as response: + response.raise_for_status() + total_bytes = int(response.headers.get("content-length", 0)) + bytes_downloaded = 0 + + with open(path, "wb") as f: + for chunk in response.iter_bytes(chunk_size): + if chunk: + f.write(chunk) + bytes_downloaded += len(chunk) + if progress_callback: + progress_callback(bytes_downloaded, total_bytes) + + return path + + def _request( + self, + method: str, + url: str, + **kwargs + ) -> httpx.Response: + """ + Make an HTTP request with automatic retries. 
+ + Args: + method: HTTP method + url: Request URL + **kwargs: Additional arguments for httpx.Client.request() + + Returns: + httpx.Response object + """ + if not self._client: + raise RuntimeError("HTTPClient must be used with context manager (with statement)") + + # Merge headers + if "headers" in kwargs and kwargs["headers"]: + headers = self._get_headers() + headers.update(kwargs["headers"]) + kwargs["headers"] = headers + else: + kwargs["headers"] = self._get_headers() + + last_exception = None + + for attempt in range(self.retries): + try: + response = self._client.request(method, url, **kwargs) + response.raise_for_status() + return response + except httpx.TimeoutException as e: + last_exception = e + logger.warning(f"Timeout on attempt {attempt + 1}/{self.retries}: {url}") + if attempt < self.retries - 1: + continue + except httpx.HTTPStatusError as e: + # Don't retry on 4xx errors + if 400 <= e.response.status_code < 500: + try: + response_text = e.response.text[:500] + except: + response_text = "" + logger.error(f"HTTP {e.response.status_code} from {url}: {response_text}") + raise + last_exception = e + try: + response_text = e.response.text[:200] + except: + response_text = "" + logger.warning(f"HTTP {e.response.status_code} on attempt {attempt + 1}/{self.retries}: {url} - {response_text}") + if attempt < self.retries - 1: + continue + except (httpx.RequestError, httpx.ConnectError) as e: + last_exception = e + logger.warning(f"Connection error on attempt {attempt + 1}/{self.retries}: {url} - {e}") + if attempt < self.retries - 1: + continue + + if last_exception: + logger.error(f"Request failed after {self.retries} attempts: {url} - {last_exception}") + raise last_exception + + raise RuntimeError("Request failed after retries") + + def _request_stream(self, method: str, url: str, **kwargs): + """Make a streaming request.""" + if not self._client: + raise RuntimeError("HTTPClient must be used with context manager (with statement)") + + # Merge headers + if "headers" in kwargs and kwargs["headers"]: + headers = self._get_headers() + headers.update(kwargs["headers"]) + kwargs["headers"] = headers + else: + kwargs["headers"] = self._get_headers() + + return self._client.stream(method, url, **kwargs) + + +class AsyncHTTPClient: + """Unified async HTTP client with asyncio support.""" + + def __init__( + self, + timeout: float = DEFAULT_TIMEOUT, + retries: int = DEFAULT_RETRIES, + user_agent: str = DEFAULT_USER_AGENT, + verify_ssl: bool = True, + headers: Optional[Dict[str, str]] = None, + ): + """ + Initialize async HTTP client. 
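+
+        Example (sketch of typical usage):
+            async with AsyncHTTPClient(timeout=10.0) as client:
+                resp = await client.get("https://example.com/api")
+                payload = resp.json()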
+ + Args: + timeout: Request timeout in seconds + retries: Number of retries on transient failures + user_agent: User-Agent header value + verify_ssl: Whether to verify SSL certificates + headers: Additional headers to include in all requests + """ + self.timeout = timeout + self.retries = retries + self.user_agent = user_agent + self.verify_ssl = verify_ssl + self.base_headers = headers or {} + self._client: Optional[httpx.AsyncClient] = None + + async def __aenter__(self): + """Async context manager entry.""" + self._client = httpx.AsyncClient( + timeout=self.timeout, + verify=self.verify_ssl, + headers=self._get_headers(), + ) + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + """Async context manager exit.""" + if self._client: + await self._client.aclose() + self._client = None + + def _get_headers(self) -> Dict[str, str]: + """Get request headers with user-agent.""" + headers = {"User-Agent": self.user_agent} + headers.update(self.base_headers) + return headers + + async def get( + self, + url: str, + params: Optional[Dict[str, Any]] = None, + headers: Optional[Dict[str, str]] = None, + allow_redirects: bool = True, + ) -> httpx.Response: + """ + Make an async GET request. + + Args: + url: Request URL + params: Query parameters + headers: Additional headers + allow_redirects: Follow redirects + + Returns: + httpx.Response object + """ + return await self._request( + "GET", + url, + params=params, + headers=headers, + follow_redirects=allow_redirects, + ) + + async def post( + self, + url: str, + data: Optional[Any] = None, + json: Optional[Dict] = None, + headers: Optional[Dict[str, str]] = None, + ) -> httpx.Response: + """ + Make an async POST request. + + Args: + url: Request URL + data: Form data + json: JSON data + headers: Additional headers + + Returns: + httpx.Response object + """ + return await self._request( + "POST", + url, + data=data, + json=json, + headers=headers, + ) + + async def download( + self, + url: str, + file_path: str, + chunk_size: int = 8192, + progress_callback: Optional[Callable[[int, int], None]] = None, + headers: Optional[Dict[str, str]] = None, + ) -> Path: + """ + Download a file from URL asynchronously with optional progress tracking. + + Args: + url: File URL + file_path: Local file path to save to + chunk_size: Download chunk size + progress_callback: Callback(bytes_downloaded, total_bytes) + headers: Additional headers + + Returns: + Path object of downloaded file + """ + path = Path(file_path) + path.parent.mkdir(parents=True, exist_ok=True) + + async with self._request_stream("GET", url, headers=headers) as response: + response.raise_for_status() + total_bytes = int(response.headers.get("content-length", 0)) + bytes_downloaded = 0 + + with open(path, "wb") as f: + async for chunk in response.aiter_bytes(chunk_size): + if chunk: + f.write(chunk) + bytes_downloaded += len(chunk) + if progress_callback: + progress_callback(bytes_downloaded, total_bytes) + + return path + + async def _request( + self, + method: str, + url: str, + **kwargs + ) -> httpx.Response: + """ + Make an async HTTP request with automatic retries. 
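+
+        Retries apply to timeouts, connection errors, and 5xx responses, with a
+        short pause (0.5s) between attempts; 4xx responses are raised immediately.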
+ + Args: + method: HTTP method + url: Request URL + **kwargs: Additional arguments for httpx.AsyncClient.request() + + Returns: + httpx.Response object + """ + if not self._client: + raise RuntimeError("AsyncHTTPClient must be used with async context manager") + + # Merge headers + if "headers" in kwargs and kwargs["headers"]: + headers = self._get_headers() + headers.update(kwargs["headers"]) + kwargs["headers"] = headers + else: + kwargs["headers"] = self._get_headers() + + last_exception = None + + for attempt in range(self.retries): + try: + response = await self._client.request(method, url, **kwargs) + response.raise_for_status() + return response + except httpx.TimeoutException as e: + last_exception = e + logger.warning(f"Timeout on attempt {attempt + 1}/{self.retries}: {url}") + if attempt < self.retries - 1: + await asyncio.sleep(0.5) # Brief delay before retry + continue + except httpx.HTTPStatusError as e: + # Don't retry on 4xx errors + if 400 <= e.response.status_code < 500: + try: + response_text = e.response.text[:500] + except: + response_text = "" + logger.error(f"HTTP {e.response.status_code} from {url}: {response_text}") + raise + last_exception = e + try: + response_text = e.response.text[:200] + except: + response_text = "" + logger.warning(f"HTTP {e.response.status_code} on attempt {attempt + 1}/{self.retries}: {url} - {response_text}") + if attempt < self.retries - 1: + await asyncio.sleep(0.5) + continue + except (httpx.RequestError, httpx.ConnectError) as e: + last_exception = e + logger.warning(f"Connection error on attempt {attempt + 1}/{self.retries}: {url} - {e}") + if attempt < self.retries - 1: + await asyncio.sleep(0.5) + continue + + if last_exception: + logger.error(f"Request failed after {self.retries} attempts: {url} - {last_exception}") + raise last_exception + + raise RuntimeError("Request failed after retries") + + def _request_stream(self, method: str, url: str, **kwargs): + """Make a streaming request.""" + if not self._client: + raise RuntimeError("AsyncHTTPClient must be used with async context manager") + + # Merge headers + if "headers" in kwargs and kwargs["headers"]: + headers = self._get_headers() + headers.update(kwargs["headers"]) + kwargs["headers"] = headers + else: + kwargs["headers"] = self._get_headers() + + return self._client.stream(method, url, **kwargs) + + +# Convenience function for quick sync requests +def get(url: str, **kwargs) -> httpx.Response: + """Quick GET request without context manager.""" + with HTTPClient() as client: + return client.get(url, **kwargs) + + +def post(url: str, **kwargs) -> httpx.Response: + """Quick POST request without context manager.""" + with HTTPClient() as client: + return client.post(url, **kwargs) + + +def download( + url: str, + file_path: str, + progress_callback: Optional[Callable[[int, int], None]] = None, + **kwargs +) -> Path: + """Quick file download without context manager.""" + with HTTPClient() as client: + return client.download(url, file_path, progress_callback=progress_callback, **kwargs) diff --git a/helper/hydrus.py b/helper/hydrus.py new file mode 100644 index 0000000..ff59d9e --- /dev/null +++ b/helper/hydrus.py @@ -0,0 +1,1553 @@ +"""Hydrus API helpers and export utilities.""" +from __future__ import annotations + +import base64 +import json +import os +import re +import shutil +import subprocess +import sys +import time + +from helper.logger import log, debug +import tempfile +import logging +from dataclasses import dataclass, field +from pathlib import Path +from typing 
import Any, Iterable, Optional, Sequence, Type, TypeVar, Union, cast
+import http.client  # used by the legacy hydrus_request() CLI helper below
+from urllib.parse import urlsplit, urlencode, quote
+import httpx
+
+logger = logging.getLogger(__name__)
+
+try:  # Optional metadata helper for audio files
+    import mutagen  # type: ignore
+except ImportError:  # pragma: no cover - best effort
+    mutagen = None  # type: ignore
+
+from .utils import (
+    decode_cbor,
+    jsonify,
+    ensure_directory,
+    sanitize_metadata_value,
+    unique_path,
+    unique_preserve_order,
+)
+from .http_client import HTTPClient
+
+
+class HydrusRequestError(RuntimeError):
+    """Raised when the Hydrus Client API returns an error response."""
+
+    def __init__(self, status: int, message: str, payload: Any | None = None) -> None:
+        super().__init__(f"Hydrus request failed ({status}): {message}")
+        self.status = status
+        self.payload = payload
+
+
+class HydrusConnectionError(HydrusRequestError):
+    """Raised when Hydrus service is unavailable (connection refused, timeout, etc.).
+
+    This is an expected error when Hydrus is not running and should not include
+    a full traceback in logs.
+    """
+
+    def __init__(self, message: str) -> None:
+        super().__init__(0, message, None)  # status 0 indicates connection error
+        self.is_connection_error = True
+
+
+@dataclass(slots=True)
+class HydrusRequestSpec:
+    method: str
+    endpoint: str
+    query: dict[str, Any] | None = None
+    data: Any | None = None
+    file_path: Path | None = None
+    content_type: str | None = None
+    accept: str | None = "application/cbor"
+
+
+@dataclass(slots=True)
+class HydrusClient:
+    """Thin wrapper around the Hydrus Client API."""
+
+    base_url: str
+    access_key: str = ""
+    timeout: float = 60.0
+
+    scheme: str = field(init=False)
+    hostname: str = field(init=False)
+    port: int = field(init=False)
+    base_path: str = field(init=False)
+    _session_key: str = field(init=False, default="", repr=False)  # Cached session key
+
+    def __post_init__(self) -> None:
+        if not self.base_url:
+            raise ValueError("Hydrus base URL is required")
+        self.base_url = self.base_url.rstrip("/")
+        parsed = urlsplit(self.base_url)
+        if parsed.scheme not in {"http", "https"}:
+            raise ValueError("Hydrus base URL must use http or https")
+        self.scheme = parsed.scheme
+        self.hostname = parsed.hostname or "localhost"
+        self.port = parsed.port or (443 if self.scheme == "https" else 80)
+        self.base_path = parsed.path.rstrip("/")
+        self.access_key = self.access_key or ""
+
+    # ------------------------------------------------------------------
+    # low-level helpers
+    # ------------------------------------------------------------------
+
+    def _build_path(self, endpoint: str, query: dict[str, Any] | None = None) -> str:
+        path = endpoint if endpoint.startswith("/") else f"/{endpoint}"
+        if self.base_path:
+            path = f"{self.base_path}{path}"
+        if query:
+            encoded = urlencode(query, doseq=True)
+            if encoded:
+                path = f"{path}?{encoded}"
+        return path
+
+    def _perform_request(self, spec: HydrusRequestSpec) -> Any:
+        headers: dict[str, str] = {}
+
+        # On first request, try to acquire session key for security
+        if not self._session_key and self.access_key and spec.endpoint != "/session_key":
+            try:
+                logger.debug("[Hydrus] Acquiring session key on first request...")
+                self._acquire_session_key()
+            except Exception as e:
+                # If session key acquisition fails, fall back to access key
+                logger.debug(f"[Hydrus] Session key acquisition failed: {e}. 
Using access key instead.") + + # Use session key if available, otherwise use access key + if self._session_key: + headers["Hydrus-Client-API-Session-Key"] = self._session_key + elif self.access_key: + headers["Hydrus-Client-API-Access-Key"] = self.access_key + if spec.accept: + headers["Accept"] = spec.accept + + path = self._build_path(spec.endpoint, spec.query) + url = f"{self.scheme}://{self.hostname}:{self.port}{path}" + + # Log request details + logger.debug(f"[Hydrus] {spec.method} {spec.endpoint} (auth: {'session_key' if self._session_key else 'access_key' if self.access_key else 'none'})") + + status = 0 + reason = "" + body = b"" + content_type = "" + + try: + with HTTPClient(timeout=self.timeout, headers=headers, verify_ssl=False) as client: + response = None + + if spec.file_path is not None: + file_path = Path(spec.file_path) + if not file_path.is_file(): + error_msg = f"Upload file not found: {file_path}" + logger.error(f"[Hydrus] {error_msg}") + raise FileNotFoundError(error_msg) + + file_size = file_path.stat().st_size + headers["Content-Type"] = spec.content_type or "application/octet-stream" + headers["Content-Length"] = str(file_size) + + logger.debug(f"[Hydrus] Uploading file {file_path.name} ({file_size} bytes)") + + def file_gen(): + with file_path.open("rb") as handle: + while chunk := handle.read(65536): + yield chunk + + response = client.request( + spec.method, + url, + content=file_gen(), + headers=headers + ) + else: + content = None + json_data = None + if spec.data is not None: + if isinstance(spec.data, (bytes, bytearray)): + content = spec.data + else: + json_data = spec.data + logger.debug(f"[Hydrus] Request body size: {len(content) if content else 'json'}") + + response = client.request( + spec.method, + url, + content=content, + json=json_data, + headers=headers + ) + + status = response.status_code + reason = response.reason_phrase + body = response.content + content_type = response.headers.get("Content-Type", "") or "" + + logger.debug(f"[Hydrus] Response {status} {reason} ({len(body)} bytes)") + + except (httpx.ConnectError, httpx.TimeoutException, httpx.NetworkError) as exc: + msg = f"Hydrus unavailable: {exc}" + logger.warning(f"[Hydrus] {msg}") + raise HydrusConnectionError(msg) from exc + except httpx.HTTPStatusError as exc: + response = exc.response + status = response.status_code + reason = response.reason_phrase + body = response.content + content_type = response.headers.get("Content-Type", "") or "" + except Exception as exc: + logger.error(f"[Hydrus] Connection error: {exc}", exc_info=True) + raise + + payload: Any + payload = {} + if body: + content_main = content_type.split(";", 1)[0].strip().lower() + if "json" in content_main: + try: + payload = json.loads(body.decode("utf-8")) + except (json.JSONDecodeError, UnicodeDecodeError): + payload = body.decode("utf-8", "replace") + elif "cbor" in content_main: + try: + payload = decode_cbor(body) + except Exception: + payload = body + else: + payload = body + + if status >= 400: + message = "" + if isinstance(payload, dict): + message = str(payload.get("message") or payload.get("error") or payload) + elif isinstance(payload, str): + message = payload + else: + message = reason or "HTTP error" + + logger.error(f"[Hydrus] HTTP {status}: {message}") + + # Handle expired session key (419) by clearing cache and retrying once + if status == 419 and self._session_key and "session" in message.lower(): + logger.warning(f"[Hydrus] Session key expired, acquiring new one and retrying...") + self._session_key 
= "" # Clear expired session key + try: + self._acquire_session_key() + # Retry the request with new session key + return self._perform_request(spec) + except Exception as retry_error: + logger.error(f"[Hydrus] Retry failed: {retry_error}", exc_info=True) + # If retry fails, raise the original error + raise HydrusRequestError(status, message, payload) from retry_error + + raise HydrusRequestError(status, message, payload) + + return payload + + def _acquire_session_key(self) -> str: + """Acquire a session key from the Hydrus API using the access key. + + Session keys are temporary authentication tokens that expire after 24 hours + of inactivity, client restart, or if the access key is deleted. They are + more secure than passing access keys in every request. + + Returns the session key string. + Raises HydrusRequestError if the request fails. + """ + if not self.access_key: + raise HydrusRequestError(401, "Cannot acquire session key: no access key configured") + + # Temporarily use access key to get session key + original_session_key = self._session_key + try: + self._session_key = "" # Clear session key to use access key for this request + + result = self._get("/session_key") + session_key = result.get("session_key") + + if not session_key: + raise HydrusRequestError(500, "Session key response missing 'session_key' field", result) + + self._session_key = session_key + return session_key + except HydrusRequestError: + self._session_key = original_session_key + raise + except Exception as e: + self._session_key = original_session_key + raise HydrusRequestError(500, f"Failed to acquire session key: {e}") + + def ensure_session_key(self) -> str: + """Ensure a valid session key exists, acquiring one if needed. + + Returns the session key. If one is already cached, returns it. + Otherwise acquires a new session key from the API. + """ + if self._session_key: + return self._session_key + return self._acquire_session_key() + + def _get(self, endpoint: str, *, query: dict[str, Any] | None = None) -> dict[str, Any]: + spec = HydrusRequestSpec("GET", endpoint, query=query) + return cast(dict[str, Any], self._perform_request(spec)) + + def _post( + self, + endpoint: str, + *, + data: dict[str, Any] | None = None, + file_path: Path | None = None, + content_type: str | None = None, + ) -> dict[str, Any]: + spec = HydrusRequestSpec("POST", endpoint, data=data, file_path=file_path, content_type=content_type) + return cast(dict[str, Any], self._perform_request(spec)) + + def _ensure_hashes(self, hashes: Union[str, Iterable[str]]) -> list[str]: + if isinstance(hashes, str): + return [hashes] + return list(hashes) + + def _append_access_key(self, url: str) -> str: + if not self.access_key: + return url + separator = "&" if "?" in url else "?" 
+ # Use the correct parameter name for Hydrus API compatibility + return f"{url}{separator}access_key={quote(self.access_key)}" + + # ------------------------------------------------------------------ + # public API wrappers + # ------------------------------------------------------------------ + + def add_file(self, file_path: Path) -> dict[str, Any]: + return self._post("/add_files/add_file", file_path=file_path) + + def add_tags(self, file_hashes: Union[str, Iterable[str]], tags: Iterable[str], service_name: str) -> dict[str, Any]: + hashes = self._ensure_hashes(file_hashes) + body = {"hashes": hashes, "service_names_to_tags": {service_name: list(tags)}} + return self._post("/add_tags/add_tags", data=body) + + def delete_tags( + self, + file_hashes: Union[str, Iterable[str]], + tags: Iterable[str], + service_name: str, + *, + action: int = 1, + ) -> dict[str, Any]: + hashes = self._ensure_hashes(file_hashes) + body = { + "hashes": hashes, + "service_names_to_actions_to_tags": {service_name: {action: list(tags)}}, + } + return self._post("/add_tags/add_tags", data=body) + + def add_tags_by_key(self, file_hashes: Union[str, Iterable[str]], tags: Iterable[str], service_key: str) -> dict[str, Any]: + hashes = self._ensure_hashes(file_hashes) + body = {"hashes": hashes, "service_keys_to_tags": {service_key: list(tags)}} + return self._post("/add_tags/add_tags", data=body) + + def delete_tags_by_key( + self, + file_hashes: Union[str, Iterable[str]], + tags: Iterable[str], + service_key: str, + *, + action: int = 1, + ) -> dict[str, Any]: + hashes = self._ensure_hashes(file_hashes) + body = { + "hashes": hashes, + "service_keys_to_actions_to_tags": {service_key: {action: list(tags)}}, + } + return self._post("/add_tags/add_tags", data=body) + + def associate_url(self, file_hashes: Union[str, Iterable[str]], url: str) -> dict[str, Any]: + hashes = self._ensure_hashes(file_hashes) + if len(hashes) == 1: + body = {"hash": hashes[0], "url_to_add": url} + return self._post("/add_urls/associate_url", data=body) + + results: dict[str, Any] = {} + for file_hash in hashes: + body = {"hash": file_hash, "url_to_add": url} + results[file_hash] = self._post("/add_urls/associate_url", data=body) + return {"batched": results} + + def delete_url(self, file_hashes: Union[str, Iterable[str]], url: str) -> dict[str, Any]: + hashes = self._ensure_hashes(file_hashes) + if len(hashes) == 1: + body = {"hash": hashes[0], "url_to_delete": url} + return self._post("/add_urls/associate_url", data=body) + + results: dict[str, Any] = {} + for file_hash in hashes: + body = {"hash": file_hash, "url_to_delete": url} + results[file_hash] = self._post("/add_urls/associate_url", data=body) + return {"batched": results} + + def set_notes(self, file_hashes: Union[str, Iterable[str]], notes: dict[str, str], service_name: str) -> dict[str, Any]: + if not notes: + raise ValueError("notes mapping must not be empty") + hashes = self._ensure_hashes(file_hashes) + body = {"hashes": hashes, "service_names_to_notes": {service_name: notes}} + return self._post("/add_notes/set_notes", data=body) + + def delete_notes( + self, + file_hashes: Union[str, Iterable[str]], + note_names: Sequence[str], + service_name: str, + ) -> dict[str, Any]: + names = [name for name in note_names if name] + if not names: + raise ValueError("note_names must not be empty") + hashes = self._ensure_hashes(file_hashes) + body = {"hashes": hashes, "service_names_to_deleted_note_names": {service_name: names}} + return self._post("/add_notes/set_notes", data=body) + + 
def get_file_relationships(self, file_hash: str) -> dict[str, Any]: + query = {"hash": file_hash} + return self._get("/manage_file_relationships/get_file_relationships", query=query) + + def set_relationship(self, hash_a: str, hash_b: str, relationship: Union[str, int], do_default_content_merge: bool = False) -> dict[str, Any]: + """Set a relationship between two files in Hydrus. + + Args: + hash_a: First file hash + hash_b: Second file hash + relationship: Relationship type - can be string ("king", "alt", "related", etc) + or integer (0-4): + - 0 = duplicates + - 1 = alternate + - 2 = not_related + - 3 = related + - 4 = king + do_default_content_merge: Whether to perform default content merge + + Returns: + Response from Hydrus API + """ + # Convert string relationship types to integers + if isinstance(relationship, str): + rel_map = { + "duplicates": 0, + "duplicate": 0, + "alt": 1, + "alternate": 1, + "not_related": 2, + "not related": 2, + "related": 3, + "king": 4, + } + relationship = rel_map.get(relationship.lower(), 3) # Default to "related" (3) + + body = { + "relationships": [ + { + "hash_a": hash_a, + "hash_b": hash_b, + "relationship": relationship, + "do_default_content_merge": do_default_content_merge, + } + ] + } + return self._post("/manage_file_relationships/set_file_relationships", data=body) + + def get_services(self) -> dict[str, Any]: + return self._get("/get_services") + + def search_files( + self, + tags: Sequence[Any], + *, + file_service_name: str | None = None, + return_hashes: bool = False, + return_file_ids: bool = True, + include_current_tags: bool | None = None, + include_pending_tags: bool | None = None, + file_sort_type: int | None = None, + file_sort_asc: bool | None = None, + file_sort_key: str | None = None, + ) -> dict[str, Any]: + if not tags: + raise ValueError("tags must not be empty") + + query: dict[str, Any] = {} + query_fields = [ + ("tags", tags, lambda v: json.dumps(list(v))), + ("file_service_name", file_service_name, lambda v: v), + ("return_hashes", return_hashes, lambda v: "true" if v else None), + ("return_file_ids", return_file_ids, lambda v: "true" if v else None), + ( + "include_current_tags", + include_current_tags, + lambda v: "true" if v else "false" if v is not None else None, + ), + ( + "include_pending_tags", + include_pending_tags, + lambda v: "true" if v else "false" if v is not None else None, + ), + ("file_sort_type", file_sort_type, lambda v: str(v) if v is not None else None), + ("file_sort_asc", file_sort_asc, lambda v: "true" if v else "false" if v is not None else None), + ("file_sort_key", file_sort_key, lambda v: v), + ] + + for key, value, formatter in query_fields: + if value is None or value == []: + continue + formatted = formatter(value) + if formatted is not None: + query[key] = formatted + + return self._get("/get_files/search_files", query=query) + + def fetch_file_metadata( + self, + *, + file_ids: Sequence[int] | None = None, + hashes: Sequence[str] | None = None, + include_service_keys_to_tags: bool = True, + include_file_urls: bool = False, + include_duration: bool = True, + include_size: bool = True, + include_mime: bool = False, + include_notes: bool = False, + ) -> dict[str, Any]: + if not file_ids and not hashes: + raise ValueError("Either file_ids or hashes must be provided") + + query: dict[str, Any] = {} + query_fields = [ + ("file_ids", file_ids, lambda v: json.dumps(list(v))), + ("hashes", hashes, lambda v: json.dumps(list(v))), + ( + "include_service_keys_to_tags", + include_service_keys_to_tags, + 
lambda v: "true" if v else None, + ), + ("include_file_urls", include_file_urls, lambda v: "true" if v else None), + ("include_duration", include_duration, lambda v: "true" if v else None), + ("include_size", include_size, lambda v: "true" if v else None), + ("include_mime", include_mime, lambda v: "true" if v else None), + ("include_notes", include_notes, lambda v: "true" if v else None), + ] + + for key, value, formatter in query_fields: + if not value: + continue + formatted = formatter(value) + if formatted is not None: + query[key] = formatted + + return self._get("/get_files/file_metadata", query=query) + + def get_file_path(self, file_hash: str) -> dict[str, Any]: + """Get the local file system path for a given file hash.""" + query = {"hash": file_hash} + return self._get("/get_files/file_path", query=query) + + def file_url(self, file_hash: str) -> str: + hash_param = quote(file_hash) + # Don't append access_key parameter for file downloads - use header instead + url = f"{self.base_url}/get_files/file?hash={hash_param}" + return url + + def thumbnail_url(self, file_hash: str) -> str: + hash_param = quote(file_hash) + # Don't append access_key parameter for file downloads - use header instead + url = f"{self.base_url}/get_files/thumbnail?hash={hash_param}" + return url + + +HydrusCliOptionsT = TypeVar("HydrusCliOptionsT", bound="HydrusCliOptions") + + +@dataclass(slots=True) +class HydrusCliOptions: + url: str + method: str + access_key: str + accept: str + timeout: float + content_type: str | None + body_bytes: bytes | None = None + body_path: Path | None = None + debug: bool = False + + @classmethod + def from_namespace(cls: Type[HydrusCliOptionsT], namespace: Any) -> HydrusCliOptionsT: + accept_header = namespace.accept or 'application/cbor' + body_bytes: bytes | None = None + body_path: Path | None = None + if namespace.body_file: + body_path = Path(namespace.body_file) + elif namespace.body is not None: + body_bytes = namespace.body.encode('utf-8') + return cls( + url=namespace.url, + method=namespace.method.upper(), + access_key=namespace.access_key or '', + accept=accept_header, + timeout=namespace.timeout, + content_type=namespace.content_type, + body_bytes=body_bytes, + body_path=body_path, + debug=bool(os.environ.get('DOWNLOW_DEBUG')), + ) +def hydrus_request(args, parser) -> int: + if args.body and args.body_file: + parser.error('Only one of --body or --body-file may be supplied') + + options = HydrusCliOptions.from_namespace(args) + + parsed = urlsplit(options.url) + if parsed.scheme not in ('http', 'https'): + parser.error('Only http and https URLs are supported') + if not parsed.hostname: + parser.error('Invalid Hydrus URL') + + headers: dict[str, str] = {} + if options.access_key: + headers['Hydrus-Client-API-Access-Key'] = options.access_key + if options.accept: + headers['Accept'] = options.accept + + request_body_bytes: bytes | None = None + body_path: Path | None = None + if options.body_path is not None: + body_path = options.body_path + if not body_path.is_file(): + parser.error(f'File not found: {body_path}') + headers.setdefault('Content-Type', options.content_type or 'application/octet-stream') + headers['Content-Length'] = str(body_path.stat().st_size) + elif options.body_bytes is not None: + request_body_bytes = options.body_bytes + headers['Content-Type'] = options.content_type or 'application/json' + assert request_body_bytes is not None + headers['Content-Length'] = str(len(request_body_bytes)) + elif options.content_type: + headers['Content-Type'] = 
options.content_type + + if parsed.username or parsed.password: + userinfo = f"{parsed.username or ''}:{parsed.password or ''}".encode('utf-8') + headers['Authorization'] = 'Basic ' + base64.b64encode(userinfo).decode('ascii') + + path = parsed.path or '/' + if parsed.query: + path += '?' + parsed.query + + port = parsed.port + if port is None: + port = 443 if parsed.scheme == 'https' else 80 + + connection_cls = http.client.HTTPSConnection if parsed.scheme == 'https' else http.client.HTTPConnection + host = parsed.hostname or 'localhost' + connection = connection_cls(host, port, timeout=options.timeout) + + if options.debug: + log(f"Hydrus connecting to {parsed.scheme}://{host}:{port}{path}", file=sys.stderr) + response_bytes: bytes = b'' + content_type = '' + status = 0 + try: + if body_path is not None: + with body_path.open('rb') as handle: + if options.debug: + size_hint = headers.get('Content-Length', 'unknown') + log(f"Hydrus sending file body ({size_hint} bytes)", file=sys.stderr) + connection.putrequest(options.method, path) + host_header = host + if (parsed.scheme == 'http' and port not in (80, None)) or (parsed.scheme == 'https' and port not in (443, None)): + host_header = f"{host}:{port}" + connection.putheader('Host', host_header) + for key, value in headers.items(): + if value: + connection.putheader(key, value) + connection.endheaders() + while True: + chunk = handle.read(65536) + if not chunk: + break + connection.send(chunk) + if options.debug: + log('[downlow.py] Hydrus upload complete; awaiting response', file=sys.stderr) + else: + if options.debug: + size_hint = 'none' if request_body_bytes is None else str(len(request_body_bytes)) + log(f"Hydrus sending request body bytes={size_hint}", file=sys.stderr) + sanitized_headers = {k: v for k, v in headers.items() if v} + connection.request(options.method, path, body=request_body_bytes, headers=sanitized_headers) + response = connection.getresponse() + status = response.status + response_bytes = response.read() + if options.debug: + log(f"Hydrus response received ({len(response_bytes)} bytes)", file=sys.stderr) + content_type = response.getheader('Content-Type', '') + except (OSError, http.client.HTTPException) as exc: + log(f"HTTP error: {exc}", file=sys.stderr) + return 1 + finally: + connection.close() + content_type_lower = (content_type or '').split(';', 1)[0].strip().lower() + accept_value = options.accept or '' + expect_cbor = 'cbor' in (content_type_lower or '') or 'cbor' in accept_value.lower() + payload = None + decode_error: Exception | None = None + if response_bytes: + if expect_cbor: + try: + payload = decode_cbor(response_bytes) + except Exception as exc: # pragma: no cover - library errors surfaced + decode_error = exc + if payload is None and not expect_cbor: + try: + payload = json.loads(response_bytes.decode('utf-8')) + except (json.JSONDecodeError, UnicodeDecodeError): + payload = response_bytes.decode('utf-8', 'replace') + elif payload is None and expect_cbor and decode_error is not None: + log(f"Expected CBOR response but decoding failed: {decode_error}", file=sys.stderr) + return 1 + json_ready = jsonify(payload) if isinstance(payload, (dict, list)) else payload + if options.debug: + log(f"Hydrus {options.method} {options.url} -> {status}", file=sys.stderr) + if isinstance(json_ready, (dict, list)): + log(json.dumps(json_ready, ensure_ascii=False)) + elif json_ready is None: + log('{}') + else: + log(json.dumps({'value': json_ready}, ensure_ascii=False)) + return 0 if 200 <= status < 400 else 1 +def 
prepare_ffmpeg_metadata(payload: Optional[dict[str, Any]]) -> dict[str, str]: + if not isinstance(payload, dict): + return {} + metadata: dict[str, str] = {} + def set_field(key: str, raw: Any, limit: int = 2000) -> None: + sanitized = sanitize_metadata_value(raw) + if not sanitized: + return + if len(sanitized) > limit: + sanitized = sanitized[:limit] + metadata[key] = sanitized + set_field('title', payload.get('title')) + set_field('artist', payload.get('artist'), 512) + set_field('album', payload.get('album'), 512) + set_field('date', payload.get('year'), 20) + comment = payload.get('comment') + tags_value = payload.get('tags') + tag_strings: list[str] = [] + artists_from_tags: list[str] = [] + albums_from_tags: list[str] = [] + genres_from_tags: list[str] = [] + if isinstance(tags_value, list): + for raw_tag in tags_value: + if raw_tag is None: + continue + if not isinstance(raw_tag, str): + raw_tag = str(raw_tag) + tag = raw_tag.strip() + if not tag: + continue + tag_strings.append(tag) + namespace, sep, value = tag.partition(':') + if sep and value: + ns = namespace.strip().lower() + value = value.strip() + if ns in {'artist', 'creator', 'author', 'performer'}: + artists_from_tags.append(value) + elif ns in {'album', 'series', 'collection', 'group'}: + albums_from_tags.append(value) + elif ns in {'genre', 'rating'}: + genres_from_tags.append(value) + elif ns in {'comment', 'description'} and not comment: + comment = value + elif ns in {'year', 'date'} and not payload.get('year'): + set_field('date', value, 20) + else: + genres_from_tags.append(tag) + if 'artist' not in metadata and artists_from_tags: + set_field('artist', ', '.join(unique_preserve_order(artists_from_tags)[:3]), 512) + if 'album' not in metadata and albums_from_tags: + set_field('album', unique_preserve_order(albums_from_tags)[0], 512) + if genres_from_tags: + set_field('genre', ', '.join(unique_preserve_order(genres_from_tags)[:5]), 256) + if tag_strings: + joined_tags = ', '.join(tag_strings[:50]) + set_field('keywords', joined_tags, 2000) + if not comment: + comment = joined_tags + if comment: + set_field('comment', comment, 2000) + set_field('description', comment, 2000) + return metadata +def apply_mutagen_metadata(path: Path, metadata: dict[str, str], fmt: str) -> None: + if fmt != 'audio': + return + if not metadata: + return + if mutagen is None: + return + try: + audio = mutagen.File(path, easy=True) # type: ignore[attr-defined] + except Exception as exc: # pragma: no cover - best effort only + log(f"mutagen load failed: {exc}", file=sys.stderr) + return + if audio is None: + return + field_map = { + 'title': 'title', + 'artist': 'artist', + 'album': 'album', + 'genre': 'genre', + 'comment': 'comment', + 'description': 'comment', + 'date': 'date', + } + changed = False + for source_key, target_key in field_map.items(): + value = metadata.get(source_key) + if not value: + continue + try: + audio[target_key] = [value] + changed = True + except Exception: # pragma: no cover - best effort only + continue + if not changed: + return + try: + audio.save() + except Exception as exc: # pragma: no cover - best effort only + log(f"mutagen save failed: {exc}", file=sys.stderr) +def build_ffmpeg_command(ffmpeg_path: str, input_path: Path, output_path: Path, fmt: str, max_width: int, metadata: Optional[dict[str, str]] = None) -> list[str]: + cmd = [ffmpeg_path, '-y', '-i', str(input_path)] + if fmt in {'mp4', 'webm'} and max_width and max_width > 0: + cmd.extend(['-vf', f"scale='min({max_width},iw)':-2"]) + if metadata: + 
for key, value in metadata.items(): + cmd.extend(['-metadata', f'{key}={value}']) + + # Video formats + if fmt == 'mp4': + cmd.extend([ + '-c:v', 'libx265', + '-preset', 'medium', + '-crf', '26', + '-tag:v', 'hvc1', + '-pix_fmt', 'yuv420p', + '-c:a', 'aac', + '-b:a', '192k', + '-movflags', '+faststart', + ]) + elif fmt == 'webm': + cmd.extend([ + '-c:v', 'libvpx-vp9', + '-b:v', '0', + '-crf', '32', + '-c:a', 'libopus', + '-b:a', '160k', + ]) + cmd.extend(['-f', 'webm']) + + # Audio formats + elif fmt == 'mp3': + cmd.extend([ + '-vn', + '-c:a', 'libmp3lame', + '-b:a', '192k', + ]) + cmd.extend(['-f', 'mp3']) + elif fmt == 'flac': + cmd.extend([ + '-vn', + '-c:a', 'flac', + ]) + cmd.extend(['-f', 'flac']) + elif fmt == 'wav': + cmd.extend([ + '-vn', + '-c:a', 'pcm_s16le', + ]) + cmd.extend(['-f', 'wav']) + elif fmt == 'aac': + cmd.extend([ + '-vn', + '-c:a', 'aac', + '-b:a', '192k', + ]) + cmd.extend(['-f', 'adts']) + elif fmt == 'm4a': + cmd.extend([ + '-vn', + '-c:a', 'aac', + '-b:a', '192k', + ]) + cmd.extend(['-f', 'ipod']) + elif fmt == 'ogg': + cmd.extend([ + '-vn', + '-c:a', 'libvorbis', + '-b:a', '192k', + ]) + cmd.extend(['-f', 'ogg']) + elif fmt == 'opus': + cmd.extend([ + '-vn', + '-c:a', 'libopus', + '-b:a', '192k', + ]) + cmd.extend(['-f', 'opus']) + elif fmt == 'audio': + # Legacy format name for mp3 + cmd.extend([ + '-vn', + '-c:a', 'libmp3lame', + '-b:a', '192k', + ]) + cmd.extend(['-f', 'mp3']) + elif fmt != 'copy': + raise ValueError(f'Unsupported format: {fmt}') + + cmd.append(str(output_path)) + return cmd +def hydrus_export(args, _parser) -> int: + output_path: Path = args.output + original_suffix = output_path.suffix + target_dir = output_path.parent + metadata_payload: Optional[dict[str, Any]] = None + metadata_raw = getattr(args, 'metadata_json', None) + if metadata_raw: + try: + parsed = json.loads(metadata_raw) + except json.JSONDecodeError as exc: + log(f"Invalid metadata JSON: {exc}", file=sys.stderr) + return 1 + if isinstance(parsed, dict): + metadata_payload = parsed + else: + log('[downlow.py] Metadata JSON must decode to an object', file=sys.stderr) + return 1 + ffmpeg_metadata = prepare_ffmpeg_metadata(metadata_payload) + + def _normalise_ext(value: Optional[str]) -> Optional[str]: + if not value: + return None + cleaned = value.strip() + if not cleaned: + return None + if not cleaned.startswith('.'): # tolerate inputs like "mp4" + cleaned = '.' 
+ cleaned.lstrip('.') + return cleaned + + def _extension_from_mime(mime: Optional[str]) -> Optional[str]: + if not mime: + return None + mime_map = { + # Images / bitmaps + 'image/jpeg': '.jpg', + 'image/jpg': '.jpg', + 'image/png': '.png', + 'image/gif': '.gif', + 'image/webp': '.webp', + 'image/avif': '.avif', + 'image/jxl': '.jxl', # JPEG XL + 'image/bmp': '.bmp', + 'image/heic': '.heic', + 'image/heif': '.heif', + 'image/x-icon': '.ico', + 'image/vnd.microsoft.icon': '.ico', + 'image/qoi': '.qoi', # Quite OK Image + 'image/tiff': '.tiff', + 'image/svg+xml': '.svg', + 'image/vnd.adobe.photoshop': '.psd', + + # Animation / sequence variants + 'image/apng': '.apng', + 'image/avif-sequence': '.avifs', + 'image/heic-sequence': '.heics', + 'image/heif-sequence': '.heifs', + + # Video + 'video/mp4': '.mp4', + 'video/webm': '.webm', + 'video/quicktime': '.mov', + 'video/ogg': '.ogv', + 'video/mpeg': '.mpeg', + 'video/x-msvideo': '.avi', + 'video/x-flv': '.flv', + 'video/x-matroska': '.mkv', + 'video/x-ms-wmv': '.wmv', + 'video/vnd.rn-realvideo': '.rv', + + # Audio + 'audio/mpeg': '.mp3', + 'audio/mp4': '.m4a', + 'audio/ogg': '.ogg', + 'audio/flac': '.flac', + 'audio/wav': '.wav', + 'audio/x-wav': '.wav', + 'audio/x-ms-wma': '.wma', + 'audio/x-tta': '.tta', + 'audio/vnd.wave': '.wav', + 'audio/x-wavpack': '.wv', + + # Documents / office + 'application/pdf': '.pdf', + 'application/epub+zip': '.epub', + 'application/vnd.djvu': '.djvu', + 'application/rtf': '.rtf', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx', + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx', + 'application/vnd.openxmlformats-officedocument.presentationml.presentation': '.pptx', + 'application/msword': '.doc', + 'application/vnd.ms-excel': '.xls', + 'application/vnd.ms-powerpoint': '.ppt', + + # Archive / comicbook / zip-like + 'application/zip': '.zip', + 'application/x-7z-compressed': '.7z', + 'application/x-rar-compressed': '.rar', + 'application/gzip': '.gz', + 'application/x-tar': '.tar', + 'application/x-cbz': '.cbz', # often just ZIP with images; CBZ is not an official mime type but used as mapping + + # App / project / other + 'application/clip': '.clip', # Clip Studio + 'application/x-krita': '.kra', + 'application/x-procreate': '.procreate', + 'application/x-shockwave-flash': '.swf', + } + + return mime_map.get(mime.lower()) + + def _extract_hash(file_url: str) -> Optional[str]: + match = re.search(r'[?&]hash=([0-9a-fA-F]+)', file_url) + return match.group(1) if match else None + + # Ensure output and temp directories exist using global helper + for dir_path in [target_dir, Path(args.tmp_dir) if args.tmp_dir else target_dir]: + try: + ensure_directory(dir_path) + except RuntimeError as exc: + log(f"{exc}", file=sys.stderr) + return 1 + + source_suffix = _normalise_ext(getattr(args, 'source_ext', None)) + if source_suffix and source_suffix.lower() == '.bin': + source_suffix = None + + if source_suffix is None: + hydrus_url = getattr(args, 'hydrus_url', None) + if not hydrus_url: + try: + from config import load_config, get_hydrus_url + hydrus_url = get_hydrus_url(load_config()) + except Exception as exc: + hydrus_url = None + if os.environ.get('DOWNLOW_DEBUG'): + log(f"hydrus-export could not load Hydrus URL: {exc}", file=sys.stderr) + if hydrus_url: + try: + setattr(args, 'hydrus_url', hydrus_url) + except Exception: + pass + resolved_suffix: Optional[str] = None + file_hash = getattr(args, 'file_hash', None) or _extract_hash(args.file_url) + if 
hydrus_url and file_hash: + try: + client = HydrusClient(base_url=hydrus_url, access_key=args.access_key, timeout=args.timeout) + meta_response = client.fetch_file_metadata(hashes=[file_hash], include_mime=True) + entries = meta_response.get('metadata') if isinstance(meta_response, dict) else None + if isinstance(entries, list) and entries: + entry = entries[0] + ext_value = _normalise_ext(entry.get('ext') if isinstance(entry, dict) else None) + if ext_value: + resolved_suffix = ext_value + else: + mime_value = entry.get('mime') if isinstance(entry, dict) else None + resolved_suffix = _extension_from_mime(mime_value) + except Exception as exc: # pragma: no cover - defensive + if os.environ.get('DOWNLOW_DEBUG'): + log(f"hydrus metadata fetch failed: {exc}", file=sys.stderr) + if not resolved_suffix: + fallback_suffix = _normalise_ext(original_suffix) + if fallback_suffix and fallback_suffix.lower() == '.bin': + fallback_suffix = None + resolved_suffix = fallback_suffix or '.hydrus' + source_suffix = resolved_suffix + + suffix = source_suffix or '.hydrus' + if suffix and output_path.suffix.lower() in {'', '.bin'}: + if output_path.suffix.lower() != suffix.lower(): + output_path = output_path.with_suffix(suffix) + target_dir = output_path.parent + # Determine temp directory (prefer provided tmp_dir, fallback to output location) + temp_dir = Path(getattr(args, 'tmp_dir', None) or target_dir) + try: + ensure_directory(temp_dir) + except RuntimeError: + temp_dir = target_dir + temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix, dir=str(temp_dir)) + temp_path = Path(temp_file.name) + temp_file.close() + downloaded_bytes = 0 + headers = { + 'Hydrus-Client-API-Access-Key': args.access_key, + } + try: + downloaded_bytes = download_hydrus_file(args.file_url, headers, temp_path, args.timeout) + if os.environ.get('DOWNLOW_DEBUG'): + log(f"hydrus-export downloaded {downloaded_bytes} bytes", file=sys.stderr) + except httpx.RequestError as exc: + if temp_path.exists(): + temp_path.unlink() + log(f"hydrus-export download failed: {exc}", file=sys.stderr) + return 1 + except Exception as exc: # pragma: no cover - unexpected + if temp_path.exists(): + temp_path.unlink() + log(f"hydrus-export error: {exc}", file=sys.stderr) + return 1 + ffmpeg_log: Optional[str] = None + converted_tmp: Optional[Path] = None + try: + final_target = unique_path(output_path) + if args.format == 'copy': + shutil.move(str(temp_path), str(final_target)) + result_path = final_target + else: + ffmpeg_path = shutil.which('ffmpeg') + if not ffmpeg_path: + raise RuntimeError('ffmpeg executable not found in PATH') + converted_tmp = final_target.with_suffix(final_target.suffix + '.part') + if converted_tmp.exists(): + converted_tmp.unlink() + max_width = args.max_width if args.max_width and args.max_width > 0 else 0 + cmd = build_ffmpeg_command(ffmpeg_path, temp_path, converted_tmp, args.format, max_width, metadata=ffmpeg_metadata if ffmpeg_metadata else None) + if os.environ.get('DOWNLOW_DEBUG'): + log(f"ffmpeg command: {' '.join(cmd)}", file=sys.stderr) + completed = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=False, + text=True, + ) + ffmpeg_log = (completed.stderr or '').strip() + if completed.returncode != 0: + error_details = ffmpeg_log or (completed.stdout or '').strip() + raise RuntimeError( + f'ffmpeg failed with exit code {completed.returncode}' + + (f': {error_details}' if error_details else '') + ) + shutil.move(str(converted_tmp), str(final_target)) + result_path = 
final_target + apply_mutagen_metadata(result_path, ffmpeg_metadata, args.format) + result_size = result_path.stat().st_size if result_path.exists() else None + payload: dict[str, object] = {'output': str(result_path)} + if downloaded_bytes: + payload['source_bytes'] = downloaded_bytes + if result_size is not None: + payload['size_bytes'] = result_size + if metadata_payload: + payload['metadata_keys'] = sorted(ffmpeg_metadata.keys()) if ffmpeg_metadata else [] + log(json.dumps(payload, ensure_ascii=False)) + if ffmpeg_log: + log(ffmpeg_log, file=sys.stderr) + return 0 + except Exception as exc: + log(f"hydrus-export failed: {exc}", file=sys.stderr) + return 1 + finally: + if temp_path.exists(): + try: + temp_path.unlink() + except OSError: + pass + if converted_tmp and converted_tmp.exists(): + try: + converted_tmp.unlink() + except OSError: + pass + + +# ============================================================================ +# Hydrus Wrapper Functions - Utilities for client initialization and config +# ============================================================================ +# This section consolidates functions formerly in hydrus_wrapper.py +# Provides: supported filetypes, client initialization, caching, service resolution + + +# Official Hydrus supported filetypes +# Source: https://hydrusnetwork.github.io/hydrus/filetypes.html +SUPPORTED_FILETYPES = { + # Images + 'image': { + '.jpeg': 'image/jpeg', + '.jpg': 'image/jpeg', + '.png': 'image/png', + '.gif': 'image/gif', + '.webp': 'image/webp', + '.avif': 'image/avif', + '.jxl': 'image/jxl', + '.bmp': 'image/bmp', + '.heic': 'image/heic', + '.heif': 'image/heif', + '.ico': 'image/x-icon', + '.qoi': 'image/qoi', + '.tiff': 'image/tiff', + }, + # Animated Images + 'animation': { + '.apng': 'image/apng', + '.avifs': 'image/avif-sequence', + '.heics': 'image/heic-sequence', + '.heifs': 'image/heif-sequence', + }, + # Video + 'video': { + '.mp4': 'video/mp4', + '.webm': 'video/webm', + '.mkv': 'video/x-matroska', + '.avi': 'video/x-msvideo', + '.flv': 'video/x-flv', + '.mov': 'video/quicktime', + '.mpeg': 'video/mpeg', + '.ogv': 'video/ogg', + '.rm': 'video/vnd.rn-realvideo', + '.wmv': 'video/x-ms-wmv', + }, + # Audio + 'audio': { + '.mp3': 'audio/mp3', + '.ogg': 'audio/ogg', + '.flac': 'audio/flac', + '.m4a': 'audio/mp4', + '.mka': 'audio/x-matroska', + '.mkv': 'audio/x-matroska', + '.mp4': 'audio/mp4', + '.ra': 'audio/vnd.rn-realaudio', + '.tta': 'audio/x-tta', + '.wav': 'audio/x-wav', + '.wv': 'audio/wavpack', + '.wma': 'audio/x-ms-wma', + }, + # Applications & Documents + 'application': { + '.swf': 'application/x-shockwave-flash', + '.pdf': 'application/pdf', + '.epub': 'application/epub+zip', + '.djvu': 'image/vnd.djvu', + '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', + '.doc': 'application/msword', + '.xls': 'application/vnd.ms-excel', + '.ppt': 'application/vnd.ms-powerpoint', + '.rtf': 'application/rtf', + }, + # Image Project Files + 'project': { + '.clip': 'application/clip1', + '.kra': 'application/x-krita', + '.procreate': 'application/x-procreate1', + '.psd': 'image/vnd.adobe.photoshop', + '.sai2': 'application/sai21', + '.svg': 'image/svg+xml', + '.xcf': 'application/x-xcf', + }, + # Archives + 'archive': { + '.cbz': 'application/vnd.comicbook+zip', + '.7z': 'application/x-7z-compressed', + '.gz': 'application/gzip', 
+ '.rar': 'application/vnd.rar', + '.zip': 'application/zip', + }, +} + +# Flatten to get all supported extensions +ALL_SUPPORTED_EXTENSIONS = set() +for category_extensions in SUPPORTED_FILETYPES.values(): + ALL_SUPPORTED_EXTENSIONS.update(category_extensions.keys()) + + +# Global Hydrus client cache to reuse session keys +_hydrus_client_cache: dict[str, Any] = {} + +# Cache Hydrus availability across the session +_HYDRUS_AVAILABLE: Optional[bool] = None +_HYDRUS_UNAVAILABLE_REASON: Optional[str] = None + + +def reset_cache() -> None: + """Reset the availability cache (useful for testing).""" + global _HYDRUS_AVAILABLE, _HYDRUS_UNAVAILABLE_REASON + _HYDRUS_AVAILABLE = None + _HYDRUS_UNAVAILABLE_REASON = None + + +def is_available(config: dict[str, Any], use_cache: bool = True) -> tuple[bool, Optional[str]]: + """Check if Hydrus is available and accessible. + + Performs a lightweight probe to verify: + - Hydrus URL is configured + - Hydrus client library is available + - Can connect to Hydrus and retrieve services + + Results are cached per session unless use_cache=False. + + Args: + config: Configuration dict with Hydrus settings + use_cache: If True, use cached result from previous probe + + Returns: + Tuple of (is_available: bool, reason: Optional[str]) + reason is None if available, or an error message if not + """ + global _HYDRUS_AVAILABLE, _HYDRUS_UNAVAILABLE_REASON + + if use_cache and _HYDRUS_AVAILABLE is not None: + return _HYDRUS_AVAILABLE, _HYDRUS_UNAVAILABLE_REASON + + # Use new config helpers first, fallback to old method + from config import get_hydrus_url, get_hydrus_access_key + + url = (get_hydrus_url(config, "home") or "").strip() + if not url: + reason = "Hydrus URL not configured (check config.json HydrusNetwork.home.url)" + _HYDRUS_AVAILABLE = False + _HYDRUS_UNAVAILABLE_REASON = reason + return False, reason + + access_key = get_hydrus_access_key(config, "home") or "" + timeout_raw = config.get("HydrusNetwork_Request_Timeout") + try: + timeout = float(timeout_raw) if timeout_raw is not None else 10.0 + except (TypeError, ValueError): + timeout = 10.0 + + try: + client = HydrusClient(url, access_key, timeout) + # Lightweight probe: get services + # Temporarily suppress error logging for health checks (expected to fail if Hydrus unavailable) + hydrus_logger = logging.getLogger("helper.hydrus") + original_level = hydrus_logger.level + hydrus_logger.setLevel(logging.CRITICAL) # Suppress errors/warnings + try: + _ = client.get_services() + _HYDRUS_AVAILABLE = True + _HYDRUS_UNAVAILABLE_REASON = None + return True, None + finally: + hydrus_logger.setLevel(original_level) + except Exception as exc: + reason = str(exc) + _HYDRUS_AVAILABLE = False + _HYDRUS_UNAVAILABLE_REASON = reason + return False, reason + + +def is_hydrus_available(config: dict[str, Any]) -> bool: + """Check if Hydrus is available without raising. + + Args: + config: Configuration dict + + Returns: + True if Hydrus is available, False otherwise + """ + available, _ = is_available(config) + return available + + +def get_client(config: dict[str, Any]) -> HydrusClient: + """Create and return a Hydrus client with session key authentication. + + Reuses cached client instance to preserve session keys across requests. 
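+        Clients are cached per (URL, access key) pair, so repeated calls within the
+        same process reuse the cached session key.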
+ + Args: + config: Configuration dict with Hydrus settings + + Returns: + HydrusClient instance (with active session key) + + Raises: + RuntimeError: If Hydrus is not configured or unavailable + """ + # Check availability first - if unavailable, raise immediately + available, reason = is_available(config) + if not available: + raise RuntimeError(f"Hydrus is unavailable: {reason}") + + from config import get_hydrus_url, get_hydrus_access_key + + # Use new config helpers + hydrus_url = (get_hydrus_url(config, "home") or "").strip() + if not hydrus_url: + raise RuntimeError("Hydrus URL is not configured (check config.json HydrusNetwork.home.url)") + + access_key = get_hydrus_access_key(config, "home") or "" + timeout_raw = config.get("HydrusNetwork_Request_Timeout") + try: + timeout = float(timeout_raw) if timeout_raw is not None else 60.0 + except (TypeError, ValueError): + timeout = 60.0 + + # Create cache key from URL and access key + cache_key = f"{hydrus_url}#{access_key}" + + # Check if we have a cached client + if cache_key in _hydrus_client_cache: + cached_client = _hydrus_client_cache[cache_key] + # If cached client has a session key, reuse it (don't re-acquire) + if hasattr(cached_client, '_session_key') and cached_client._session_key: + debug(f"Reusing cached session key for {hydrus_url}") + return cached_client + # If no session key in cache, try to get one + try: + cached_client.ensure_session_key() + return cached_client + except Exception as e: + # If verification fails, remove from cache and create new one + debug(f"Cached client invalid, creating new: {e}") + del _hydrus_client_cache[cache_key] + + # Create new client + client = HydrusClient(hydrus_url, access_key, timeout) + + # Acquire session key for secure authentication + try: + client.ensure_session_key() + except HydrusConnectionError: + # This should not happen since we checked availability above + debug(f"Hydrus service unavailable during client creation") + raise RuntimeError("Hydrus is unavailable") from None + except Exception as e: + # Log other exceptions but don't fail - client can still work with access_key + debug(f"Warning: Could not acquire session key: {e}") + + # Cache the client + _hydrus_client_cache[cache_key] = client + + return client + + +def get_tag_service_name(config: dict[str, Any]) -> str: + """Get the name of the tag service to use for tagging operations. + + Currently always returns "my tags" to avoid remote service errors. + + Args: + config: Configuration dict (not currently used) + + Returns: + Service name string, typically "my tags" + """ + # Always use 'my tags' to avoid remote service errors + return "my tags" + + +def get_tag_service_key(client: HydrusClient, fallback_name: str = "my tags") -> Optional[str]: + """Get the service key for a named tag service. + + Queries the Hydrus client's services and finds the service key matching + the given name. 
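+        Matching is case-insensitive and checks every service group returned by
+        the client's get_services() call.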
+ + Args: + client: HydrusClient instance + fallback_name: Name of the service to find (e.g., "my tags") + + Returns: + Service key string if found, None otherwise + """ + try: + services = client.get_services() + except Exception: + return None + + if not isinstance(services, dict): + return None + + # Hydrus returns services grouped by type; walk all lists and match on name + for group in services.values(): + if not isinstance(group, list): + continue + for item in group: + if not isinstance(item, dict): + continue + name = str(item.get("name") or "").strip().lower() + key = item.get("service_key") or item.get("key") + if name == fallback_name.lower() and key: + return str(key) + + return None + + +def is_request_error(exc: Exception) -> bool: + """Check if an exception is a Hydrus request error. + + Args: + exc: Exception to check + + Returns: + True if this is a HydrusRequestError + """ + return isinstance(exc, HydrusRequestError) + + +CHUNK_SIZE = 1024 * 1024 # 1 MiB + +def download_hydrus_file(file_url: str, headers: dict[str, str], destination: Path, timeout: float) -> int: + """Download *file_url* into *destination* returning the byte count with progress bar.""" + from .progress import print_progress, print_final_progress + + downloaded = 0 + start_time = time.time() + last_update = start_time + + # Try to get file size from headers if available + file_size = None + with HTTPClient(timeout=timeout, headers=headers) as client: + response = client.get(file_url) + response.raise_for_status() + + # Try to get size from content-length header + try: + file_size = int(response.headers.get('content-length', 0)) + except (ValueError, TypeError): + file_size = None + + filename = destination.name + + with destination.open('wb') as handle: + for chunk in response.iter_bytes(CHUNK_SIZE): + if not chunk: + break + handle.write(chunk) + downloaded += len(chunk) + + # Update progress every 0.5 seconds if we know total size + if file_size: + now = time.time() + if now - last_update >= 0.5: + elapsed = now - start_time + speed = downloaded / elapsed if elapsed > 0 else 0 + print_progress(filename, downloaded, file_size, speed) + last_update = now + + # Print final progress line if we tracked it + if file_size: + elapsed = time.time() - start_time + print_final_progress(filename, file_size, elapsed) + + return downloaded diff --git a/helper/libgen_service.py b/helper/libgen_service.py new file mode 100644 index 0000000..90def52 --- /dev/null +++ b/helper/libgen_service.py @@ -0,0 +1,377 @@ +"""Shared Library Genesis search and download helpers.""" +from __future__ import annotations + +from pathlib import Path +from typing import Any, Callable, Dict, Iterable, List, Optional +import logging +import requests +from urllib.parse import quote, urljoin + +from libgen import search_sync, LibgenError + +LogFn = Optional[Callable[[str], None]] +ErrorFn = Optional[Callable[[str], None]] + +DEFAULT_TIMEOUT = 10.0 +DEFAULT_LIMIT = 50 + +logging.getLogger(__name__).setLevel(logging.WARNING) + + +def _call(logger: LogFn, message: str) -> None: + if logger: + logger(message) + + +def search_libgen_no_ads(query: str, session: Optional[requests.Session] = None) -> List[Dict[str, Any]]: + """Search Libgen without triggering ads.php requests.""" + try: + from bs4 import BeautifulSoup + except ImportError: # pragma: no cover + logging.warning("BeautifulSoup not available; falling back to standard search") + return [] + + mirrors = [ + "https://libgen.gl", + "https://libgen.vg", + "https://libgen.la", + 
"https://libgen.bz", + "https://libgen.gs", + ] + + session = session or requests.Session() + session.headers.setdefault( + "User-Agent", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + ) + + for mirror in mirrors: + try: + search_url = f"{mirror}/index.php?req={quote(query)}&res=100&covers=on&filesuns=all" + response = session.get(search_url, timeout=DEFAULT_TIMEOUT) + if response.status_code != 200: + continue + + soup = BeautifulSoup(response.content, "html.parser") + table = soup.find("table", {"class": "catalog"}) + if table is None: + for candidate in soup.find_all("table"): + rows = candidate.find_all("tr") + if len(rows) > 2: + table = candidate + break + if table is None: + logging.debug("[libgen_no_ads] No results table on %s", mirror) + continue + + rows = table.find_all("tr")[1:] + results: List[Dict[str, Any]] = [] + for row in rows: + try: + cells = row.find_all("td") + if len(cells) < 9: + continue + + size_cell = cells[7] + file_link = size_cell.find("a") + mirror_link = "" + if file_link: + href = str(file_link.get("href", "")) + if href.startswith("/"): + mirror_link = mirror + href + elif href: + mirror_link = urljoin(mirror, href) + + if not mirror_link: + title_link = cells[1].find("a") if len(cells) > 1 else None + if title_link: + href = str(title_link.get("href", "")) + if href.startswith("/"): + mirror_link = mirror + href + elif href: + mirror_link = urljoin(mirror, href) + + if not mirror_link: + continue + + results.append( + { + "id": "", + "mirror": mirror_link, + "cover": "", + "title": cells[1].get_text(strip=True) if len(cells) > 1 else "Unknown", + "authors": [cells[2].get_text(strip=True)] + if len(cells) > 2 + else ["Unknown"], + "publisher": cells[3].get_text(strip=True) if len(cells) > 3 else "", + "year": cells[4].get_text(strip=True) if len(cells) > 4 else "", + "pages": cells[6].get_text(strip=True) if len(cells) > 6 else "", + "language": cells[5].get_text(strip=True) if len(cells) > 5 else "", + "size": cells[7].get_text(strip=True) if len(cells) > 7 else "", + "extension": cells[8].get_text(strip=True) if len(cells) > 8 else "", + "isbn": "", + } + ) + except Exception as exc: # pragma: no cover - defensive + logging.debug("[libgen_no_ads] Error parsing row: %s", exc) + continue + + if results: + logging.info("[libgen_no_ads] %d results from %s", len(results), mirror) + return results + except Exception as exc: # pragma: no cover - mirror issues + logging.debug("[libgen_no_ads] Mirror %s failed: %s", mirror, exc) + continue + + return [] + + +def format_book_info(book: Any) -> Dict[str, Any]: + """Format Libgen search result into a consistent dictionary.""" + filesize_bytes = 0 + size_str = getattr(book, "size", "") or "" + if size_str: + parts = size_str.strip().split() + try: + value = float(parts[0]) + unit = parts[1].upper() if len(parts) > 1 else "B" + if unit in {"MB", "M"}: + filesize_bytes = int(value * 1024 * 1024) + elif unit in {"GB", "G"}: + filesize_bytes = int(value * 1024 * 1024 * 1024) + elif unit in {"KB", "K"}: + filesize_bytes = int(value * 1024) + else: + filesize_bytes = int(value) + except (ValueError, IndexError): # pragma: no cover - defensive + filesize_bytes = 0 + + title = getattr(book, "title", "") or "" + isbn = getattr(book, "isbn", "") or "" + if not isbn and title: + import re + + match = re.search( + r"((?:[\d]{10,13}(?:\s*[;,]\s*[\d]{10,13})+)|(?:[\d]{10,13})(?:\s*[;,]?\s*[\d\-]{0,50})?)\s*(?:\b|$)", + title, + ) + if match: + potential_isbn = match.group(0).strip() + if 
re.search(r"\d{10,13}", potential_isbn): + isbn = potential_isbn + title = re.sub(r"\s+[a-z]\s*$", "", title[: match.start()].strip(), flags=re.IGNORECASE) + + authors_value = getattr(book, "authors", None) + if isinstance(authors_value, Iterable) and not isinstance(authors_value, str): + authors_str = ", ".join(str(author) for author in authors_value) + else: + authors_str = str(authors_value or "Unknown") + + download_links = getattr(book, "download_links", None) + mirror_url = None + if download_links and getattr(download_links, "get_link", None): + mirror_url = download_links.get_link + + return { + "title": title or "Unknown", + "author": authors_str, + "publisher": getattr(book, "publisher", "") or "", + "year": getattr(book, "year", "") or "", + "pages": getattr(book, "pages", "") or "", + "language": getattr(book, "language", "") or "", + "filesize": filesize_bytes, + "filesize_str": size_str or "Unknown", + "extension": getattr(book, "extension", "") or "", + "isbn": isbn, + "mirror_url": mirror_url, + } + + +def search_libgen( + query: str, + limit: int = DEFAULT_LIMIT, + *, + log_info: LogFn = None, + log_error: ErrorFn = None, + session: Optional[requests.Session] = None, +) -> List[Dict[str, Any]]: + """Search Libgen returning formatted dictionaries with multiple mirrors. + + Uses HTML scraper (search_libgen_no_ads) to find books quickly. + Returns mirror URLs and book IDs that can be used to generate alternative mirrors. + """ + try: + _call(log_info, f"[search] Searching Libgen for: {query}") + session = session or requests.Session() + + # Use HTML scraper - more reliable and doesn't hang on mirror resolution + _call(log_info, "[search] Using HTML scraper (search_libgen_no_ads)...") + results: List[Any] = search_libgen_no_ads(query, session=session) + + if not results: + _call(log_info, "[search] No results from HTML scraper") + return [] + + formatted: List[Dict[str, Any]] = [] + mirrors_list = [ + "https://libgen.gl", + "https://libgen.vg", + "https://libgen.la", + "https://libgen.bz", + "https://libgen.gs", + ] + + for book in results[:limit]: + if isinstance(book, dict): + # Result from search_libgen_no_ads (HTML scraper) + authors = book.get("authors", ["Unknown"]) + if isinstance(authors, list): + author_value = ", ".join(str(a) for a in authors) + else: + author_value = str(authors) + + # Extract book ID from mirror URL if available + mirror = book.get("mirror", "") + book_id = "" + if mirror and "/file.php?id=" in mirror: + try: + book_id = mirror.split("/file.php?id=")[1].split("&")[0] + except (IndexError, ValueError): + pass + + # Build list of alternative mirrors based on book ID + mirrors_dict = {} + if book_id: + for mirror_base in mirrors_list: + mirrors_dict[mirror_base] = f"{mirror_base}/file.php?id={book_id}" + elif mirror: + # Fallback: use the mirror we found + mirrors_dict["primary"] = mirror + + formatted.append( + { + "title": book.get("title", "Unknown"), + "author": author_value, + "publisher": book.get("publisher", ""), + "year": book.get("year", ""), + "pages": book.get("pages", ""), + "language": book.get("language", ""), + "filesize": 0, + "filesize_str": book.get("size", "Unknown"), + "extension": book.get("extension", ""), + "isbn": book.get("isbn", ""), + "mirror_url": mirror, # Primary mirror + "mirrors": mirrors_dict, # Alternative mirrors + "book_id": book_id, + } + ) + else: + # Fallback: try to format as book object + try: + formatted.append(format_book_info(book)) + except Exception: + pass + + _call(log_info, f"[search] Found 
{len(formatted)} result(s)") + return formatted + except LibgenError as exc: + _call(log_error, f"[search] Libgen error: {exc}") + return [] + except Exception as exc: # pragma: no cover - defensive + _call(log_error, f"[search] Error: {exc}") + return [] + + +def download_from_mirror( + mirror_url: str, + output_path: str | Path, + *, + log_info: LogFn = None, + log_error: ErrorFn = None, + session: Optional[requests.Session] = None, +) -> bool: + """Download a Libgen file and write it to disk. + + Handles Libgen redirects and ensures proper file download by: + - Following all redirects (default behavior) + - Setting User-Agent header (required by some mirrors) + - Validating that we're downloading binary content, not HTML + - Attempting alternative download method if HTML is returned + """ + session = session or requests.Session() + try: + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + + _call(log_info, f"[download] Downloading from mirror: {mirror_url}") + + # Ensure session has proper headers for Libgen + if 'User-Agent' not in session.headers: + session.headers['User-Agent'] = ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + ) + + # Download with redirects enabled (default) and referer + session.headers['Referer'] = 'https://libgen.gs/' + response = session.get(mirror_url, stream=True, timeout=30, allow_redirects=True) + response.raise_for_status() + + # Check if we got HTML instead of a file (common Libgen issue) + content_type = response.headers.get('content-type', '').lower() + if 'text/html' in content_type: + _call(log_error, f"[download] Server returned HTML. Trying alternative method...") + + # Try to extract file ID and use alternative CDN + try: + # Parse the HTML to extract MD5 or file ID + from bs4 import BeautifulSoup + soup = BeautifulSoup(response.text, 'html.parser') + + # Look for download link in the HTML + # Common patterns: md5 hash in form, or direct link in anchor tags + download_link = None + + # Try to find forms that might contain download functionality + forms = soup.find_all('form') + for form in forms: + action = form.get('action', '') + if 'download' in action.lower() or 'get' in action.lower(): + download_link = action + break + + if not download_link: + _call(log_error, f"[download] Could not extract alternative download link from HTML") + return False + + _call(log_info, f"[download] Using alternative download method: {download_link[:100]}") + # Try downloading from alternative link + response2 = session.get(download_link, stream=True, timeout=30, allow_redirects=True) + response2.raise_for_status() + response = response2 # Use the new response + + except Exception as alt_error: + _call(log_error, f"[download] Alternative method failed: {alt_error}") + return False + + total_size = int(response.headers.get("content-length", 0)) + downloaded = 0 + + with open(output_path, "wb") as handle: + for chunk in response.iter_content(chunk_size=8192): + if not chunk: + continue + handle.write(chunk) + downloaded += len(chunk) + if total_size > 0: + percent = downloaded / total_size * 100 + _call( + log_info, + f"[download] {percent:.1f}% - {downloaded // (1024*1024)}MB / {total_size // (1024*1024)}MB", + ) + + _call(log_info, f"[download] Downloaded successfully to: {output_path}") + return True + except Exception as exc: # pragma: no cover - defensive + _call(log_error, f"[download] Error: {exc}") + return False diff --git 
a/helper/local_library.py b/helper/local_library.py new file mode 100644 index 0000000..1d4f2d9 --- /dev/null +++ b/helper/local_library.py @@ -0,0 +1,1395 @@ +"""Unified local library management system combining database, initialization, migration, and search. + +This module provides: +- SQLite database management for local file metadata caching +- Library scanning and database initialization +- Sidecar file migration from old .tags/.metadata files to database +- Optimized search functionality using database indices +- Worker task tracking for background operations +""" + +from __future__ import annotations + +import sqlite3 +import json +import logging +import subprocess +import shutil +from datetime import datetime +from pathlib import Path +from typing import Optional, Dict, Any, List, Tuple, Set + +logger = logging.getLogger(__name__) + +# Try to import optional dependencies +try: + import mutagen +except ImportError: + mutagen = None + +try: + from metadata import ( + _read_sidecar_metadata, + _derive_sidecar_path, + write_tags, + write_tags_to_file, + embed_metadata_in_file, + read_tags_from_file, + ) + METADATA_AVAILABLE = True +except ImportError: + _read_sidecar_metadata = None + _derive_sidecar_path = None + write_tags = None + write_tags_to_file = None + embed_metadata_in_file = None + read_tags_from_file = None + METADATA_AVAILABLE = False + +# Media extensions to index +MEDIA_EXTENSIONS = { + '.mp4', '.mkv', '.mka', '.webm', '.avi', '.mov', '.flv', '.wmv', '.m4v', + '.mp3', '.flac', '.wav', '.aac', '.ogg', '.m4a', '.wma', + '.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.tiff', + '.pdf', '.epub', '.txt', '.docx', '.doc' +} + + +# ============================================================================ +# SIDECAR FILE HANDLING +# ============================================================================ + +def read_sidecar(sidecar_path: Path) -> Tuple[Optional[str], List[str], List[str]]: + """Read metadata from a sidecar file. + + Delegates to metadata._read_sidecar_metadata for centralized handling. + + Args: + sidecar_path: Path to .tags sidecar file + + Returns: + Tuple of (hash_value, tags_list, urls_list) + Returns (None, [], []) if file doesn't exist or can't be read + """ + if _read_sidecar_metadata is None: + return None, [], [] + + try: + return _read_sidecar_metadata(sidecar_path) + except Exception: + return None, [], [] + + +def write_sidecar(media_path: Path, tags: List[str], known_urls: List[str], + hash_value: Optional[str] = None) -> bool: + """Write metadata to a sidecar file. + + Delegates to metadata.write_tags for centralized handling. + + Args: + media_path: Path to the media file (sidecar created as media_path.tags) + tags: List of tag strings + known_urls: List of known URL strings + hash_value: Optional SHA256 hash to include + + Returns: + True if successful, False otherwise + """ + if write_tags is None: + return False + + if media_path.exists() and media_path.is_dir(): + return False + + try: + write_tags(media_path, tags, known_urls, hash_value) + return True + except Exception: + return False + + +def find_sidecar(media_path: Path) -> Optional[Path]: + """Find the sidecar file for a media path. + + Uses metadata._derive_sidecar_path for centralized handling. 
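+    Only the current filename.ext.tags naming is checked; if that sidecar does
+    not exist, None is returned rather than probing any other location.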
+ + Args: + media_path: Path to media file + + Returns: + Path to existing sidecar file, or None if not found + """ + if media_path.is_dir(): + return None + + if _derive_sidecar_path is None: + return None + + try: + # Check for new format: filename.ext.tags + sidecar_path = _derive_sidecar_path(media_path) + if sidecar_path.exists(): + return sidecar_path + except OSError: + pass + + return None + + +def has_sidecar(media_path: Path) -> bool: + """Check if a media file has a sidecar.""" + return find_sidecar(media_path) is not None + +class LocalLibraryDB: + """SQLite database for caching local library metadata.""" + + DB_NAME = ".downlow_library.db" + SCHEMA_VERSION = 2 + + def __init__(self, library_root: Path): + """Initialize the database at the library root. + + Args: + library_root: Path to the local library root directory + """ + self.library_root = Path(library_root) + self.db_path = self.library_root / self.DB_NAME + self.connection: Optional[sqlite3.Connection] = None + self._init_db() + + def _init_db(self) -> None: + """Initialize database connection and create tables if needed.""" + try: + # Use check_same_thread=False to allow multi-threaded access + # This is safe because we're not sharing connections across threads; + # each thread will get its own cursor + self.connection = sqlite3.connect(str(self.db_path), check_same_thread=False) + self.connection.row_factory = sqlite3.Row + self.connection.execute("PRAGMA foreign_keys = ON") + self._create_tables() + logger.info(f"Database initialized at {self.db_path}") + except Exception as e: + logger.error(f"Failed to initialize database: {e}", exc_info=True) + raise + + def _create_tables(self) -> None: + """Create database tables if they don't exist.""" + cursor = self.connection.cursor() + + cursor.execute(""" + CREATE TABLE IF NOT EXISTS files ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + file_path TEXT UNIQUE NOT NULL, + file_hash TEXT, + file_size INTEGER, + file_modified REAL, + indexed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ) + """) + + cursor.execute(""" + CREATE TABLE IF NOT EXISTS metadata ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + file_id INTEGER UNIQUE NOT NULL, + hash TEXT, + known_urls TEXT, + relationships TEXT, + duration REAL, + size INTEGER, + ext TEXT, + media_type TEXT, + media_kind TEXT, + time_imported TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + time_modified TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE + ) + """) + + cursor.execute(""" + CREATE TABLE IF NOT EXISTS tags ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + file_id INTEGER NOT NULL, + tag TEXT NOT NULL, + tag_type TEXT DEFAULT 'user', + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE, + UNIQUE(file_id, tag) + ) + """) + + cursor.execute(""" + CREATE TABLE IF NOT EXISTS notes ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + file_id INTEGER UNIQUE NOT NULL, + note TEXT, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE + ) + """) + + # Worker tracking tables (drop legacy workers table if still present) + self._ensure_worker_tables(cursor) + + # Create indices for performance + cursor.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(file_path)") + cursor.execute("CREATE INDEX IF 
NOT EXISTS idx_files_hash ON files(file_hash)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_tags_file_id ON tags(file_id)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_tags_tag ON tags(tag)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_metadata_file_id ON metadata(file_id)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_worker_id ON worker(worker_id)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_worker_status ON worker(status)") + cursor.execute("CREATE INDEX IF NOT EXISTS idx_worker_type ON worker(worker_type)") + + self._migrate_metadata_schema(cursor) + self.connection.commit() + logger.debug("Database tables created/verified") + + def _ensure_worker_tables(self, cursor) -> None: + """Ensure the modern worker tables exist, dropping legacy ones if needed.""" + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='worker'") + has_worker = cursor.fetchone() is not None + if not has_worker: + cursor.execute("DROP TABLE IF EXISTS workers") + cursor.execute(""" + CREATE TABLE worker ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + worker_id TEXT UNIQUE NOT NULL, + worker_type TEXT NOT NULL, + pipe TEXT, + status TEXT DEFAULT 'running', + title TEXT, + description TEXT, + progress REAL DEFAULT 0.0, + current_step TEXT, + total_steps INTEGER DEFAULT 0, + error_message TEXT, + result_data TEXT, + stdout TEXT DEFAULT '', + steps TEXT DEFAULT '', + started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + completed_at TIMESTAMP, + last_stdout_at TIMESTAMP, + last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ) + """) + else: + self._ensure_worker_columns(cursor) + + cursor.execute(""" + CREATE TABLE IF NOT EXISTS worker_log ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + worker_id TEXT NOT NULL, + event_type TEXT NOT NULL, + step TEXT, + channel TEXT, + message TEXT, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY(worker_id) REFERENCES worker(worker_id) ON DELETE CASCADE + ) + """) + cursor.execute("CREATE INDEX IF NOT EXISTS idx_worker_log_worker_id ON worker_log(worker_id)") + + def _ensure_worker_columns(self, cursor) -> None: + """Backfill columns for older worker tables during upgrade.""" + try: + cursor.execute('PRAGMA table_info(worker)') + existing_columns = {row[1] for row in cursor.fetchall()} + except Exception as exc: + logger.error(f"Error introspecting worker table: {exc}") + return + column_specs = { + 'pipe': "TEXT", + 'progress': "REAL DEFAULT 0.0", + 'current_step': "TEXT", + 'total_steps': "INTEGER DEFAULT 0", + 'stdout': "TEXT DEFAULT ''", + 'steps': "TEXT DEFAULT ''", + 'last_stdout_at': "TIMESTAMP" + } + for col_name, ddl in column_specs.items(): + if col_name not in existing_columns: + try: + cursor.execute(f"ALTER TABLE worker ADD COLUMN {col_name} {ddl}") + logger.info(f"Added '{col_name}' column to worker table") + except Exception as exc: + logger.warning(f"Could not add '{col_name}' column to worker table: {exc}") + + def _insert_worker_log_entry(self, cursor, worker_id: str, event_type: str, + message: str, step: Optional[str] = None, + channel: Optional[str] = None) -> None: + if not message: + return + cursor.execute(""" + INSERT INTO worker_log (worker_id, event_type, step, channel, message) + VALUES (?, ?, ?, ?, ?) 
+ """, (worker_id, event_type, step, channel, message)) + + def get_worker_events(self, worker_id: str, limit: int = 500) -> List[Dict[str, Any]]: + """Return chronological worker log events for timelines.""" + try: + cursor = self.connection.cursor() + cursor.execute(""" + SELECT id, event_type, step, channel, message, created_at + FROM worker_log + WHERE worker_id = ? + ORDER BY id ASC + LIMIT ? + """, (worker_id, limit)) + return [dict(row) for row in cursor.fetchall()] + except Exception as exc: + logger.error(f"Error retrieving worker events for {worker_id}: {exc}", exc_info=True) + return [] + + def clear_worker_events(self, worker_id: str, event_type: Optional[str] = None) -> None: + """Remove worker log entries, optionally filtered by event type.""" + try: + cursor = self.connection.cursor() + if event_type: + cursor.execute( + "DELETE FROM worker_log WHERE worker_id = ? AND event_type = ?", + (worker_id, event_type) + ) + else: + cursor.execute("DELETE FROM worker_log WHERE worker_id = ?", (worker_id,)) + self.connection.commit() + except Exception as exc: + logger.error(f"Error clearing worker log for {worker_id}: {exc}", exc_info=True) + + def _migrate_metadata_schema(self, cursor) -> None: + """Add missing columns to metadata table if they don't exist.""" + try: + cursor.execute('PRAGMA table_info(metadata)') + existing_columns = {row[1] for row in cursor.fetchall()} + + for col_name, col_def in [('size', 'INTEGER'), ('ext', 'TEXT'), + ('time_imported', 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP'), + ('time_modified', 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP')]: + if col_name not in existing_columns: + try: + cursor.execute(f"ALTER TABLE metadata ADD COLUMN {col_name} {col_def}") + logger.info(f"Added '{col_name}' column to metadata table") + except Exception as e: + logger.warning(f"Could not add '{col_name}' column: {e}") + except Exception as e: + logger.error(f"Error during metadata schema migration: {e}") + + def _update_metadata_modified_time(self, file_id: int) -> None: + """Update the time_modified timestamp for a file's metadata.""" + try: + cursor = self.connection.cursor() + cursor.execute(""" + UPDATE metadata SET time_modified = CURRENT_TIMESTAMP WHERE file_id = ? + """, (file_id,)) + self.connection.commit() + except Exception as e: + logger.debug(f"Could not update metadata modified time for file_id {file_id}: {e}") + + def get_or_create_file_entry(self, file_path: Path) -> int: + """Get or create a file entry in the database.""" + try: + str_path = str(file_path.resolve()) + logger.debug(f"[get_or_create_file_entry] Looking up: {str_path}") + + cursor = self.connection.cursor() + + cursor.execute("SELECT id FROM files WHERE file_path = ?", (str_path,)) + row = cursor.fetchone() + + if row: + logger.debug(f"[get_or_create_file_entry] Found existing file_id: {row[0]}") + return row[0] + + logger.debug(f"[get_or_create_file_entry] File entry not found, creating new one") + stat = file_path.stat() + cursor.execute(""" + INSERT INTO files (file_path, file_size, file_modified) + VALUES (?, ?, ?) 
+ """, (str_path, stat.st_size, stat.st_mtime)) + + file_id = cursor.lastrowid + logger.debug(f"[get_or_create_file_entry] Created new file_id: {file_id}") + + # Auto-create title tag + filename_without_ext = file_path.stem + if filename_without_ext: + # Normalize underscores to spaces for consistency + title_value = filename_without_ext.replace("_", " ").strip() + title_tag = f"title:{title_value}" + cursor.execute(""" + INSERT OR IGNORE INTO tags (file_id, tag, tag_type) + VALUES (?, ?, 'user') + """, (file_id, title_tag)) + logger.debug(f"[get_or_create_file_entry] Auto-created title tag for file_id {file_id}") + + self.connection.commit() + logger.debug(f"[get_or_create_file_entry] Committed file entry {file_id}") + return file_id + except Exception as e: + logger.error(f"[get_or_create_file_entry] ❌ Error getting/creating file entry for {file_path}: {e}", exc_info=True) + raise + + def get_metadata(self, file_path: Path) -> Optional[Dict[str, Any]]: + """Get metadata for a file.""" + try: + str_path = str(file_path.resolve()) + cursor = self.connection.cursor() + + cursor.execute(""" + SELECT m.* FROM metadata m + JOIN files f ON m.file_id = f.id + WHERE f.file_path = ? + """, (str_path,)) + + row = cursor.fetchone() + if not row: + return None + + metadata = dict(row) + + # Parse JSON fields + for field in ['known_urls', 'relationships']: + if metadata.get(field): + try: + metadata[field] = json.loads(metadata[field]) + except (json.JSONDecodeError, TypeError): + metadata[field] = [] if field == 'known_urls' else [] + + return metadata + except Exception as e: + logger.error(f"Error getting metadata for {file_path}: {e}", exc_info=True) + return None + + def save_metadata(self, file_path: Path, metadata: Dict[str, Any]) -> None: + """Save metadata for a file.""" + try: + str_path = str(file_path.resolve()) + logger.debug(f"[save_metadata] Starting save for: {str_path}") + + file_id = self.get_or_create_file_entry(file_path) + logger.debug(f"[save_metadata] Got/created file_id: {file_id}") + + cursor = self.connection.cursor() + + known_urls = metadata.get('known_urls', []) + if not isinstance(known_urls, str): + known_urls = json.dumps(known_urls) + + relationships = metadata.get('relationships', []) + if not isinstance(relationships, str): + relationships = json.dumps(relationships) + + cursor.execute(""" + INSERT INTO metadata ( + file_id, hash, known_urls, relationships, + duration, size, ext, media_type, media_kind, + time_imported, time_modified + ) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP) + ON CONFLICT(file_id) DO UPDATE SET + hash = excluded.hash, + known_urls = excluded.known_urls, + relationships = excluded.relationships, + duration = excluded.duration, + size = excluded.size, + ext = excluded.ext, + media_type = excluded.media_type, + media_kind = excluded.media_kind, + time_modified = CURRENT_TIMESTAMP, + updated_at = CURRENT_TIMESTAMP + """, ( + file_id, metadata.get('hash'), known_urls, relationships, + metadata.get('duration'), metadata.get('size'), metadata.get('ext'), + metadata.get('media_type'), metadata.get('media_kind') + )) + + self.connection.commit() + logger.debug(f"[save_metadata] ✅ Committed metadata for file_id {file_id}") + except Exception as e: + logger.error(f"[save_metadata] ❌ Error saving metadata for {file_path}: {e}", exc_info=True) + raise + + def get_tags(self, file_path: Path) -> List[str]: + """Get all tags for a file.""" + try: + str_path = str(file_path.resolve()) + cursor = self.connection.cursor() + + 
cursor.execute(""" + SELECT t.tag FROM tags t + JOIN files f ON t.file_id = f.id + WHERE f.file_path = ? + ORDER BY t.tag + """, (str_path,)) + + return [row[0] for row in cursor.fetchall()] + except Exception as e: + logger.error(f"Error getting tags for {file_path}: {e}", exc_info=True) + return [] + + def save_tags(self, file_path: Path, tags: List[str]) -> None: + """Save tags for a file, replacing all existing tags.""" + try: + str_path = str(file_path.resolve()) + logger.debug(f"[save_tags] Starting save for: {str_path}") + + file_id = self.get_or_create_file_entry(file_path) + logger.debug(f"[save_tags] Got/created file_id: {file_id}") + + cursor = self.connection.cursor() + + cursor.execute(""" + SELECT tag FROM tags WHERE file_id = ? AND tag LIKE 'title:%' + """, (file_id,)) + existing_title = cursor.fetchone() + + cursor.execute("DELETE FROM tags WHERE file_id = ?", (file_id,)) + logger.debug(f"[save_tags] Deleted existing tags for file_id {file_id}") + + if existing_title: + cursor.execute(""" + INSERT INTO tags (file_id, tag, tag_type) VALUES (?, ?, 'user') + """, (file_id, existing_title[0])) + logger.debug(f"[save_tags] Preserved existing title tag") + else: + filename_without_ext = file_path.stem + if filename_without_ext: + # Normalize underscores to spaces for consistency + title_value = filename_without_ext.replace("_", " ").strip() + title_tag = f"title:{title_value}" + cursor.execute(""" + INSERT INTO tags (file_id, tag, tag_type) VALUES (?, ?, 'user') + """, (file_id, title_tag)) + logger.debug(f"[save_tags] Created auto-title tag: {title_tag}") + + for tag in tags: + tag = tag.strip() + if tag: + cursor.execute(""" + INSERT OR IGNORE INTO tags (file_id, tag, tag_type) + VALUES (?, ?, 'user') + """, (file_id, tag)) + + self.connection.commit() + logger.debug(f"[save_tags] ✅ Committed {len(tags)} tags for file_id {file_id}") + + # Verify they were actually saved + cursor.execute("SELECT COUNT(*) FROM tags WHERE file_id = ?", (file_id,)) + saved_count = cursor.fetchone()[0] + logger.debug(f"[save_tags] Verified: {saved_count} tags in database for file_id {file_id}") + + self._update_metadata_modified_time(file_id) + except Exception as e: + logger.error(f"[save_tags] ❌ Error saving tags for {file_path}: {e}", exc_info=True) + raise + + def add_tags(self, file_path: Path, tags: List[str]) -> None: + """Add tags to a file.""" + try: + file_id = self.get_or_create_file_entry(file_path) + cursor = self.connection.cursor() + + user_title_tag = next((tag.strip() for tag in tags + if tag.strip().lower().startswith('title:')), None) + + if user_title_tag: + cursor.execute(""" + DELETE FROM tags WHERE file_id = ? AND tag LIKE 'title:%' + """, (file_id,)) + else: + cursor.execute(""" + SELECT COUNT(*) FROM tags WHERE file_id = ? 
AND tag LIKE 'title:%' + """, (file_id,)) + + has_title = cursor.fetchone()[0] > 0 + if not has_title: + filename_without_ext = file_path.stem + if filename_without_ext: + # Normalize underscores to spaces for consistency + title_value = filename_without_ext.replace("_", " ").strip() + title_tag = f"title:{title_value}" + cursor.execute(""" + INSERT OR IGNORE INTO tags (file_id, tag, tag_type) + VALUES (?, ?, 'user') + """, (file_id, title_tag)) + + for tag in tags: + tag = tag.strip() + if tag: + cursor.execute(""" + INSERT OR IGNORE INTO tags (file_id, tag, tag_type) + VALUES (?, ?, 'user') + """, (file_id, tag)) + + self.connection.commit() + self._update_metadata_modified_time(file_id) + logger.debug(f"Added {len(tags)} tags for {file_path}") + except Exception as e: + logger.error(f"Error adding tags for {file_path}: {e}", exc_info=True) + raise + + def remove_tags(self, file_path: Path, tags: List[str]) -> None: + """Remove specific tags from a file.""" + try: + str_path = str(file_path.resolve()) + cursor = self.connection.cursor() + + for tag in tags: + tag = tag.strip() + if tag: + cursor.execute(""" + DELETE FROM tags + WHERE file_id = (SELECT id FROM files WHERE file_path = ?) + AND tag = ? + """, (str_path, tag)) + + self.connection.commit() + logger.debug(f"Removed {len(tags)} tags for {file_path}") + except Exception as e: + logger.error(f"Error removing tags for {file_path}: {e}", exc_info=True) + raise + + def get_note(self, file_path: Path) -> Optional[str]: + """Get note for a file.""" + try: + str_path = str(file_path.resolve()) + cursor = self.connection.cursor() + + cursor.execute(""" + SELECT n.note FROM notes n + JOIN files f ON n.file_id = f.id + WHERE f.file_path = ? + """, (str_path,)) + + row = cursor.fetchone() + return row[0] if row else None + except Exception as e: + logger.error(f"Error getting note for {file_path}: {e}", exc_info=True) + return None + + def save_note(self, file_path: Path, note: str) -> None: + """Save note for a file.""" + try: + file_id = self.get_or_create_file_entry(file_path) + cursor = self.connection.cursor() + + cursor.execute(""" + INSERT INTO notes (file_id, note) + VALUES (?, ?) + ON CONFLICT(file_id) DO UPDATE SET + note = excluded.note, + updated_at = CURRENT_TIMESTAMP + """, (file_id, note)) + + self.connection.commit() + logger.debug(f"Saved note for {file_path}") + except Exception as e: + logger.error(f"Error saving note for {file_path}: {e}", exc_info=True) + raise + + def search_by_tag(self, tag: str, limit: int = 100) -> List[Path]: + """Search for files with a specific tag.""" + try: + cursor = self.connection.cursor() + + cursor.execute(""" + SELECT DISTINCT f.file_path FROM files f + JOIN tags t ON f.id = t.file_id + WHERE t.tag = ? + LIMIT ? + """, (tag, limit)) + + return [Path(row[0]) for row in cursor.fetchall()] + except Exception as e: + logger.error(f"Error searching by tag '{tag}': {e}", exc_info=True) + return [] + + def search_by_hash(self, file_hash: str) -> Optional[Path]: + """Search for a file by hash.""" + try: + cursor = self.connection.cursor() + + cursor.execute(""" + SELECT file_path FROM files WHERE file_hash = ? 
+ """, (file_hash,)) + + row = cursor.fetchone() + return Path(row[0]) if row else None + except Exception as e: + logger.error(f"Error searching by hash '{file_hash}': {e}", exc_info=True) + return None + + def update_file_hash(self, file_path: Path, file_hash: str) -> None: + """Update the file hash.""" + try: + str_path = str(file_path.resolve()) + cursor = self.connection.cursor() + + cursor.execute(""" + UPDATE files SET file_hash = ?, updated_at = CURRENT_TIMESTAMP + WHERE file_path = ? + """, (file_hash, str_path)) + + self.connection.commit() + logger.debug(f"Updated hash for {file_path}") + except Exception as e: + logger.error(f"Error updating file hash for {file_path}: {e}", exc_info=True) + raise + + def rename_file(self, old_path: Path, new_path: Path) -> None: + """Rename a file in the database, preserving all metadata.""" + try: + str_old_path = str(old_path.resolve()) + str_new_path = str(new_path.resolve()) + cursor = self.connection.cursor() + + cursor.execute(""" + UPDATE files SET file_path = ?, updated_at = CURRENT_TIMESTAMP + WHERE file_path = ? + """, (str_new_path, str_old_path)) + + self.connection.commit() + logger.debug(f"Renamed file in database: {old_path} → {new_path}") + except Exception as e: + logger.error(f"Error renaming file from {old_path} to {new_path}: {e}", exc_info=True) + raise + + def cleanup_missing_files(self) -> int: + """Remove entries for files that no longer exist.""" + try: + cursor = self.connection.cursor() + cursor.execute("SELECT id, file_path FROM files") + + removed_count = 0 + for row_id, file_path in cursor.fetchall(): + if not Path(file_path).exists(): + cursor.execute("DELETE FROM files WHERE id = ?", (row_id,)) + removed_count += 1 + + self.connection.commit() + logger.info(f"Cleaned up {removed_count} missing file entries") + return removed_count + except Exception as e: + logger.error(f"Error cleaning up missing files: {e}", exc_info=True) + return 0 + + # ======================================================================== + # WORKER MANAGEMENT + # ======================================================================== + + def insert_worker(self, worker_id: str, worker_type: str, title: str = "", + description: str = "", total_steps: int = 0, + pipe: Optional[str] = None) -> int: + """Insert a new worker entry into the database.""" + try: + cursor = self.connection.cursor() + cursor.execute(""" + INSERT INTO worker (worker_id, worker_type, pipe, status, title, description, total_steps) + VALUES (?, ?, ?, ?, ?, ?, ?) + """, (worker_id, worker_type, pipe, 'running', title, description, total_steps)) + self.connection.commit() + return cursor.lastrowid or 0 + except sqlite3.IntegrityError: + return self.update_worker_status(worker_id, 'running') + except Exception as e: + logger.error(f"Error inserting worker: {e}", exc_info=True) + return 0 + + def update_worker(self, worker_id: str, **kwargs) -> bool: + """Update worker entry with given fields.""" + try: + allowed_fields = { + 'status', 'progress', 'current_step', 'error_message', 'result_data', + 'title', 'description', 'completed_at', 'total_steps', 'pipe', + 'started_at', 'last_stdout_at' + } + update_fields = {k: v for k, v in kwargs.items() if k in allowed_fields} + + if not update_fields: + return True + + update_fields['last_updated'] = datetime.now().isoformat() + cursor = self.connection.cursor() + set_clause = ", ".join(f"{k} = ?" 
for k in update_fields.keys()) + values = list(update_fields.values()) + [worker_id] + + cursor.execute(f""" + UPDATE worker SET {set_clause} WHERE worker_id = ? + """, values) + + self.connection.commit() + return cursor.rowcount > 0 + except Exception as e: + logger.error(f"Error updating worker {worker_id}: {e}", exc_info=True) + return False + + def update_worker_status(self, worker_id: str, status: str) -> int: + """Update worker status and return its database ID.""" + try: + cursor = self.connection.cursor() + + if status in ('completed', 'error'): + cursor.execute(""" + UPDATE worker + SET status = ?, completed_at = CURRENT_TIMESTAMP, last_updated = CURRENT_TIMESTAMP + WHERE worker_id = ? + """, (status, worker_id)) + else: + cursor.execute(""" + UPDATE worker + SET status = ?, last_updated = CURRENT_TIMESTAMP + WHERE worker_id = ? + """, (status, worker_id)) + + self.connection.commit() + + cursor.execute("SELECT id FROM worker WHERE worker_id = ?", (worker_id,)) + row = cursor.fetchone() + return row[0] if row else 0 + except Exception as e: + logger.error(f"Error updating worker status: {e}", exc_info=True) + return 0 + + def get_worker(self, worker_id: str) -> Optional[Dict[str, Any]]: + """Retrieve a worker entry by ID.""" + try: + cursor = self.connection.cursor() + cursor.execute("SELECT * FROM worker WHERE worker_id = ?", (worker_id,)) + row = cursor.fetchone() + return dict(row) if row else None + except Exception as e: + logger.error(f"Error retrieving worker: {e}", exc_info=True) + return None + + def get_active_workers(self) -> List[Dict[str, Any]]: + """Get all active (running) workers.""" + try: + cursor = self.connection.cursor() + cursor.execute("SELECT * FROM worker WHERE status = 'running' ORDER BY started_at DESC") + return [dict(row) for row in cursor.fetchall()] + except Exception as e: + logger.error(f"Error retrieving active workers: {e}", exc_info=True) + return [] + + def get_all_workers(self, limit: int = 100) -> List[Dict[str, Any]]: + """Get all workers (recent first).""" + try: + cursor = self.connection.cursor() + cursor.execute(""" + SELECT * FROM worker ORDER BY started_at DESC LIMIT ? + """, (limit,)) + return [dict(row) for row in cursor.fetchall()] + except Exception as e: + logger.error(f"Error retrieving all workers: {e}", exc_info=True) + return [] + + def delete_worker(self, worker_id: str) -> bool: + """Delete a worker entry.""" + try: + cursor = self.connection.cursor() + cursor.execute("DELETE FROM worker WHERE worker_id = ?", (worker_id,)) + self.connection.commit() + return cursor.rowcount > 0 + except Exception as e: + logger.error(f"Error deleting worker: {e}", exc_info=True) + return False + + def cleanup_old_workers(self, days: int = 7) -> int: + """Clean up completed/errored workers older than specified days.""" + try: + cursor = self.connection.cursor() + cursor.execute(""" + DELETE FROM worker + WHERE status IN ('completed', 'error') + AND completed_at < datetime('now', '-' || ? || ' days') + """, (days,)) + self.connection.commit() + return cursor.rowcount + except Exception as e: + logger.error(f"Error cleaning up old workers: {e}", exc_info=True) + return 0 + + def expire_running_workers( + self, + older_than_seconds: int = 300, + status: str = "error", + reason: str | None = None, + worker_id_prefix: str | None = None, + ) -> int: + """Mark long-idle running workers as finished with the given status. + + Args: + older_than_seconds: Minimum idle time before expiring the worker. 
+ status: New status to apply (e.g., "error" or "cancelled"). + reason: Error message to set when none is present. + worker_id_prefix: Optional LIKE pattern (e.g., 'cli_%') to scope updates. + + Returns: + Number of workers updated. + """ + idle_seconds = max(1, int(older_than_seconds)) + cutoff = f"-{idle_seconds} seconds" + auto_reason = reason or "Worker stopped responding; auto-marked as error" + try: + cursor = self.connection.cursor() + if worker_id_prefix: + cursor.execute( + """ + UPDATE worker + SET status = ?, + error_message = CASE + WHEN IFNULL(TRIM(error_message), '') = '' THEN ? + ELSE error_message + END, + completed_at = COALESCE(completed_at, CURRENT_TIMESTAMP), + last_updated = CURRENT_TIMESTAMP + WHERE status = 'running' + AND worker_id LIKE ? + AND COALESCE(last_updated, started_at, created_at) < datetime('now', ?) + """, + (status, auto_reason, worker_id_prefix, cutoff), + ) + else: + cursor.execute( + """ + UPDATE worker + SET status = ?, + error_message = CASE + WHEN IFNULL(TRIM(error_message), '') = '' THEN ? + ELSE error_message + END, + completed_at = COALESCE(completed_at, CURRENT_TIMESTAMP), + last_updated = CURRENT_TIMESTAMP + WHERE status = 'running' + AND COALESCE(last_updated, started_at, created_at) < datetime('now', ?) + """, + (status, auto_reason, cutoff), + ) + self.connection.commit() + return cursor.rowcount + except Exception as exc: + logger.error(f"Error expiring stale workers: {exc}", exc_info=True) + return 0 + + def append_worker_stdout(self, worker_id: str, text: str, step: Optional[str] = None, + channel: str = "stdout") -> bool: + """Append text to a worker's stdout log and timeline.""" + if not text: + return True + try: + cursor = self.connection.cursor() + cursor.execute("SELECT stdout FROM worker WHERE worker_id = ?", (worker_id,)) + row = cursor.fetchone() + + if not row: + logger.warning(f"Worker {worker_id} not found for stdout append") + return False + + current_stdout = row[0] or "" + separator = "" if not current_stdout else ("" if current_stdout.endswith("\n") else "\n") + new_stdout = f"{current_stdout}{separator}{text}\n" + + cursor.execute(""" + UPDATE worker SET stdout = ?, last_updated = CURRENT_TIMESTAMP, + last_stdout_at = CURRENT_TIMESTAMP + WHERE worker_id = ? 
+ """, (new_stdout, worker_id)) + self._insert_worker_log_entry(cursor, worker_id, 'stdout', text, step, channel) + + self.connection.commit() + return cursor.rowcount > 0 + except Exception as e: + logger.error(f"Error appending stdout to worker {worker_id}: {e}", exc_info=True) + return False + + def get_worker_stdout(self, worker_id: str) -> str: + """Get stdout logs for a worker.""" + try: + cursor = self.connection.cursor() + cursor.execute("SELECT stdout FROM worker WHERE worker_id = ?", (worker_id,)) + row = cursor.fetchone() + return row[0] if row and row[0] else "" + except Exception as e: + logger.error(f"Error getting worker stdout for {worker_id}: {e}", exc_info=True) + return "" + + def append_worker_steps(self, worker_id: str, step_text: str) -> bool: + """Append a step to a worker's step log and timeline.""" + if not step_text: + return True + try: + cursor = self.connection.cursor() + cursor.execute("SELECT steps FROM worker WHERE worker_id = ?", (worker_id,)) + row = cursor.fetchone() + + if not row: + logger.warning(f"Worker {worker_id} not found for steps append") + return False + + current_steps = row[0] or "" + timestamp = datetime.now().strftime('%H:%M:%S') + step_entry = f"[{timestamp}] {step_text}" + new_steps = (current_steps + "\n" if current_steps else "") + step_entry + + cursor.execute(""" + UPDATE worker SET steps = ?, last_updated = CURRENT_TIMESTAMP, + current_step = ? + WHERE worker_id = ? + """, (new_steps, step_text, worker_id)) + self._insert_worker_log_entry(cursor, worker_id, 'step', step_text, step_text, 'step') + + self.connection.commit() + return cursor.rowcount > 0 + except Exception as e: + logger.error(f"Error appending step to worker {worker_id}: {e}", exc_info=True) + return False + + def get_worker_steps(self, worker_id: str) -> str: + """Get step logs for a worker.""" + try: + cursor = self.connection.cursor() + cursor.execute("SELECT steps FROM worker WHERE worker_id = ?", (worker_id,)) + row = cursor.fetchone() + return row[0] if row and row[0] else "" + except Exception as e: + logger.error(f"Error getting worker steps for {worker_id}: {e}", exc_info=True) + return "" + + def clear_worker_stdout(self, worker_id: str) -> bool: + """Clear stdout logs for a worker.""" + try: + cursor = self.connection.cursor() + cursor.execute(""" + UPDATE worker SET stdout = '', last_updated = CURRENT_TIMESTAMP + WHERE worker_id = ? 
+ """, (worker_id,)) + self.clear_worker_events(worker_id, event_type='stdout') + self.connection.commit() + return cursor.rowcount > 0 + except Exception as e: + logger.error(f"Error clearing worker stdout: {e}", exc_info=True) + return False + + def clear_finished_workers(self) -> int: + """Delete all workers that are not currently running.""" + try: + cursor = self.connection.cursor() + cursor.execute("DELETE FROM worker WHERE status != 'running'") + self.connection.commit() + return cursor.rowcount + except Exception as e: + logger.error(f"Error clearing finished workers: {e}", exc_info=True) + return 0 + + def close(self) -> None: + """Close the database connection.""" + try: + if self.connection: + self.connection.close() + logger.info("Database connection closed") + except Exception as e: + logger.error(f"Error closing database: {e}", exc_info=True) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + + +# ============================================================================ +# LIBRARY INITIALIZATION & MIGRATION +# ============================================================================ + +class LocalLibraryInitializer: + """Initialize and synchronize local library database.""" + + def __init__(self, library_root: Path): + """Initialize the database scanner.""" + self.library_root = Path(library_root) + self.db = LocalLibraryDB(library_root) + self.stats = { + 'files_scanned': 0, 'files_new': 0, 'files_existing': 0, + 'sidecars_imported': 0, 'sidecars_deleted': 0, + 'tags_imported': 0, 'metadata_imported': 0, 'errors': 0 + } + + def scan_and_index(self) -> Dict[str, int]: + """Scan library folder and populate database with file entries.""" + try: + logger.info(f"Starting library scan at {self.library_root}") + + media_files = self._find_media_files() + logger.info(f"Found {len(media_files)} media files") + + db_files = self._get_database_files() + logger.info(f"Found {len(db_files)} files in database") + + for file_path in media_files: + self._process_file(file_path, db_files) + + self.db.connection.commit() + self._import_sidecars_batch() + self.db.connection.commit() + self._cleanup_orphaned_sidecars() + self.db.connection.commit() + + logger.info(f"Library scan complete. 
Stats: {self.stats}") + return self.stats + except Exception as e: + logger.error(f"Error during library scan: {e}", exc_info=True) + self.stats['errors'] += 1 + raise + finally: + self.db.close() + + def _find_media_files(self) -> List[Path]: + """Find all media files in the library folder.""" + media_files = [] + try: + for file_path in self.library_root.rglob("*"): + if file_path.is_file() and file_path.suffix.lower() in MEDIA_EXTENSIONS: + media_files.append(file_path) + except Exception as e: + logger.error(f"Error scanning media files: {e}", exc_info=True) + + return sorted(media_files) + + def _get_database_files(self) -> Dict[str, int]: + """Get existing files from database by normalized path.""" + try: + cursor = self.db.connection.cursor() + cursor.execute("SELECT id, file_path FROM files") + + result = {} + for file_id, file_path in cursor.fetchall(): + normalized = str(Path(file_path).resolve()).lower() + result[normalized] = file_id + + return result + except Exception as e: + logger.error(f"Error getting database files: {e}", exc_info=True) + return {} + + def _process_file(self, file_path: Path, db_files: Dict[str, int]) -> None: + """Process a single media file.""" + try: + normalized = str(file_path.resolve()).lower() + + if normalized in db_files: + self.stats['files_existing'] += 1 + else: + self.db.get_or_create_file_entry(file_path) + self.stats['files_new'] += 1 + + self.stats['files_scanned'] += 1 + except Exception as e: + logger.warning(f"Error processing file {file_path}: {e}") + self.stats['errors'] += 1 + + def _import_sidecars_batch(self) -> None: + """Batch import all sidecar files.""" + try: + for sidecar_path in self.library_root.rglob("*.tags"): + try: + base_path = Path(str(sidecar_path)[:-len('.tags')]) + if not base_path.exists(): + continue + + hash_val, tags, urls = read_sidecar(sidecar_path) + + if hash_val or tags or urls: + if hash_val: + self.db.update_file_hash(base_path, hash_val) + if tags: + self.db.save_tags(base_path, tags) + if urls: + self.db.save_metadata(base_path, {'known_urls': urls}) + + self.stats['sidecars_imported'] += 1 + except Exception as e: + logger.warning(f"Error importing sidecar {sidecar_path}: {e}") + self.stats['errors'] += 1 + except Exception as e: + logger.error(f"Error batch importing sidecars: {e}", exc_info=True) + + def _cleanup_orphaned_sidecars(self) -> None: + """Remove sidecars for non-existent files.""" + try: + for sidecar_path in self.library_root.rglob("*.tags"): + base_path = Path(str(sidecar_path)[:-len('.tags')]) + if not base_path.exists(): + try: + sidecar_path.unlink() + self.stats['sidecars_deleted'] += 1 + except Exception as e: + logger.warning(f"Could not delete orphaned sidecar {sidecar_path}: {e}") + except Exception as e: + logger.error(f"Error cleaning up orphaned sidecars: {e}", exc_info=True) + + +def migrate_tags_to_db(library_root: Path, db: LocalLibraryDB) -> int: + """Migrate .tags files to the database.""" + migrated_count = 0 + + try: + for tags_file in library_root.rglob("*.tags"): + try: + base_path = Path(str(tags_file)[:-len('.tags')]) + tags_text = tags_file.read_text(encoding='utf-8') + tags = [line.strip() for line in tags_text.splitlines() if line.strip()] + + db.save_tags(base_path, tags) + migrated_count += 1 + + try: + tags_file.unlink() + logger.info(f"Migrated and deleted {tags_file}") + except Exception as e: + logger.warning(f"Migrated {tags_file} but failed to delete: {e}") + except Exception as e: + logger.warning(f"Failed to migrate {tags_file}: {e}") + + 
logger.info(f"Migrated {migrated_count} .tags files to database") + return migrated_count + except Exception as e: + logger.error(f"Error during tags migration: {e}", exc_info=True) + return migrated_count + + +def migrate_metadata_to_db(library_root: Path, db: LocalLibraryDB) -> int: + """Migrate .metadata files to the database.""" + migrated_count = 0 + + try: + for metadata_file in library_root.rglob("*.metadata"): + try: + base_path = Path(str(metadata_file)[:-len('.metadata')]) + metadata_text = metadata_file.read_text(encoding='utf-8') + metadata = _parse_metadata_file(metadata_text) + + db.save_metadata(base_path, metadata) + migrated_count += 1 + + try: + metadata_file.unlink() + logger.info(f"Migrated and deleted {metadata_file}") + except Exception as e: + logger.warning(f"Migrated {metadata_file} but failed to delete: {e}") + except Exception as e: + logger.warning(f"Failed to migrate {metadata_file}: {e}") + + logger.info(f"Migrated {migrated_count} .metadata files to database") + return migrated_count + except Exception as e: + logger.error(f"Error during metadata migration: {e}", exc_info=True) + return migrated_count + + +def _parse_metadata_file(content: str) -> Dict[str, Any]: + """Parse metadata file content.""" + try: + return json.loads(content) + except json.JSONDecodeError: + logger.warning("Could not parse metadata JSON, returning empty dict") + return {} + + +def migrate_all(library_root: Path, db: Optional[LocalLibraryDB] = None) -> Dict[str, int]: + """Migrate all sidecar files to database.""" + should_close = db is None + + try: + if db is None: + db = LocalLibraryDB(library_root) + + return { + 'tags': migrate_tags_to_db(library_root, db), + 'metadata': migrate_metadata_to_db(library_root, db), + } + finally: + if should_close: + db.close() + + +# ============================================================================ +# SEARCH OPTIMIZATION +# ============================================================================ + +class LocalLibrarySearchOptimizer: + """Optimizer that uses database for local library searches.""" + + def __init__(self, library_root: Path): + """Initialize the search optimizer.""" + self.library_root = Path(library_root) + self.db: Optional[LocalLibraryDB] = None + + def __enter__(self): + """Context manager entry.""" + self.db = LocalLibraryDB(self.library_root) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit.""" + if self.db: + self.db.close() + + def get_cached_tags(self, file_path: Path) -> List[str]: + """Get tags from database cache.""" + if not self.db: + return [] + return self.db.get_tags(file_path) + + def get_cached_metadata(self, file_path: Path) -> Optional[Dict[str, Any]]: + """Get metadata from database cache.""" + if not self.db: + return None + return self.db.get_metadata(file_path) + + def prefetch_metadata(self, file_paths: List[Path]) -> None: + """Pre-cache metadata for multiple files.""" + if not self.db: + return + + for file_path in file_paths: + try: + self.db.get_or_create_file_entry(file_path) + except Exception as e: + logger.warning(f"Failed to prefetch {file_path}: {e}") + + def update_search_result_with_cached_data(self, search_result: Any, file_path: Path) -> None: + """Update a search result object with cached database data.""" + if not self.db: + return + + try: + tags = self.db.get_tags(file_path) + if tags: + search_result.tag_summary = ", ".join(tags) + + metadata = self.db.get_metadata(file_path) + if metadata: + if 'hash' in metadata: + 
search_result.hash_hex = metadata['hash'] + if 'duration' in metadata: + search_result.duration_seconds = metadata['duration'] + if 'media_kind' in metadata: + search_result.media_kind = metadata['media_kind'] + except Exception as e: + logger.warning(f"Failed to update search result for {file_path}: {e}") + + def search_by_tag(self, tag: str, limit: int = 100) -> List[Path]: + """Fast tag-based search using database.""" + if not self.db: + return [] + return self.db.search_by_tag(tag, limit) + + def search_by_hash(self, file_hash: str) -> Optional[Path]: + """Fast hash-based search using database.""" + if not self.db: + return None + return self.db.search_by_hash(file_hash) diff --git a/helper/logger.py b/helper/logger.py new file mode 100644 index 0000000..809ec4f --- /dev/null +++ b/helper/logger.py @@ -0,0 +1,70 @@ +"""Unified logging utility for automatic file and function name tracking.""" + +import sys +import inspect +from pathlib import Path + +_DEBUG_ENABLED = False + +def set_debug(enabled: bool) -> None: + """Enable or disable debug logging.""" + global _DEBUG_ENABLED + _DEBUG_ENABLED = enabled + +def debug(*args, **kwargs) -> None: + """Print debug message if debug logging is enabled. + + Automatically prepends [filename.function_name] to all output. + """ + if not _DEBUG_ENABLED: + return + + # Set default to stderr for debug messages + if 'file' not in kwargs: + kwargs['file'] = sys.stderr + + # Prepend DEBUG label + args = ("DEBUG:", *args) + + # Use the same logic as log() + log(*args, **kwargs) + +def log(*args, **kwargs) -> None: + """Print with automatic file.function prefix. + + Automatically prepends [filename.function_name] to all output. + Defaults to stdout if not specified. + + Example: + log("Upload started") # Output: [add_file.run] Upload started + """ + # Get the calling frame + frame = inspect.currentframe() + if frame is None: + print(*args, **kwargs) + return + + caller_frame = frame.f_back + if caller_frame is None: + print(*args, **kwargs) + return + + try: + # Get file name without extension + file_name = Path(caller_frame.f_code.co_filename).stem + + # Get function name + func_name = caller_frame.f_code.co_name + + # Set default to stdout if not specified + if 'file' not in kwargs: + kwargs['file'] = sys.stdout + + # Build prefix + prefix = f"[{file_name}.{func_name}]" + + # Print with prefix + print(prefix, *args, **kwargs) + finally: + del frame + del caller_frame diff --git a/helper/mpv_file.py b/helper/mpv_file.py new file mode 100644 index 0000000..6a014f3 --- /dev/null +++ b/helper/mpv_file.py @@ -0,0 +1,951 @@ +"""MPV file metadata aggregation helpers.""" +from __future__ import annotations + +import os +import re +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional, Sequence +from urllib.parse import parse_qs, urlparse, unquote + +from config import get_hydrus_url +from helper.utils import sha256_file, unique_preserve_order +from helper.hydrus import HydrusClient, HydrusRequestError + +import metadata + + +class MPVFileError(RuntimeError): + """Raised when we cannot construct an MPV file snapshot.""" + + +@dataclass(slots=True) +class DebridMagnet: + """Represents a magnet result from AllDebrid search. + + This class matches the structure expected by the TUI (like Hydrus results) + with title, target, media_kind attributes for compatibility. 
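+
+    Example (illustrative values only, not real AllDebrid API output):
+
+        magnet = DebridMagnet(
+            magnet_id="12345",
+            title="Some Title",
+            size=1_500_000_000,
+            status_code=4,
+            status_text="Ready",
+            progress=100.0,
+            downloaded=1_500_000_000,
+            seeders=12,
+            dl_speed=0,
+        )
+        magnet.target      # -> "alldebrid://12345"
+        magnet.media_kind  # -> "magnet"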
+ """ + magnet_id: str + title: str + size: int + status_code: int + status_text: str + progress: float + downloaded: int + seeders: int + dl_speed: int + tag_summary: Optional[str] = None + metadata: Optional[Dict[str, Any]] = None # Complete magnet file metadata from AllDebrid API + + @property + def target(self) -> str: + """Return the target URI for this magnet (used by TUI for access operations).""" + return f"alldebrid://{self.magnet_id}" + + @property + def media_kind(self) -> str: + """Return media kind for display.""" + return "magnet" + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for metadata display.""" + return { + "magnet_id": self.magnet_id, + "title": self.title, + "size": self.size, + "status_code": self.status_code, + "status_text": self.status_text, + "progress": f"{self.progress:.1f}%", + "downloaded": self.downloaded, + "seeders": self.seeders, + "dl_speed": self.dl_speed, + } + + +@dataclass(slots=True) +class HydrusSettings: + base_url: Optional[str] + access_key: Optional[str] + timeout: float + prefer_service_name: Optional[str] + include_relationships: bool + + def as_metadata_options(self) -> Dict[str, Any]: + options: Dict[str, Any] = { + "timeout": self.timeout, + "include_relationships": self.include_relationships, + } + if self.prefer_service_name: + options["prefer_service_name"] = self.prefer_service_name + return options + + + +@dataclass(slots=True) +class MPVfile: + path: Optional[str] = None + filename: Optional[str] = None + type: str = "unknown" + hash: Optional[str] = None + local_path: Optional[str] = None + mpv_metadata: Dict[str, Any] = field(default_factory=dict) + metadata: Dict[str, Any] = field(default_factory=dict) + remote_metadata: Optional[Dict[str, Any]] = None + relationships: Optional[Dict[str, Any]] = None + relationship_metadata: Dict[str, Any] = field(default_factory=dict) + tags: List[str] = field(default_factory=list) + original_tags: Dict[str, str] = field(default_factory=dict) + known_urls: List[str] = field(default_factory=list) + title: Optional[str] = None + source_url: Optional[str] = None + clip_time: Optional[str] = None + duration: Optional[float] = None + filesize_mb: Optional[float] = None + is_video: bool = False + is_audio: bool = False + is_deleted: Optional[bool] = None + is_local: Optional[bool] = None + has_current_file_service: Optional[bool] = None + tag_service_key: Optional[str] = None + swap_recommended: bool = False + warnings: List[str] = field(default_factory=list) + # New relationship fields for menu + king: Optional[str] = None + alts: List[str] = field(default_factory=list) + + def to_dict(self) -> Dict[str, Any]: + payload: Dict[str, Any] = { + "path": self.path, + "filename": self.filename, + "type": self.type, + "hash": self.hash, + "local_path": self.local_path, + "mpv_metadata": self.mpv_metadata, + "metadata": self.metadata, + "remote_metadata": self.remote_metadata, + "relationships": self.relationships, + "relationship_metadata": self.relationship_metadata, + "tags": self.tags, + "original_tags": self.original_tags, + "known_urls": self.known_urls, + "title": self.title, + "source_url": self.source_url, + "clip_time": self.clip_time, + "duration": self.duration, + "filesize_mb": self.filesize_mb, + "is_video": self.is_video, + "is_audio": self.is_audio, + "is_deleted": self.is_deleted, + "is_local": self.is_local, + "has_current_file_service": self.has_current_file_service, + "tag_service_key": self.tag_service_key, + "swap_recommended": self.swap_recommended, + "warnings": 
self.warnings, + # relationship summary fields for easier Lua consumption + "king": self.king, + "alts": self.alts, + } + # Remove empty optional values for terser payloads. + for key in list(payload.keys()): + value = payload[key] + if value in (None, [], {}, ""): + del payload[key] + return payload + + +def _normalise_string_list(values: Optional[Iterable[Any]]) -> List[str]: + if not values: + return [] + seen: set[str] = set() + result: List[str] = [] + for value in values: + if value is None: + continue + text = str(value).strip() + if not text or text in seen: + continue + seen.add(text) + result.append(text) + return result + + +def _looks_like_hash(value: Optional[str]) -> bool: + if not value: + return False + candidate = value.strip().lower() + return len(candidate) == 64 and all(ch in "0123456789abcdef" for ch in candidate) + + +class MPVFileBuilder: + def __init__(self, payload: Dict[str, Any], config: Dict[str, Any]): + self.payload = payload or {} + self.config = config or {} + self.state = MPVfile() + self.hydrus_settings = self._resolve_hydrus_settings() + self.remote_options = self._resolve_remote_options() + self.include_relationships = bool(self.payload.get("include_relationships", True)) + self.last_url = self._normalise_url(self.payload.get("last_url")) + self._initialise_identity() + + # ------------------------------------------------------------------ + # public API + # ------------------------------------------------------------------ + + def build(self) -> Dict[str, Any]: + if self.state.type == "hydrus": + self._populate_hydrus_by_hash() + elif self.state.type == "local": + self._populate_local() + elif self.state.type == "remote": + self._populate_remote() + else: + # Attempt best effort resolution even for unknown types. 
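            # ("Best effort" means: resolve a local path and its sidecar tags/hash when
            # possible, then fall back to a Hydrus lookup by source URL if the Hydrus
            # API is configured; see _populate_local below.)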
+ self._populate_local(best_effort=True) + self._finalise() + result = self.state.to_dict() + # Append King and Alts info to mpv_metadata for info menu + king = self.state.king + alts = self.state.alts + if king: + result.setdefault("mpv_metadata", {})["King"] = king + if alts: + result.setdefault("mpv_metadata", {})["Alts"] = ", ".join(alts) + return result + + # ------------------------------------------------------------------ + # configuration helpers + # ------------------------------------------------------------------ + + def _resolve_hydrus_settings(self) -> HydrusSettings: + overrides = self.payload.get("hydrus") + overrides = overrides if isinstance(overrides, dict) else {} + base_url = overrides.get("url") or overrides.get("base_url") + access_key = overrides.get("access_key") + timeout_raw = overrides.get("timeout") or overrides.get("hydrus_timeout") + prefer_service = overrides.get("prefer_service_name") + include_relationships = overrides.get("include_relationships") + if base_url is None: + base_url = get_hydrus_url(self.config) + if access_key is None: + raw_key = self.config.get("HydrusNetwork_Access_Key") + access_key = str(raw_key) if raw_key is not None else None + if timeout_raw is None: + timeout_raw = self.config.get("HydrusNetwork_Request_Timeout") + try: + timeout = float(timeout_raw) if timeout_raw is not None else 60.0 + except (TypeError, ValueError): + timeout = 60.0 + if prefer_service is None: + prefer_service = self.config.get("Hydrus_Tag_Service") + if isinstance(prefer_service, str): + prefer_service = prefer_service.strip() or None + if include_relationships is None: + include_relationships = self.payload.get("include_relationships") + include_relationships = bool(True if include_relationships is None else include_relationships) + base_url = base_url.strip() if isinstance(base_url, str) else None + access_key = access_key.strip() if isinstance(access_key, str) else None + return HydrusSettings( + base_url=base_url or None, + access_key=access_key or None, + timeout=timeout, + prefer_service_name=prefer_service, + include_relationships=include_relationships, + ) + + def _resolve_remote_options(self) -> Dict[str, Any]: + remote_payload = self.payload.get("remote") + remote_payload = remote_payload if isinstance(remote_payload, dict) else {} + options = remote_payload.get("options") + options = options if isinstance(options, dict) else {} + ytdlp_args = options.get("ytdlp_args") + if not ytdlp_args: + options["ytdlp_args"] = ["--no-playlist", "--skip-download", "--no-warnings"] + existing_timeout = options.get("timeout") + if existing_timeout is None: + options["timeout"] = min(90.0, max(10.0, float(self.payload.get("remote_timeout") or 45.0))) + return options + + # ------------------------------------------------------------------ + # initialisation + # ------------------------------------------------------------------ + + def _initialise_identity(self) -> None: + s = self.state + p = self.payload + + def _str_or_none(v): + return str(v) if v is not None and v != "" else None + + def _copy_dict_if_dict(v): + return dict(v) if isinstance(v, dict) else {} + + # path and filename + s.path = _str_or_none(p.get("path")) + s.filename = _str_or_none(p.get("filename")) + + # mpv metadata + s.mpv_metadata = _copy_dict_if_dict(p.get("mpv_metadata")) + + # tags (support both "tags" and legacy "existing_tags") + existing_tags = p.get("tags") or p.get("existing_tags") + s.tags = _normalise_string_list(existing_tags) + if s.tags: + s.original_tags = {tag: tag for tag in 
s.tags} + + # known URLs + last_url + s.known_urls = _normalise_string_list(p.get("known_urls")) + if self.last_url and self.last_url not in s.known_urls: + s.known_urls.append(self.last_url) + + # source URL (explicit or fallback to last_url) + explicit_source = p.get("source_url") + s.source_url = self._normalise_url(explicit_source) or self.last_url + + # hash (validate looks-like-hash) + hash_candidate = p.get("hash") + if isinstance(hash_candidate, str): + candidate = hash_candidate.strip().lower() + if _looks_like_hash(candidate): + s.hash = candidate + + # local_path (non-empty string) + local_path_override = p.get("local_path") + if isinstance(local_path_override, str): + lp = local_path_override.strip() + if lp: + s.local_path = lp + + # derive remaining fields from path/filename/type + self._derive_filename_from_path() + self._determine_type() + + + def _derive_filename_from_path(self) -> None: + if self.state.filename or not self.state.path: + return + parsed = urlparse(self.state.path) + if parsed.scheme in ("http", "https", "ytdl") and parsed.path: + candidate = Path(parsed.path).name + if candidate: + self.state.filename = candidate + elif parsed.scheme == "file": + decoded = self._decode_file_url(self.state.path) + if decoded: + self.state.filename = Path(decoded).name + else: + try: + self.state.filename = Path(self.state.path).name + except Exception: + pass + + def _determine_type(self) -> None: + s = self.state + p = self.payload + + def _set_local_from_path(pth: str | None): + if not pth: + return + # Prefer resolved local path when available + resolved = self._resolve_local_path(pth) + s.local_path = resolved if resolved else pth + s.type = "local" + + # 1) Respect explicit type when valid + explicit = p.get("type") + if isinstance(explicit, str): + lowered = explicit.strip().lower() + if lowered in {"local", "hydrus", "remote"}: + s.type = lowered + if lowered == "local": + s.local_path = self._resolve_local_path(s.path) + return + + # 2) Work from path + path = s.path or "" + if not path: + s.type = "unknown" + return + + # 3) Hydrus-specific quick checks + if self._looks_like_hydrus_url(path): + s.type = "hydrus" + return + + parsed = urlparse(path) + scheme = (parsed.scheme or "").lower() + + # 4) scheme-based handling + if scheme == "hydrus": + s.type = "hydrus" + return + + if scheme in {"http", "https", "rtmp", "rtsp", "magnet", "ytdl"}: + s.type = "hydrus" if self._looks_like_hydrus_url(path) else "remote" + return + + if scheme == "file": + decoded = self._decode_file_url(path) + if decoded: + s.local_path = decoded + s.type = "local" + return + + # 5) Windows/UNC absolute paths + if re.match(r"^[A-Za-z]:[\\/]", path) or path.startswith(("\\\\", "//")): + s.type = "local" + s.local_path = path + return + + # 6) Fallback: if it looks like a URL with a scheme separator treat as remote/hydrus + if "://" in path: + s.type = "hydrus" if self._looks_like_hydrus_url(path) else "remote" + return + + # 7) Otherwise treat as a local path + _set_local_from_path(path) + + + # ------------------------------------------------------------------ + # population helpers + # ------------------------------------------------------------------ + + def _populate_local(self, best_effort: bool = False) -> None: + local_path = self.state.local_path or self._resolve_local_path(self.state.path) + if local_path: + self.state.local_path = local_path + self._load_sidecar_tags(local_path) + if not self.state.hash: + self._compute_local_hash(local_path) + # If Hydrus is configured and we have 
a hash, enrich from Hydrus; otherwise keep local tags only + if self.state.hash and self.hydrus_settings.base_url and self.hydrus_settings.access_key: + self._populate_hydrus_by_hash() + elif best_effort and self.hydrus_settings.base_url and self.state.source_url and self.hydrus_settings.access_key: + self._populate_hydrus_by_url(self.state.source_url) + + # (helpers for resolving local path and loading sidecars already exist below) + + def _populate_remote(self) -> None: + source_url = self.state.source_url or self.last_url or self.state.path + source_url = self._normalise_url(source_url) + if source_url: + self.state.source_url = source_url + remote_payload = { + "source_url": self.state.source_url, + "existing_tags": self.state.tags, + "metadata": self.payload.get("remote_metadata"), + "mpv_metadata": self.state.mpv_metadata, + "options": self.remote_options, + } + try: + remote_result = metadata.resolve_remote_metadata(remote_payload) + except Exception as exc: # pragma: no cover - surfaced to the caller + self.state.warnings.append(str(exc)) + remote_result = None + if remote_result: + tags = remote_result.get("tags") or [] + self._merge_tags(tags) + self.state.remote_metadata = remote_result.get("metadata") + self.state.title = remote_result.get("title") or self.state.title + self.state.duration = remote_result.get("duration") or self.state.duration + self.state.source_url = remote_result.get("source_url") or self.state.source_url + warnings = remote_result.get("warnings") or [] + if warnings: + self.state.warnings.extend(warnings) + if self.hydrus_settings.base_url and self.state.source_url: + self._populate_hydrus_by_url(self.state.source_url) + + def _populate_hydrus_by_hash(self) -> None: + hash_hex = self.state.hash or self._extract_hash_from_path(self.state.path) + if hash_hex and not _looks_like_hash(hash_hex): + hash_hex = None + if not hash_hex: + return + self.state.hash = hash_hex + if not self.hydrus_settings.base_url: + return + payload: Dict[str, Any] = { + "api_url": self.hydrus_settings.base_url, + "access_key": self.hydrus_settings.access_key or "", + "options": self.hydrus_settings.as_metadata_options(), + "hash": hash_hex, + } + try: + result = metadata.fetch_hydrus_metadata(payload) + except Exception as exc: # pragma: no cover - surfaced to caller + self.state.warnings.append(str(exc)) + return + self._apply_hydrus_result(result) + # Enrich relationships using the dedicated Hydrus endpoint (robust GET) + if self.include_relationships and self.state.hash and self.hydrus_settings.base_url: + self._enrich_relationships_from_api(self.state.hash) + + def _populate_hydrus_by_url(self, url: str) -> None: + if not self.hydrus_settings.base_url: + return + payload: Dict[str, Any] = { + "api_url": self.hydrus_settings.base_url, + "access_key": self.hydrus_settings.access_key or "", + "options": self.hydrus_settings.as_metadata_options(), + "url": url, + } + try: + result = metadata.fetch_hydrus_metadata_by_url(payload) + except Exception as exc: # pragma: no cover - surfaced to caller + self.state.warnings.append(str(exc)) + return + if result.get("error") == "not_found": + self.state.warnings.extend(result.get("warnings") or []) + return + self._apply_hydrus_result(result) + self.state.type = "hydrus" + matched_url = result.get("matched_url") or result.get("url") + if matched_url and matched_url not in self.state.known_urls: + self.state.known_urls.append(matched_url) + # Enrich relationships once we know the hash + if self.include_relationships and self.state.hash and 
self.hydrus_settings.base_url: + self._enrich_relationships_from_api(self.state.hash) + + # ------------------------------------------------------------------ + # state modification helpers + # ------------------------------------------------------------------ + + + def _apply_hydrus_result(self, result: Dict[str, Any]) -> None: + metadata_payload = result.get("metadata") + if isinstance(metadata_payload, dict): + # Process mime into type for Lua + mime = metadata_payload.get("mime") + if isinstance(mime, str): + if mime.startswith("video/"): + metadata_payload["type"] = "video" + elif mime.startswith("audio/"): + metadata_payload["type"] = "audio" + elif mime.startswith("image/"): + metadata_payload["type"] = "image" + else: + metadata_payload["type"] = "other" + self.state.metadata = metadata_payload + # Do NOT overwrite MPVfile.type with metadata.type + self._merge_known_urls(metadata_payload.get("known_urls") or metadata_payload.get("known_urls_set")) + source_url = metadata_payload.get("original_url") or metadata_payload.get("source_url") + if source_url and not self.state.source_url: + self.state.source_url = self._normalise_url(source_url) + # If file_relationships are embedded in metadata, capture as relationships when missing + if self.state.relationships is None: + embedded = metadata_payload.get("file_relationships") + if isinstance(embedded, dict) and embedded: + self.state.relationships = embedded + tags = result.get("tags") or [] + self._merge_tags(tags) + hash_value = result.get("hash") or result.get("matched_hash") + if isinstance(hash_value, str) and _looks_like_hash(hash_value): + self.state.hash = hash_value.lower() + self.state.tag_service_key = result.get("tag_service_key") or self.state.tag_service_key + self.state.duration = result.get("duration") or self.state.duration + self.state.filesize_mb = result.get("filesize_mb") or self.state.filesize_mb + self.state.is_video = bool(result.get("is_video") or self.state.is_video) + self.state.is_audio = bool(result.get("is_audio") or self.state.is_audio) + if result.get("is_deleted") is not None: + self.state.is_deleted = bool(result.get("is_deleted")) + if result.get("is_local") is not None: + self.state.is_local = bool(result.get("is_local")) + if result.get("has_current_file_service") is not None: + self.state.has_current_file_service = bool(result.get("has_current_file_service")) + # Consolidate relationships from explicit result or embedded metadata + relationships_obj: Optional[Dict[str, Any]] = None + if isinstance(result.get("relationships"), dict): + relationships_obj = result["relationships"] + self.state.relationships = relationships_obj + elif isinstance(self.state.relationships, dict): + relationships_obj = self.state.relationships + + # Helper to flatten any hashes from the relationships object + def _collect_hashes(obj: Any, acc: set[str]) -> None: + if obj is None: + return + if isinstance(obj, dict): + for v in obj.values(): + _collect_hashes(v, acc) + elif isinstance(obj, (list, tuple, set)): + for v in obj: + _collect_hashes(v, acc) + elif isinstance(obj, str) and _looks_like_hash(obj): + acc.add(obj.lower()) + + # Derive king and alts robustly from available data + king: Optional[str] = None + alts: list[str] = [] + + # 1) Try direct king fields on relationships object + rels = relationships_obj or {} + if isinstance(rels, dict): + # Common variants + for key in ("king", "king_hash", "duplicate_king", "best", "best_hash"): + val = rels.get(key) + if isinstance(val, str) and _looks_like_hash(val): + king = 
val.lower() + break + if isinstance(val, list): + for h in val: + if isinstance(h, str) and _looks_like_hash(h): + king = h.lower() + break + if king: + break + # 2) Extract alternates from known fields: numeric "3" (clips), or textual synonyms + for alt_key in ("3", "alternates", "alts", "clips"): + val = rels.get(alt_key) + if isinstance(val, list): + for h in val: + if isinstance(h, str) and _looks_like_hash(h): + h_low = h.lower() + if not king or h_low != king: + alts.append(h_low) + # some APIs might nest + elif isinstance(val, dict): + tmp: set[str] = set() + _collect_hashes(val, tmp) + for h in sorted(tmp): + if not king or h != king: + alts.append(h) + + # 3) Use relationship_metadata keys as additional alternates and king hint + rel_meta = result.get("relationship_metadata") + if isinstance(rel_meta, dict): + # prefer king candidate with no clip_time if not set + if not king: + for h, meta in rel_meta.items(): + if isinstance(h, str) and _looks_like_hash(h) and isinstance(meta, dict): + if not meta.get("clip_time"): + king = h.lower() + break + for h in rel_meta.keys(): + if isinstance(h, str) and _looks_like_hash(h): + h_low = h.lower() + if not king or h_low != king: + alts.append(h_low) + + # 4) As a last resort, flatten all relationship hashes + if not alts and relationships_obj: + tmp: set[str] = set() + _collect_hashes(relationships_obj, tmp) + for h in sorted(tmp): + if not king or h != king: + alts.append(h) + + # 5) Include current file when appropriate + if self.state.hash and (not king or self.state.hash != king) and self.state.hash not in alts: + alts.append(self.state.hash) + + # 6) Sort alternates by clip start time when available + rel_meta_all = result.get("relationship_metadata") if isinstance(result.get("relationship_metadata"), dict) else {} + def _clip_start_for(h: str) -> float: + meta = rel_meta_all.get(h) if isinstance(rel_meta_all, dict) else None + clip = meta.get("clip_time") if isinstance(meta, dict) else None + if isinstance(clip, str): + m = re.match(r"^(\d+)-(\d+)$", clip) + if m: + try: + return float(m.group(1)) + except Exception: + return float("inf") + return float("inf") + + if alts: + # de-duplicate while preserving earliest clip time ordering + seen: set[str] = set() + alts = [h for h in sorted(alts, key=_clip_start_for) if (h not in seen and not seen.add(h))] + + self.state.king = king + self.state.alts = alts + if isinstance(result.get("relationship_metadata"), dict): + self.state.relationship_metadata = result["relationship_metadata"] + self.state.title = result.get("title") or self.state.title + self.state.clip_time = result.get("clip_time") or self.state.clip_time + if result.get("swap_recommended"): + self.state.swap_recommended = True + warnings = result.get("warnings") or [] + if warnings: + self.state.warnings.extend(warnings) + + # ------------------------------------------------------------------ + # relationships enrichment (Hydrus endpoint + alt metadata) + # ------------------------------------------------------------------ + + def _enrich_relationships_from_api(self, file_hash: str) -> None: + """Fetch relationships for the given hash and enrich state's king/alts and alt metadata. + + - Uses GET /manage_file_relationships/get_file_relationships?hash=... + - If alts exist, batch-fetch their metadata via GET /get_files/file_metadata?hashes=[...] 
+ - Extracts title, duration, size, tags (cleaned: title: kept with namespace, others stripped) + """ + base_url = self.hydrus_settings.base_url or "" + access_key = self.hydrus_settings.access_key or "" + if not base_url: + return + try: + client = HydrusClient(base_url, access_key, timeout=self.hydrus_settings.timeout) + except Exception as exc: # pragma: no cover - construction should rarely fail + self.state.warnings.append(f"Hydrus client init failed: {exc}") + return + try: + rel_resp = client.get_file_relationships(file_hash) + except HydrusRequestError as hre: # pragma: no cover - surfaced but non-fatal + self.state.warnings.append(f"relationships api: {hre}") + return + except Exception as exc: # pragma: no cover + self.state.warnings.append(f"relationships api: {exc}") + return + + rel_map = rel_resp.get("file_relationships") or {} + rel_obj = None + if isinstance(rel_map, dict): + rel_obj = rel_map.get(file_hash) or next((v for v in rel_map.values() if isinstance(v, dict)), None) + if isinstance(rel_obj, dict): + # Preserve the full relationships object + self.state.relationships = rel_obj + # Update king and alts from canonical fields + king = rel_obj.get("king") + alts = rel_obj.get("3") or [] + if isinstance(king, str) and _looks_like_hash(king): + self.state.king = king.lower() + if isinstance(alts, list): + self.state.alts = [h.lower() for h in alts if isinstance(h, str) and _looks_like_hash(h)] + + # Fetch alt metadata if we have alts + if not self.state.alts: + return + try: + meta_resp = client.fetch_file_metadata( + hashes=self.state.alts, + include_service_keys_to_tags=True, + include_duration=True, + include_size=True, + include_file_urls=False, + include_mime=False, + ) + except HydrusRequestError as hre: # pragma: no cover + self.state.warnings.append(f"metadata api: {hre}") + return + except Exception as exc: # pragma: no cover + self.state.warnings.append(f"metadata api: {exc}") + return + + if not isinstance(meta_resp, dict): + return + entries = meta_resp.get("metadata") or [] + if not isinstance(entries, list): + return + + def _extract_tags(meta: Dict[str, Any]) -> list[str]: + tags: list[str] = [] + tag_root = meta.get("tags") or meta.get("service_keys_to_statuses_to_tags") or {} + if isinstance(tag_root, dict): + for service_dict in tag_root.values(): + if not isinstance(service_dict, dict): + continue + # Prefer storage_tags but fall back to any list values under known keys + storage = service_dict.get("storage_tags") + if isinstance(storage, dict): + for vals in storage.values(): + if isinstance(vals, list): + tags.extend([str(t) for t in vals if isinstance(t, str)]) + else: + # fall back: inspect lists directly under service_dict + for vals in service_dict.values(): + if isinstance(vals, list): + tags.extend([str(t) for t in vals if isinstance(t, str)]) + return tags + + def _clean_tags_and_title(all_tags: list[str]) -> tuple[Optional[str], list[str]]: + title_val: Optional[str] = None + cleaned: list[str] = [] + for tag in all_tags: + if not isinstance(tag, str): + continue + if tag.startswith("title:"): + if title_val is None: + title_val = tag.split(":", 1)[1] + cleaned.append(tag) # keep namespaced title + else: + if ":" in tag: + cleaned.append(tag.split(":", 1)[1]) + else: + cleaned.append(tag) + return title_val, cleaned + + for meta in entries: + if not isinstance(meta, dict): + continue + h = meta.get("hash") + if not (isinstance(h, str) and _looks_like_hash(h)): + continue + tags_all = _extract_tags(meta) + title_val, tags_clean = 
_clean_tags_and_title(tags_all) + alt_info = { + "title": title_val, + "duration": meta.get("duration"), + "size": meta.get("size"), + "tags": tags_clean, + } + self.state.relationship_metadata[h.lower()] = alt_info + + def _merge_tags(self, tags: Sequence[Any]) -> None: + incoming = _normalise_string_list(tags) + if not incoming: + return + combined = list(self.state.tags or []) + incoming + self.state.tags = unique_preserve_order(combined) + for tag in incoming: + if tag not in self.state.original_tags: + self.state.original_tags[tag] = tag + + def _merge_known_urls(self, urls: Optional[Iterable[Any]]) -> None: + if not urls: + return + combined = list(self.state.known_urls or []) + _normalise_string_list(urls) + self.state.known_urls = unique_preserve_order(combined) + + def _load_sidecar_tags(self, local_path: str) -> None: + try: + media_path = Path(local_path) + except Exception: + return + if not media_path.exists(): + return + candidates = [media_path.with_suffix(".tags"), media_path.with_suffix(".tags.txt")] + for candidate in candidates: + if candidate.exists(): + hash_value, tags, known = self._read_sidecar(candidate) + if hash_value and not self.state.hash and _looks_like_hash(hash_value): + self.state.hash = hash_value.lower() + self._merge_tags(tags) + self._merge_known_urls(known) + break + + def _read_sidecar(self, sidecar_path: Path) -> tuple[Optional[str], List[str], List[str]]: + try: + raw = sidecar_path.read_text(encoding="utf-8", errors="ignore") + except OSError: + return None, [], [] + hash_value: Optional[str] = None + tags: List[str] = [] + known_urls: List[str] = [] + for line in raw.splitlines(): + trimmed = line.strip() + if not trimmed: + continue + lowered = trimmed.lower() + if lowered.startswith("hash:"): + candidate = trimmed.split(":", 1)[1].strip() if ":" in trimmed else "" + if candidate: + hash_value = candidate + elif lowered.startswith("known_url:") or lowered.startswith("url:"): + candidate = trimmed.split(":", 1)[1].strip() if ":" in trimmed else "" + if candidate: + known_urls.append(candidate) + else: + tags.append(trimmed) + return hash_value, tags, known_urls + + def _compute_local_hash(self, local_path: str) -> None: + try: + digest = sha256_file(Path(local_path)) + except OSError as exc: + self.state.warnings.append(f"sha256 failed: {exc}") + return + self.state.hash = digest.lower() + + # ------------------------------------------------------------------ + # finalisation helpers + # ------------------------------------------------------------------ + + def _finalise(self) -> None: + if self.state.tags: + self.state.tags = unique_preserve_order(self.state.tags) + if self.state.known_urls: + self.state.known_urls = unique_preserve_order(self.state.known_urls) + # Ensure metadata.type is always present for Lua, but do NOT overwrite MPVfile.type + if not self.state.title: + if self.state.metadata.get("title"): + self.state.title = str(self.state.metadata["title"]).strip() + elif self.state.filename: + self.state.title = self.state.filename + if self.state.hash and not _looks_like_hash(self.state.hash): + self.state.hash = None + if self.state.relationship_metadata is None: + self.state.relationship_metadata = {} + if self.state.relationships is not None and not isinstance(self.state.relationships, dict): + self.state.relationships = None + if self.state.original_tags is None: + self.state.original_tags = {} + + # ------------------------------------------------------------------ + # util helpers + # 
------------------------------------------------------------------ + + @staticmethod + def _normalise_url(value: Any) -> Optional[str]: + if value is None: + return None + text = str(value).strip() + if not text: + return None + return text + + @staticmethod + def _resolve_local_path(path: Optional[str]) -> Optional[str]: + if not path: + return None + parsed = urlparse(path) + if parsed.scheme == "file": + decoded = MPVFileBuilder._decode_file_url(path) + return decoded + return path + + @staticmethod + def _decode_file_url(value: str) -> Optional[str]: + parsed = urlparse(value) + if parsed.scheme != "file": + return None + netloc = parsed.netloc or "" + path = unquote(parsed.path or "") + if netloc: + path = f"//{netloc}{path}" + if os.name == "nt" and path.startswith("/") and re.match(r"/[A-Za-z]:", path): + path = path[1:] + path = path.replace("/", os.sep) + return path + + def _looks_like_hydrus_url(self, url: str) -> bool: + if not url: + return False + if url.startswith("hydrus://"): + return True + if "Hydrus-Client-API-Access-Key=" in url: + return True + base = self.hydrus_settings.base_url + if base and url.startswith(base) and "/get_files/" in url: + return True + return False + + @staticmethod + def _extract_hash_from_path(path: Optional[str]) -> Optional[str]: + if not path: + return None + parsed = urlparse(path) + query = parse_qs(parsed.query) + if "hash" in query and query["hash"]: + candidate = query["hash"][0].strip() + if candidate: + return candidate.lower() + match = re.search(r"hash=([0-9a-fA-F]{64})", path) + if match: + return match.group(1).lower() + return None + + +def build_mpv_file_state(payload: Dict[str, Any], config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + builder = MPVFileBuilder(payload or {}, config or {}) + return builder.build() diff --git a/helper/progress.py b/helper/progress.py new file mode 100644 index 0000000..ce9e6a0 --- /dev/null +++ b/helper/progress.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python3 +"""Text-based progress bar utilities for consistent display across all downloads.""" + +import sys + +from helper.logger import log, debug + + +def format_progress_bar(current: int, total: int, width: int = 40, label: str = "") -> str: + """Create a text-based progress bar. + + Args: + current: Current progress (bytes/items) + total: Total to complete (bytes/items) + width: Width of the bar in characters (default 40) + label: Optional label prefix + + Returns: + Formatted progress bar string + + Examples: + format_progress_bar(50, 100) + # Returns: "[████████████████░░░░░░░░░░░░░░░░░░░░] 50.0%" + + format_progress_bar(256*1024*1024, 1024*1024*1024, label="download.zip") + # Returns: "download.zip: [████████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 25.0%" + """ + if total <= 0: + percentage = 0 + filled = 0 + else: + percentage = (current / total) * 100 + filled = int((current / total) * width) + + # Create bar: filled blocks + empty blocks + bar = "█" * filled + "░" * (width - filled) + + # Format percentage + pct_str = f"{percentage:.1f}%" + + # Build result + if label: + result = f"{label}: [{bar}] {pct_str}" + else: + result = f"[{bar}] {pct_str}" + + return result + + +def format_size(bytes_val: float) -> str: + """Format bytes to human-readable size. 
+ + Examples: + format_size(1024) -> "1.00 KB" + format_size(1024*1024) -> "1.00 MB" + format_size(1024*1024*1024) -> "1.00 GB" + """ + for unit in ['B', 'KB', 'MB', 'GB', 'TB']: + if bytes_val < 1024: + return f"{bytes_val:.2f} {unit}" + bytes_val /= 1024 + return f"{bytes_val:.2f} PB" + + +def format_download_status(filename: str, current: int, total: int, speed: float = 0) -> str: + """Format download status with progress bar and details. + + Args: + filename: Name of file being downloaded + current: Current bytes downloaded + total: Total file size + speed: Download speed in bytes/sec + + Returns: + Formatted status line + + Examples: + format_download_status("movie.mkv", 512*1024*1024, 2*1024*1024*1024, 10*1024*1024) + # Returns: "movie.mkv: [████████████░░░░░░░░░░░░░░░░░░░░░░░░░░] 25.0% (512.00 MB / 2.00 GB @ 10.00 MB/s)" + """ + bar = format_progress_bar(current, total, width=30) + size_current = format_size(current) + size_total = format_size(total) + + if speed > 0: + speed_str = f" @ {format_size(speed)}/s" + else: + speed_str = "" + + return f"{bar} ({size_current} / {size_total}{speed_str})" + + +def print_progress(filename: str, current: int, total: int, speed: float = 0, end: str = "\r") -> None: + """Print download progress to stderr (doesn't interfere with piped output). + + Args: + filename: File being downloaded + current: Current bytes + total: Total bytes + speed: Speed in bytes/sec + end: Line ending (default "\r" for overwriting, use "\n" for final) + """ + status = format_download_status(filename, current, total, speed) + debug(status, end=end, flush=True) + + +def print_final_progress(filename: str, total: int, elapsed: float) -> None: + """Print final progress line (100%) with time elapsed. + + Args: + filename: File that was downloaded + total: Total size + elapsed: Time elapsed in seconds + """ + bar = format_progress_bar(total, total, width=30) + size_str = format_size(total) + + # Format elapsed time + if elapsed < 60: + time_str = f"{elapsed:.1f}s" + elif elapsed < 3600: + minutes = elapsed / 60 + time_str = f"{minutes:.1f}m" + else: + hours = elapsed / 3600 + time_str = f"{hours:.2f}h" + + debug(f"{bar} ({size_str}) - {time_str}") + + +if __name__ == "__main__": + # Demo + import time + + log("Progress Bar Demo:", file=sys.stderr) + + # Demo 1: Simple progress + for i in range(101): + print_progress("demo.bin", i * 10 * 1024 * 1024, 1024 * 1024 * 1024) + time.sleep(0.02) + + print_final_progress("demo.bin", 1024 * 1024 * 1024, 2.0) + log() diff --git a/helper/query_parser.py b/helper/query_parser.py new file mode 100644 index 0000000..ec4ae9c --- /dev/null +++ b/helper/query_parser.py @@ -0,0 +1,159 @@ +"""Dynamic query parser for filtering and field extraction. + +Supports query syntax like: + - isbn:0557677203 + - author:"Albert Pike" + - title:"Morals and Dogma" + - year:2010 + - isbn:0557677203 author:"Albert Pike" + - Mixed with free text: "Morals" isbn:0557677203 + +This allows flexible query strings that can be parsed by any search provider +to extract specific fields for filtering and searching. +""" + +from typing import Dict, List, Tuple, Optional, Any +import re + + +def parse_query(query: str) -> Dict[str, Any]: + """Parse a query string into field:value pairs and free text. 
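    A worked example, consistent with the regex-based parsing implemented below
    (the query text is illustrative):

        parse_query('title:"Morals and Dogma" 1871')
        # -> {'fields': {'title': 'Morals and Dogma'}, 'text': '1871',
        #     'raw': 'title:"Morals and Dogma" 1871'}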
+ + Args: + query: Query string like 'isbn:0557677203 author:"Albert Pike" Morals' + + Returns: + Dictionary with: + - 'fields': Dict[field_name, field_value] for structured fields + - 'text': str with remaining free text + - 'raw': str original query + """ + result = { + 'fields': {}, + 'text': '', + 'raw': query, + } + + if not query or not query.strip(): + return result + + query = query.strip() + remaining_parts = [] + + # Pattern to match: field:value or field:"quoted value" + # Matches: word: followed by either quoted string or unquoted word + pattern = r'(\w+):(?:"([^"]*)"|(\S+))' + + pos = 0 + for match in re.finditer(pattern, query): + # Add any text before this match + if match.start() > pos: + before_text = query[pos:match.start()].strip() + if before_text: + remaining_parts.append(before_text) + + field_name = match.group(1).lower() + field_value = match.group(2) if match.group(2) is not None else match.group(3) + + result['fields'][field_name] = field_value + pos = match.end() + + # Add any remaining text after last match + if pos < len(query): + remaining_text = query[pos:].strip() + if remaining_text: + remaining_parts.append(remaining_text) + + result['text'] = ' '.join(remaining_parts) + + return result + + +def get_field(parsed_query: Dict[str, Any], field_name: str, default: Optional[str] = None) -> Optional[str]: + """Get a field value from parsed query, with optional default. + + Args: + parsed_query: Result from parse_query() + field_name: Field name to look up (case-insensitive) + default: Default value if field not found + + Returns: + Field value or default + """ + return parsed_query.get('fields', {}).get(field_name.lower(), default) + + +def has_field(parsed_query: Dict[str, Any], field_name: str) -> bool: + """Check if a field exists in parsed query. + + Args: + parsed_query: Result from parse_query() + field_name: Field name to check (case-insensitive) + + Returns: + True if field exists + """ + return field_name.lower() in parsed_query.get('fields', {}) + + +def get_free_text(parsed_query: Dict[str, Any]) -> str: + """Get the free text portion of a parsed query. + + Args: + parsed_query: Result from parse_query() + + Returns: + Free text or empty string + """ + return parsed_query.get('text', '') + + +def build_query_for_provider( + parsed_query: Dict[str, Any], + provider: str, + extraction_map: Optional[Dict[str, str]] = None +) -> Tuple[str, Dict[str, str]]: + """Build a search query and filters dict for a specific provider. + + Different providers have different search syntax. This function + extracts the appropriate fields for each provider. + + Args: + parsed_query: Result from parse_query() + provider: Provider name ('libgen', 'openlibrary', 'soulseek') + extraction_map: Optional mapping of field names to provider-specific names + e.g. 
{'isbn': 'isbn', 'author': 'author', 'title': 'title'} + + Returns: + Tuple of (search_query: str, extracted_fields: Dict[field, value]) + """ + extraction_map = extraction_map or {} + extracted = {} + free_text = get_free_text(parsed_query) + + # Extract fields based on map + for field_name, provider_key in extraction_map.items(): + if has_field(parsed_query, field_name): + extracted[provider_key] = get_field(parsed_query, field_name) + + # If provider-specific extraction needed, providers can implement it + # For now, return the free text as query + return free_text, extracted + + +if __name__ == '__main__': + # Test cases + test_queries = [ + 'isbn:0557677203', + 'isbn:0557677203 author:"Albert Pike"', + 'Morals and Dogma isbn:0557677203', + 'title:"Morals and Dogma" author:"Albert Pike" year:2010', + 'search term without fields', + 'author:"John Smith" title:"A Book"', + ] + + for query in test_queries: + print(f"\nQuery: {query}") + parsed = parse_query(query) + print(f" Fields: {parsed['fields']}") + print(f" Text: {parsed['text']}") diff --git a/helper/search_provider.py b/helper/search_provider.py new file mode 100644 index 0000000..93443c5 --- /dev/null +++ b/helper/search_provider.py @@ -0,0 +1,1777 @@ +""" +SearchProvider: Unified interface for different search backends. + +This module defines a base class and registry for search providers that can be +used by search-file and other search-related cmdlets to handle different sources: +- Local file storage (LocalStorageBackend) +- Hydrus database +- AllDebrid magnets (search-debrid) +- Library Genesis / OpenLibrary books (search-libgen) +- Soulseek P2P network (search-soulseek) +- IMDB movies (future) +- Other sources + +Usage: + from helper.search_provider import SearchProvider, get_provider + + provider = get_provider("libgen") + results = provider.search("python programming", limit=10) + + for result in results: + print(result["title"], result["target"], result["annotations"]) +""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional, Sequence, Tuple +from dataclasses import dataclass +from pathlib import Path +import sys +import subprocess +import json +import shutil + + +from helper.logger import log, debug + + +@dataclass +class SearchResult: + """Unified search result format across all providers.""" + + # Required fields + origin: str # Provider name: "libgen", "soulseek", "debrid", "local", "hydrus", etc. + title: str # Display title/filename + target: str # Unique identifier or download target (URL, path, magnet hash, etc.) + + # Optional fields + detail: str = "" # Additional details (size, status, format, etc.) + annotations: List[str] = None # Tags/annotations: ["ready", "120MB", "mp3", etc.] + media_kind: str = "other" # Type: "book", "audio", "video", "file", "magnet", etc. + size_bytes: Optional[int] = None # File size in bytes + tags: Optional[set[str]] = None # Searchable tags + full_metadata: Optional[Dict[str, Any]] = None # Extra metadata (author, year, etc.) + columns: List[Tuple[str, str]] = None # Display columns: [("Header", "value"), ...] 
for result table + + def __post_init__(self): + """Ensure mutable defaults are properly initialized.""" + if self.annotations is None: + self.annotations = [] + if self.tags is None: + self.tags = set() + if self.full_metadata is None: + self.full_metadata = {} + if self.columns is None: + self.columns = [] + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for JSON serialization.""" + data = { + "origin": self.origin, + "title": self.title, + "target": self.target, + "detail": self.detail, + "annotations": self.annotations, + "media_kind": self.media_kind, + "size_bytes": self.size_bytes, + "tags": list(self.tags) if self.tags else [], + "full_metadata": self.full_metadata, + } + if self.columns: + data["columns"] = list(self.columns) + return data + + +class SearchProvider(ABC): + """Abstract base class for search providers.""" + + # Provider-specific field definitions: list of (api_field_name, display_column_name, formatter_func) + # Override in subclasses to define which fields to request and how to display them + # Example: [("title", "Title", None), ("author_name", "Author(s)", lambda x: ", ".join(x) if isinstance(x, list) else x)] + RESULT_FIELDS: List[Tuple[str, str, Optional[Any]]] = [] + + def __init__(self, config: Dict[str, Any] = None): + """ + Initialize provider with optional configuration. + + Args: + config: Configuration dictionary (global config dict) + """ + self.config = config or {} + self.name = self.__class__.__name__.replace("Provider", "").lower() + + @abstractmethod + def search( + self, + query: str, + limit: int = 50, + filters: Optional[Dict[str, Any]] = None, + **kwargs + ) -> List[SearchResult]: + """ + Search for items matching the query. + + Args: + query: Search query string. Special value "*" means "match all" + limit: Maximum number of results to return + filters: Optional filtering criteria (type, size, status, etc.) + **kwargs: Provider-specific arguments + + Returns: + List of SearchResult objects + """ + pass + + @abstractmethod + def get_result_args(self) -> List[str]: + """ + Get command-line arguments from a search result to pass to downstream cmdlets. + + Example: For libgen, returns ["-url", result.target] + For soulseek, returns ["-id", result.target] + For local, returns ["-path", result.target] + + Returns: + List of arguments to append to cmdlet invocation + """ + pass + + def parse_args(self, args: Sequence[str]) -> Tuple[str, Dict[str, Any]]: + """ + Parse provider-specific command-line arguments. + + Args: + args: Sequence of command-line arguments + + Returns: + Tuple of (query, filters_dict) + """ + # Default implementation: first arg is query, rest are filters + query = args[0] if args else "" + filters = {} + return query, filters + + def validate(self) -> bool: + """ + Validate that provider is properly configured and ready to use. + + Returns: + True if provider is available, False otherwise + """ + return True + + def get_columns_format(self) -> List[str]: + """ + Define which columns this provider displays in result table. + + Returns: + List of column names to display. + Each provider can override to customize result table appearance. 
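        A sketch of how RESULT_FIELDS drives this method (the provider below is
        hypothetical and omits the abstract search/get_result_args methods):

            class BookProvider(SearchProvider):
                RESULT_FIELDS = [
                    ("title", "Title", None),
                    ("author_name", "Author(s)", lambda x: ", ".join(x) if isinstance(x, list) else x),
                ]
            # provider.get_columns_format()    -> ["Title", "Author(s)"]
            # provider.get_api_fields_string() -> "title,author_name"
            # provider.build_columns_from_doc({"title": "Dune", "author_name": ["F. Herbert"]}, idx=1)
            #   -> [("#", "1"), ("Title", "Dune"), ("Author(s)", "F. Herbert")]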
+ Examples: ["Title", "Author", "Year"] for books + ["Title", "Duration", "Format"] for media + ["Title", "Size", "Status"] for files + + Default: Empty list (uses traditional detail/origin/media_kind/target) + """ + return [col_name for _, col_name, _ in self.RESULT_FIELDS] if self.RESULT_FIELDS else [] + + def get_api_fields_string(self) -> str: + """ + Generate comma-separated API fields string from RESULT_FIELDS. + + Returns: + Comma-separated string of API field names to request + Example: "title,author_name,first_publish_year,isbn,key" + """ + if not self.RESULT_FIELDS: + return "" + return ",".join(field_name for field_name, _, _ in self.RESULT_FIELDS) + + def build_columns_from_doc(self, doc: Dict[str, Any], idx: int = None) -> List[Tuple[str, str]]: + """ + Dynamically build columns from a result document using RESULT_FIELDS definition. + + Args: + doc: API response document (dict with field values) + idx: Optional index/number for the result (typically added as first column) + + Returns: + List of (header, value) tuples ready for SearchResult.columns + """ + columns = [] + + # Add index as first column if provided + if idx is not None: + columns.append(("#", str(idx))) + + # Process each field definition + for api_field_name, display_col_name, formatter_func in self.RESULT_FIELDS: + value = doc.get(api_field_name, "") + + # Apply formatter if defined + if formatter_func and value: + value = formatter_func(value) + + # Convert to string and add to columns + value_str = str(value) if value else "Unknown" + columns.append((display_col_name, value_str)) + + return columns + + +class LocalStorageProvider(SearchProvider): + """Search provider for local file system storage.""" + + def __init__(self, config: Dict[str, Any] = None): + super().__init__(config) + self.name = "local" + # Import here to avoid circular dependency + from helper.file_storage import FileStorage + self.storage = FileStorage(config) + + def search( + self, + query: str, + limit: int = 50, + filters: Optional[Dict[str, Any]] = None, + **kwargs + ) -> List[SearchResult]: + """Search local file storage.""" + filters = filters or {} + backend_name = filters.get("backend", "local") + + try: + # Use the backend from FileStorage + results = self.storage[backend_name].search(query, limit=limit) + + search_results = [] + for result_dict in results: + path = result_dict.get("path", "") + size = result_dict.get("size") + annotations = [] + + if size: + annotations.append(f"{size / 1e6:.1f}MB") + + search_results.append(SearchResult( + origin="local", + title=path.split("\\")[-1] if path else "Unknown", + target=path, + detail=f"Local: {path}", + annotations=annotations, + size_bytes=size, + )) + + return search_results + + except Exception as e: + log(f"[local] Search error: {e}", file=sys.stderr) + return [] + + def get_result_args(self) -> List[str]: + """Local storage uses -path argument.""" + return ["-path"] + + def validate(self) -> bool: + """Local storage is always available.""" + return True + + +class LibGenProvider(SearchProvider): + """Search provider for Library Genesis books.""" + + # Define fields to display (note: LibGen doesn't have API field mapping like OpenLibrary) + # These are extracted from the book dict directly + RESULT_FIELDS = [ + ("title", "Title", None), + ("author", "Author(s)", None), + ("year", "Year", None), + ] + + def __init__(self, config: Dict[str, Any] = None): + super().__init__(config) + self.name = "libgen" + + def search( + self, + query: str, + limit: int = 50, + filters: 
Optional[Dict[str, Any]] = None, + **kwargs + ) -> List[SearchResult]: + """Search Library Genesis for books. + + Supports dynamic query format: + - isbn:0557677203 + - author:"Albert Pike" + - title:"Book Title" + - Combination: isbn:0557677203 author:"Albert Pike" free text + + Priority: ISBN is the authoritative key for searching. + """ + filters = filters or {} + + try: + from helper.unified_book_downloader import UnifiedBookDownloader + from helper.query_parser import parse_query, get_field, get_free_text + + debug(f"[libgen] Starting search for: {query}") + + # Parse the query to extract structured fields + parsed = parse_query(query) + isbn = get_field(parsed, 'isbn') + author = get_field(parsed, 'author') + title = get_field(parsed, 'title') + free_text = get_free_text(parsed) + + # Build the search query for libgen + # Priority: isbn (authoritative key) > title > author > free_text + if isbn: + search_query = isbn + elif title: + search_query = title + elif author: + search_query = author + else: + search_query = free_text or query + + debug(f"[libgen] Built search query: {search_query}") + + downloader = UnifiedBookDownloader(config=self.config) + search_fn = getattr(downloader, "search_libgen", None) + + if not callable(search_fn): + log("[libgen] Searcher unavailable", file=sys.stderr) + return [] + + debug(f"[libgen] Calling search_libgen with query: {search_query}") + books = search_fn(search_query, limit=limit) + debug(f"[libgen] Got {len(books) if books else 0} results from search_libgen") + + search_results = [] + for idx, book in enumerate(books, 1): + # Build columns dynamically from RESULT_FIELDS + columns = self.build_columns_from_doc(book, idx) + + title = book.get("title", "Unknown") + author = book.get("author", "Unknown") + year = book.get("year", "Unknown") + filesize = book.get("filesize_str", "Unknown") + isbn = book.get("isbn", "") + mirror_url = book.get("mirror_url", "") + + # Build detail with author and year + detail = f"By: {author}" + if year and year != "Unknown": + detail += f" ({year})" + + annotations = [f"{filesize}"] + if isbn: + annotations.append(f"ISBN: {isbn}") + + search_results.append(SearchResult( + origin="libgen", + title=title, + target=mirror_url or f"libgen:{book.get('id', '')}", + detail=detail, + annotations=annotations, + media_kind="book", + columns=columns, + full_metadata={ + "number": idx, + "author": author, + "year": year, + "isbn": isbn, + "filesize": filesize, + "mirrors": book.get("mirrors", {}), + "book_id": book.get("book_id", ""), + "md5": book.get("md5", ""), + }, + )) + + debug(f"[libgen] Returning {len(search_results)} formatted results") + return search_results + + except Exception as e: + log(f"[libgen] Search error: {e}", file=sys.stderr) + import traceback + log(traceback.format_exc(), file=sys.stderr) + return [] + + def get_result_args(self) -> List[str]: + """LibGen results use -url for download or -mirror for selection.""" + return ["-url"] + + def validate(self) -> bool: + """Check if LibGen downloader is available.""" + try: + from helper.unified_book_downloader import UnifiedBookDownloader + return True + except Exception: + return False + + +class SoulSeekProvider(SearchProvider): + """Search provider for Soulseek P2P network.""" + + # Allowed music file extensions + MUSIC_EXTENSIONS = { + '.flac', '.mp3', '.m4a', '.aac', '.ogg', '.opus', + '.wav', '.alac', '.wma', '.ape', '.aiff', '.dsf', + '.dff', '.wv', '.tta', '.tak', '.ac3', '.dts' + } + + # Display columns for search results + RESULT_FIELDS = [ + 
("track_num", "Track", None), + ("title", "Title", None), + ("artist", "Artist", lambda x: (str(x)[:32] + '...') if x and len(str(x)) > 35 else x), + ("album", "Album", lambda x: (str(x)[:32] + '...') if x and len(str(x)) > 35 else x), + ("size", "Size", lambda x: f"{int(int(x)/1024/1024)} MB" if x else ""), + ] + + # Soulseek config + USERNAME = "asjhkjljhkjfdsd334" + PASSWORD = "khhhg" + DOWNLOAD_DIR = "./downloads" + MAX_WAIT_TRANSFER = 1200 + + def __init__(self, config: Dict[str, Any] = None): + super().__init__(config) + self.name = "soulseek" + + async def perform_search( + self, + query: str, + timeout: float = 9.0, + limit: int = 50 + ) -> List[Dict[str, Any]]: + """Perform async Soulseek search and return flattened results.""" + import asyncio + import os + import re + import time + from aioslsk.client import SoulSeekClient + from aioslsk.settings import Settings, CredentialsSettings + + os.makedirs(self.DOWNLOAD_DIR, exist_ok=True) + + settings = Settings(credentials=CredentialsSettings(username=self.USERNAME, password=self.PASSWORD)) + client = SoulSeekClient(settings) + + try: + await client.start() + await client.login() + except Exception as e: + log(f"[soulseek] Login failed: {type(e).__name__}: {e}", file=sys.stderr) + return [] + + try: + search_request = await client.searches.search(query) + await self._collect_search_results(client, search_request, timeout=timeout) + flat = self._flatten_search_results(search_request)[:limit] + return flat + except Exception as e: + log(f"[soulseek] Search error: {type(e).__name__}: {e}", file=sys.stderr) + return [] + finally: + try: + await client.stop() + except Exception: + pass + + def _flatten_search_results(self, search_request) -> List[dict]: + """Extract files from SearchRequest.results.""" + flat: List[dict] = [] + for result in search_request.results: + username = getattr(result, "username", "?") + + for file_data in getattr(result, "shared_items", []): + flat.append({ + "file": file_data, + "username": username, + "filename": getattr(file_data, "filename", "?"), + "size": getattr(file_data, "filesize", 0), + }) + + for file_data in getattr(result, "locked_results", []): + flat.append({ + "file": file_data, + "username": username, + "filename": getattr(file_data, "filename", "?"), + "size": getattr(file_data, "filesize", 0), + }) + + return flat + + async def _collect_search_results(self, client, search_request, timeout: float = 75.0) -> None: + """Collect search results by waiting.""" + import asyncio + import time + debug(f"[soulseek] Collecting results for {timeout}s...") + end = time.time() + timeout + last_count = 0 + while time.time() < end: + current_count = len(search_request.results) + if current_count > last_count: + debug(f"[soulseek] Got {current_count} result(s) so far...") + last_count = current_count + await asyncio.sleep(0.5) + + async def download_file( + self, + username: str, + filename: str, + file_size: int, + target_dir: Optional[str] = None + ) -> bool: + """Download a file from Soulseek to a specific directory.""" + import asyncio + import os + import time + from aioslsk.client import SoulSeekClient + from aioslsk.settings import Settings, CredentialsSettings + from aioslsk.events import TransferProgressEvent + from tqdm import tqdm + + download_dir = target_dir if target_dir else self.DOWNLOAD_DIR + os.makedirs(download_dir, exist_ok=True) + + settings = Settings(credentials=CredentialsSettings(username=self.USERNAME, password=self.PASSWORD)) + settings.shares.download = download_dir + client = 
SoulSeekClient(settings) + + try: + await client.start() + await client.login() + + debug(f"[soulseek] Starting: {filename} from {username}") + + transfer = await client.transfers.download(username, filename) + if transfer is None: + log("[soulseek] Failed: transfer object is None") + return False + + success = await self._wait_for_transfer(client, transfer, file_size=file_size, max_wait=self.MAX_WAIT_TRANSFER) + + return success + + except Exception as e: + log(f"[soulseek] Download error: {type(e).__name__}: {e}", file=sys.stderr) + return False + + finally: + try: + await client.stop() + except Exception: + pass + + async def _wait_for_transfer(self, client, transfer_obj: Any, file_size: Any = None, max_wait: float = 1200) -> bool: + """Wait for transfer finish using event listeners with TQDM progress bar. + + Returns: + True if transfer completed successfully, False if failed or timed out. + """ + import asyncio + import time + from aioslsk.events import TransferProgressEvent + from tqdm import tqdm + + if transfer_obj is None: + log("[soulseek] No transfer object returned") + return False + + transfer_finished = False + transfer_success = False + pbar = None + total_size = file_size + last_speed_time = time.time() + last_speed = 0 + + async def on_progress(event): + nonlocal last_speed_time, last_speed, transfer_finished, transfer_success, pbar, total_size + if not hasattr(event, 'updates') or not event.updates: + return + + for transfer, _, curr_snapshot in event.updates: + if (transfer.username == transfer_obj.username and transfer.remote_path == transfer_obj.remote_path): + bytes_xfer = getattr(curr_snapshot, 'bytes_transfered', 0) + state_name = curr_snapshot.state.name if hasattr(curr_snapshot, 'state') else "?" + speed = getattr(curr_snapshot, 'speed', 0) + + if total_size is None and hasattr(transfer, 'file_attributes'): + try: + size = getattr(transfer, 'file_size', None) or getattr(transfer, 'size', None) + if size: + total_size = size + except Exception: + pass + + if pbar is None: + total = total_size if total_size else 100 * 1024 * 1024 + pbar = tqdm(total=total, unit='B', unit_scale=True, desc='[transfer]') + + if pbar: + pbar.n = bytes_xfer + if speed > 0: + pbar.set_postfix({"speed": f"{speed/1024:.1f} KB/s", "state": state_name}) + pbar.refresh() + + if state_name in ('FINISHED', 'COMPLETE'): + if pbar: + pbar.close() + debug(f"[soulseek] Transfer {state_name.lower()}") + transfer_finished = True + transfer_success = True + return + elif state_name in ('ABORTED', 'FAILED', 'PAUSED'): + if pbar: + pbar.close() + debug(f"[soulseek] Transfer {state_name.lower()}") + transfer_finished = True + transfer_success = False + return + + if total_size and bytes_xfer >= total_size: + if pbar: + pbar.close() + debug(f"[soulseek] Transfer complete ({bytes_xfer / 1024 / 1024:.1f} MB)") + transfer_finished = True + transfer_success = True + return + + if speed == 0 and bytes_xfer > 0: + now = time.time() + if now - last_speed_time > 3: + if pbar: + pbar.close() + debug(f"[soulseek] Transfer complete ({bytes_xfer / 1024 / 1024:.1f} MB)") + transfer_finished = True + transfer_success = True + return + else: + last_speed_time = time.time() + + last_speed = speed + + client.events.register(TransferProgressEvent, on_progress) + end = time.time() + max_wait + + while time.time() < end: + if transfer_finished: + break + await asyncio.sleep(0.5) + + client.events.unregister(TransferProgressEvent, on_progress) + + if pbar: + pbar.close() + + if not transfer_finished: + log(f"[soulseek] Timed 
out after {max_wait}s; transfer may still be in progress") + return False + else: + return transfer_success + + def search( + self, + query: str, + limit: int = 50, + filters: Optional[Dict[str, Any]] = None, + **kwargs + ) -> List[SearchResult]: + """Search Soulseek P2P network (synchronous wrapper).""" + import asyncio + import re + + filters = filters or {} + + try: + # Run async search + flat_results = asyncio.run(self.perform_search(query, timeout=9.0, limit=limit)) + + if not flat_results: + return [] + + # Filter to music files only + music_results = [] + for item in flat_results: + filename = item['filename'] + if '.' in filename: + ext = '.' + filename.rsplit('.', 1)[-1].lower() + else: + ext = '' + + if ext in self.MUSIC_EXTENSIONS: + music_results.append(item) + + if not music_results: + return [] + + # Extract metadata for all results + enriched_results = [] + for item in music_results: + filename = item['filename'] + + # Extract extension + if '.' in filename: + _, ext = filename.rsplit('.', 1) + ext = '.' + ext.lower() + else: + ext = '' + + # Get display filename + if '\\' in filename: + display_name = filename.rsplit('\\', 1)[-1] + elif '/' in filename: + display_name = filename.rsplit('/', 1)[-1] + else: + display_name = filename + + # Extract path hierarchy for artist/album + path_parts = filename.replace('\\', '/').split('/') + artist = '' + album = '' + + if len(path_parts) >= 3: + artist = path_parts[-3] + album = path_parts[-2] + if ' - ' in album and re.match(r'^\d{4}', album): + album = album.split(' - ', 1)[1] + elif len(path_parts) == 2: + artist = path_parts[-2] + + # Extract track number and title + base_name = display_name.rsplit('.', 1)[0] if '.' in display_name else display_name + track_num = '' + title = base_name + filename_artist = '' + + # First, extract track number if present (e.g., "30 Stumfol - Prisoner" -> track=30, rest="Stumfol - Prisoner") + match = re.match(r'^(\d{1,3})\s*[\.\-]?\s+(.+)$', base_name) + if match: + track_num = match.group(1) + remainder = match.group(2) + + # Now parse "Artist - Title" from the remainder + # If there's a " - " separator, split on it + if ' - ' in remainder: + parts = remainder.split(' - ', 1) + filename_artist = parts[0].strip() + title = parts[1].strip() + else: + # No artist-title separator, use the whole remainder as title + title = remainder + else: + # No track number, check if there's "Artist - Title" format + if ' - ' in base_name: + parts = base_name.split(' - ', 1) + filename_artist = parts[0].strip() + title = parts[1].strip() + + # Use filename_artist if extracted, otherwise fall back to path artist + if filename_artist: + artist = filename_artist + + enriched_results.append({ + **item, + 'artist': artist, + 'album': album, + 'title': title, + 'track_num': track_num, + 'ext': ext + }) + + # Apply filters if specified + if filters: + artist_filter = filters.get('artist', '').lower() if filters.get('artist') else '' + album_filter = filters.get('album', '').lower() if filters.get('album') else '' + track_filter = filters.get('track', '').lower() if filters.get('track') else '' + + if artist_filter or album_filter or track_filter: + filtered_results = [] + for item in enriched_results: + if artist_filter and artist_filter not in (item['artist'] or '').lower(): + continue + if album_filter and album_filter not in (item['album'] or '').lower(): + continue + if track_filter and track_filter not in (item['title'] or '').lower(): + continue + filtered_results.append(item) + + enriched_results = filtered_results 
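# The enrichment step above encodes a simple filename convention: an optional leading
# track number, then "Artist - Title". A minimal standalone restatement (illustrative
# only, not part of SoulSeekProvider; it mirrors the regex and ' - ' split used above):
import re

def parse_track_name(base_name: str) -> dict:
    track_num, artist, title = "", "", base_name
    match = re.match(r'^(\d{1,3})\s*[\.\-]?\s+(.+)$', base_name)  # leading track number
    remainder = match.group(2) if match else base_name
    if match:
        track_num = match.group(1)
    if ' - ' in remainder:  # "Artist - Title" convention
        artist, title = (part.strip() for part in remainder.split(' - ', 1))
    else:
        title = remainder.strip()
    return {"track_num": track_num, "artist": artist, "title": title}

# parse_track_name("30 Stumfol - Prisoner")
# -> {"track_num": "30", "artist": "Stumfol", "title": "Prisoner"}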
+ + # Sort: .flac first, then others + enriched_results.sort(key=lambda item: (item['ext'].lower() != '.flac', -item['size'])) + + # Convert to SearchResult format + search_results = [] + for idx, item in enumerate(enriched_results, 1): + artist_display = item['artist'] if item['artist'] else "(no artist)" + album_display = item['album'] if item['album'] else "(no album)" + size_mb = int(round(item['size'] / 1024 / 1024)) + + if item['track_num']: + track_title = f"[{item['track_num']}] {item['title']}" + else: + track_title = item['title'] or "(untitled)" + + # Build columns from enriched metadata + columns = self.build_columns_from_doc(item, idx=idx) + + search_results.append(SearchResult( + origin="soulseek", + title=track_title, + target=item['filename'], + detail=f"Artist: {artist_display} | Album: {album_display}", + annotations=[f"{size_mb} MB", item['ext']], + media_kind="audio", + size_bytes=item['size'], + columns=columns, + full_metadata={ + "artist": item['artist'], + "album": item['album'], + "track_num": item['track_num'], + "username": item['username'], + "filename": item['filename'], + "ext": item['ext'], + }, + )) + + return search_results + + except Exception as e: + log(f"Soulseek search error: {e}", file=sys.stderr) + return [] + + def get_result_args(self) -> List[str]: + """Soulseek results use filename/path for results.""" + return ["-path"] + + def validate(self) -> bool: + """Check if Soulseek client is available.""" + try: + import aioslsk # type: ignore + return True + except ImportError: + return False + + +class DebridProvider(SearchProvider): + """Search provider for AllDebrid magnets.""" + + # Status code mappings + STATUS_MAP = { + 0: "In Queue", + 1: "Downloading", + 2: "Compressing", + 3: "Uploading", + 4: "Ready", + 5: "Upload Failed", + 6: "Unpack Error", + 7: "Not Downloaded", + 8: "File Too Big", + 9: "Internal Error", + 10: "Download Timeout", + 11: "Deleted", + 12: "Processing Failed", + 13: "Processing Failed", + 14: "Tracker Error", + 15: "No Peers" + } + + def __init__(self, config: Dict[str, Any] = None): + super().__init__(config) + self.name = "debrid" + self._magnet_files_cache = {} + + def _format_size(self, bytes_val: float) -> str: + """Format bytes to human readable size.""" + for unit in ['B', 'KB', 'MB', 'GB', 'TB']: + if bytes_val < 1024: + return f"{bytes_val:.2f} {unit}" + bytes_val /= 1024 + return f"{bytes_val:.2f} PB" + + def _get_status_display(self, status_code: int) -> str: + """Get human-readable status for AllDebrid status codes.""" + return self.STATUS_MAP.get(status_code, f"Unknown ({status_code})") + + def _should_filter_magnet(self, status_code: int, status_text: str) -> bool: + """Check if magnet should be filtered out (expired/deleted).""" + # Filter expired/deleted entries + return status_code in (5, 6, 7, 8, 11, 12, 13, 14) + + def _fuzzy_match(self, text: str, pattern: str) -> bool: + """Check if pattern fuzzy-matches text (case-insensitive, substring matching).""" + return pattern.lower() in text.lower() + + def search( + self, + query: str, + limit: int = 50, + filters: Optional[Dict[str, Any]] = None, + **kwargs + ) -> List[SearchResult]: + """Search AllDebrid magnets with optional status and name filtering. 
+ + Args: + query: Search query (magnet filename or '*' for all) + limit: Max results to return + filters: Optional dict with 'status' filter ('all', 'active', 'ready', 'error') + + Returns: + List of SearchResult objects + """ + filters = filters or {} + + try: + from helper.alldebrid import AllDebridClient + from config import get_debrid_api_key + + api_key = get_debrid_api_key(self.config) + + if not api_key: + log("[debrid] API key not configured", file=sys.stderr) + return [] + + client = AllDebridClient(api_key) + + # Parse status filter + status_filter_param = filters.get('status', 'all').lower() if filters.get('status') else 'all' + + # Get magnets with optional status filter + response = client._request("magnet/status", {}) + + if response.get("status") != "success": + log(f"[debrid] API error: {response.get('error', 'Unknown')}", file=sys.stderr) + return [] + + magnets = response.get("data", {}).get("magnets", []) + + # Handle both list and dict formats + if isinstance(magnets, dict): + magnets = list(magnets.values()) + + # Filter by status if specified + if status_filter_param == 'active': + magnets = [m for m in magnets if m.get('statusCode', -1) in (0, 1, 2, 3)] + elif status_filter_param == 'ready': + magnets = [m for m in magnets if m.get('statusCode', -1) == 4] + elif status_filter_param == 'error': + magnets = [m for m in magnets if m.get('statusCode', -1) in (5, 6, 8, 9, 10, 12, 13, 14, 15)] + # 'all' includes everything + + # Filter by query (fuzzy match on filename) + results = [] + count = 0 + for magnet in magnets: + if count >= limit: + break + + filename = magnet.get("filename", "") + status_code = magnet.get("statusCode", -1) + status_text = magnet.get("status", "Unknown") + + # Skip expired/deleted unless 'all' filter + if status_filter_param != 'all' and self._should_filter_magnet(status_code, status_text): + continue + + # Apply query filter (skip if doesn't match) + if query and query != "*" and not self._fuzzy_match(filename, query): + continue + + magnet_id = magnet.get("id") + size = magnet.get("size", 0) + downloaded = magnet.get("downloaded", 0) + progress = (downloaded / size * 100) if size > 0 else 0 + + # Get status emoji + if status_code == 4: + status_emoji = "✓" + elif status_code < 4: + status_emoji = "⧗" + else: + status_emoji = "✗" + + annotations = [self._get_status_display(status_code)] + if size > 0: + annotations.append(self._format_size(size)) + if progress > 0 and progress < 100: + annotations.append(f"{progress:.1f}%") + + results.append(SearchResult( + origin="debrid", + title=filename or "Unknown", + target=str(magnet_id), + detail=f"{status_emoji} {self._get_status_display(status_code)} | {self._format_size(size)}", + annotations=annotations, + media_kind="magnet", + size_bytes=size, + full_metadata={ + "magnet_id": magnet_id, + "status_code": status_code, + "status_text": status_text, + "progress": progress, + "downloaded": downloaded, + "seeders": magnet.get("seeders", 0), + "download_speed": magnet.get("downloadSpeed", 0), + }, + )) + + count += 1 + + # Cache metadata for ready magnets + if results: + self._cache_ready_magnet_metadata(client, [r for r in results if r.full_metadata.get('status_code') == 4]) + + return results + + except Exception as e: + log(f"Debrid search error: {e}", file=sys.stderr) + return [] + + def _cache_ready_magnet_metadata(self, client, results: List[SearchResult]) -> None: + """Cache file metadata for ready magnets.""" + if not results: + return + + try: + ready_ids = [r.full_metadata.get('magnet_id') 
for r in results if r.full_metadata.get('status_code') == 4] + if ready_ids: + self._magnet_files_cache = client.magnet_links(ready_ids) + log(f"[debrid] Cached metadata for {len(self._magnet_files_cache)} ready magnet(s)", file=sys.stderr) + except Exception as e: + log(f"[debrid] Warning: Could not cache magnet metadata: {e}", file=sys.stderr) + + def get_magnet_metadata(self, magnet_id: int) -> Optional[Dict[str, Any]]: + """Get cached metadata for a magnet.""" + return self._magnet_files_cache.get(str(magnet_id)) + + def get_result_args(self) -> List[str]: + """Debrid results use magnet ID for download.""" + return ["-id"] + + def validate(self) -> bool: + """Check if AllDebrid is configured.""" + from config import get_debrid_api_key + return bool(get_debrid_api_key(self.config)) + + +class OpenLibraryProvider(SearchProvider): + """Search provider for OpenLibrary.""" + + # Define fields to request from API and how to display them + RESULT_FIELDS = [ + ("title", "Title", None), + ("author_name", "Author", lambda x: ", ".join(x) if isinstance(x, list) else x), + ("first_publish_year", "Year", None), + ("status", "Status", None), + ] + + def __init__(self, config: Dict[str, Any] = None): + super().__init__(config) + self.name = "openlibrary" + + def _derive_status(self, doc: Dict[str, Any]) -> tuple[str, Optional[str]]: + """Determine availability label and archive identifier.""" + ebook_access = str(doc.get("ebook_access", "") or "").strip().lower() + has_fulltext = bool(doc.get("has_fulltext")) + ia_entries = doc.get("ia") + archive_id = "" + if isinstance(ia_entries, list): + for entry in ia_entries: + if isinstance(entry, str) and entry.strip(): + archive_id = entry.strip() + break + elif isinstance(ia_entries, str) and ia_entries.strip(): + archive_id = ia_entries.strip() + elif isinstance(doc.get("ocaid"), str) and doc["ocaid"].strip(): + archive_id = doc["ocaid"].strip() + + available = False + if ebook_access in {"borrowable", "public", "full"}: + available = True + elif has_fulltext: + available = True + elif archive_id: + available = True + + status = "download" if available else "?Libgen" + return status, archive_id or None + + def search( + self, + query: str, + limit: int = 50, + filters: Optional[Dict[str, Any]] = None, + **kwargs + ) -> List[SearchResult]: + """Search OpenLibrary for books. + + Smart search that detects ISBN, OCLC, OpenLibrary ID, and falls back to title search. 
+ """ + filters = filters or {} + + try: + import requests + + query_clean = query.strip() + search_url = "https://openlibrary.org/search.json" + + # Try to detect query type (ISBN, OCLC, OL ID, or title) + if query_clean.isdigit() and len(query_clean) in (10, 13): + # ISBN search + url = f"https://openlibrary.org/isbn/{query_clean}.json" + response = requests.get(url, timeout=9) + if response.status_code == 200: + book_data = response.json() + return [self._format_isbn_result(book_data, query_clean)] + elif response.status_code == 404: + return [] + + # Default to title/general search + params = { + "q": query_clean, + "limit": limit, + "fields": f"{self.get_api_fields_string()},isbn,oclc_numbers,lccn,number_of_pages_median,language,key,ebook_access,ia,has_fulltext", + } + + response = requests.get(search_url, params=params, timeout=9) + response.raise_for_status() + data = response.json() + + search_results = [] + for idx, doc in enumerate(data.get("docs", []), 1): + # Extract OLID first (needed for metadata) + olid = doc.get("key", "").split("/")[-1] + + # Determine status/availability + status, archive_id = self._derive_status(doc) + doc["status"] = status + + # Build columns dynamically from RESULT_FIELDS (now includes status) + columns = self.build_columns_from_doc(doc, idx) + + # Extract additional metadata + title = doc.get("title", "Unknown") + authors = doc.get("author_name", ["Unknown"]) + year = doc.get("first_publish_year", "") + isbn_list = doc.get("isbn", []) + isbn = isbn_list[0] if isbn_list else "" + oclc_list = doc.get("oclc_numbers", []) + oclc = oclc_list[0] if oclc_list else "" + lccn_list = doc.get("lccn", []) + lccn = lccn_list[0] if lccn_list else "" + pages = doc.get("number_of_pages_median", "") + languages = doc.get("language", []) + language = languages[0] if languages else "" + + author_str = ", ".join(authors) if authors else "Unknown" + + # Build detail with author and year + detail = f"By: {author_str}" + if year: + detail += f" ({year})" + + # Build annotations with additional info + annotations = [] + if pages: + annotations.append(f"{pages} pages") + if isbn: + annotations.append(f"ISBN: {isbn}") + + search_results.append(SearchResult( + origin="openlibrary", + title=title, + target=f"https://openlibrary.org/books/{olid}", + detail=detail, + annotations=annotations, + media_kind="book", + columns=columns, + full_metadata={ + "number": idx, + "authors": authors, + "year": year, + "isbn": isbn, + "oclc": oclc, + "lccn": lccn, + "pages": pages, + "language": language, + "olid": olid, + "ebook_access": doc.get("ebook_access", ""), + "status": status, + "archive_id": archive_id, + }, + )) + + # Sort results: borrowable ones first, then not borrowable, then unknown + def sort_key(result): + status = (result.full_metadata.get("status") or "").strip().lower() + if status == "download": + return (0, result.title) + elif status.startswith("?libgen"): + return (1, result.title) + else: + return (2, result.title) + + search_results.sort(key=sort_key) + + # Rebuild number field after sorting + for new_idx, result in enumerate(search_results, 1): + result.full_metadata["number"] = new_idx + # Update the # column in columns + if result.columns and result.columns[0][0] == "#": + result.columns[0] = ("#", str(new_idx)) + + return search_results + + except Exception as e: + log(f"OpenLibrary search error: {e}", file=sys.stderr) + return [] + + def _format_isbn_result(self, book_data: Dict[str, Any], isbn: str) -> SearchResult: + """Format a book result from ISBN 
endpoint.""" + # Get title from book data + title = book_data.get("title", "Unknown") + + # Get authors + author_list = [] + for author_key in book_data.get("authors", []): + if isinstance(author_key, dict): + author_list.append(author_key.get("name", "")) + elif isinstance(author_key, str): + author_list.append(author_key) + + author_str = ", ".join(filter(None, author_list)) if author_list else "Unknown" + + # Extract other metadata + year = book_data.get("first_publish_year", "") + publishers = book_data.get("publishers", []) + publisher = publishers[0].get("name", "") if publishers and isinstance(publishers[0], dict) else "" + pages = book_data.get("number_of_pages", "") + languages = book_data.get("languages", []) + language = languages[0].get("key", "").replace("/languages/", "") if languages else "" + olid = book_data.get("key", "").split("/")[-1] if book_data.get("key") else "" + + # Build doc for column rendering + doc = { + "title": title, + "author_name": author_list, + "first_publish_year": year, + "ebook_access": book_data.get("ebook_access", ""), + "has_fulltext": bool(book_data.get("ocaid")), + "ia": [book_data.get("ocaid")] if book_data.get("ocaid") else [], + "ocaid": book_data.get("ocaid", ""), + } + status, archive_id = self._derive_status(doc) + doc["status"] = status + + # Build detail + detail = f"By: {author_str}" + if year: + detail += f" ({year})" + + # Build annotations + annotations = [] + if pages: + annotations.append(f"{pages} pages") + annotations.append(f"ISBN: {isbn}") + + # Build columns using shared helper for consistency + columns = self.build_columns_from_doc(doc, idx=1) + + return SearchResult( + origin="openlibrary", + title=title, + target=f"https://openlibrary.org/books/{olid}", + detail=detail, + annotations=annotations, + media_kind="book", + columns=columns, + full_metadata={ + "number": 1, + "authors": author_list, + "year": year, + "isbn": isbn, + "oclc": "", + "lccn": "", + "pages": pages, + "language": language, + "olid": olid, + "publisher": publisher, + "ebook_access": doc.get("ebook_access", ""), + "status": status, + "archive_id": archive_id, + }, + ) + + def get_result_args(self) -> List[str]: + """OpenLibrary results are info/links only.""" + return ["-info"] + + def validate(self) -> bool: + """OpenLibrary is always available (no auth needed).""" + return True + + +class GogGamesProvider(SearchProvider): + """Search provider for GOG Games.""" + + def __init__(self, config: Dict[str, Any] = None): + super().__init__(config) + self.name = "gog" + self.base_url = "https://gog-games.to" + self.headers = { + "Referer": "https://gog-games.to/", + "Origin": "https://gog-games.to", + "X-Requested-With": "XMLHttpRequest" + } + + def _request(self, client, endpoint: str, is_json: bool = True) -> Any: + """Helper for API requests.""" + url = f"{self.base_url}/api/web/{endpoint}" + try: + response = client.get(url, headers=self.headers) + if response.status_code == 200: + return response.json() if is_json else response.text + elif response.status_code == 404: + return None + else: + log(f"[gog] API request failed: {response.status_code} for {endpoint}", file=sys.stderr) + return None + except Exception as e: + log(f"[gog] Request error: {e}", file=sys.stderr) + return None + + def get_all_games(self, client) -> List[Dict[str, Any]]: + """Fetch all games from the API.""" + return self._request(client, "all-games") or [] + + def get_game_details(self, client, slug: str) -> Optional[Dict[str, Any]]: + """Fetch details for a specific game.""" + return 
self._request(client, f"query-game/{slug}") + + def get_game_md5(self, client, slug: str) -> Optional[str]: + """Fetch MD5 checksums for a game.""" + return self._request(client, f"download-md5/{slug}", is_json=False) + + def search( + self, + query: str, + limit: int = 50, + filters: Optional[Dict[str, Any]] = None, + **kwargs + ) -> List[SearchResult]: + """Search GOG Games.""" + from helper.http_client import HTTPClient + + results = [] + query_norm = query.strip().lower() + + with HTTPClient() as client: + # 1. Fetch all games to perform fuzzy search + all_games = self.get_all_games(client) + + matches = [] + if all_games: + for game in all_games: + if (query_norm in game.get("title", "").lower() or + query_norm in game.get("slug", "").lower()): + matches.append(game) + + # 2. Fallback: If no matches and query looks like a slug, try direct lookup + if not matches and "_" in query_norm: + details = self.get_game_details(client, query_norm) + if details and "game_info" in details: + matches.append(details["game_info"]) + + for game in matches[:limit]: + slug = game.get("slug") + title = game.get("title", slug) + infohash = game.get("infohash") + gog_url = game.get("gog_url", "") + + # Note: 'all-games' endpoint doesn't provide file size. + # We set size to 0 to avoid N+1 requests. + + if infohash: + magnet_link = f"magnet:?xt=urn:btih:{infohash}&dn={slug}" + results.append(SearchResult( + origin="gog", + title=title, + target=magnet_link, + media_kind="magnet", + detail="Magnet Link", + size_bytes=0, + annotations=["Magnet"], + full_metadata=game + )) + else: + results.append(SearchResult( + origin="gog", + title=title, + target=gog_url, + media_kind="game", + detail="No magnet available", + size_bytes=0, + annotations=["No Magnet"], + full_metadata=game + )) + + return results + + def get_result_args(self) -> List[str]: + """GOG results are URLs.""" + return ["-url"] + + def validate(self) -> bool: + """GOG Games is a public website.""" + return True + + +class YoutubeSearchProvider(SearchProvider): + """ + Search provider for YouTube using yt-dlp. + """ + + RESULT_FIELDS = [ + ("title", "Title", None), + ("uploader", "Uploader", None), + ("duration_string", "Duration", None), + ("view_count", "Views", lambda x: f"{x:,}" if x else ""), + ] + + def search(self, query: str, limit: int = 10, filters: Optional[Dict[str, Any]] = None, **kwargs) -> List[SearchResult]: + """ + Search YouTube using yt-dlp. + + Args: + query: Search query + limit: Maximum number of results + filters: Optional filtering criteria (ignored for now) + + Returns: + List of SearchResult objects + """ + # Check if yt-dlp is available + ytdlp_path = shutil.which("yt-dlp") + if not ytdlp_path: + log("yt-dlp not found in PATH", file=sys.stderr) + return [] + + # Construct command + # ytsearchN:query searches for N results + search_query = f"ytsearch{limit}:{query}" + + cmd = [ + ytdlp_path, + "--dump-json", + "--flat-playlist", # Don't resolve video details fully, faster + "--no-warnings", + search_query + ] + + try: + # Run yt-dlp + # We need to capture stdout. 
yt-dlp outputs one JSON object per line for search results + process = subprocess.run( + cmd, + capture_output=True, + text=True, + encoding="utf-8", + errors="replace" + ) + + if process.returncode != 0: + log(f"yt-dlp search failed: {process.stderr}", file=sys.stderr) + return [] + + results = [] + for line in process.stdout.splitlines(): + if not line.strip(): + continue + + try: + data = json.loads(line) + + # Extract fields + title = data.get("title", "Unknown Title") + url = data.get("url") + if not url: + # Sometimes flat-playlist gives 'id', construct URL + video_id = data.get("id") + if video_id: + url = f"https://www.youtube.com/watch?v={video_id}" + else: + continue + + uploader = data.get("uploader", "Unknown Uploader") + duration = data.get("duration") # seconds + view_count = data.get("view_count") + + # Format duration + duration_str = "" + if duration: + try: + m, s = divmod(int(duration), 60) + h, m = divmod(m, 60) + if h > 0: + duration_str = f"{h}:{m:02d}:{s:02d}" + else: + duration_str = f"{m}:{s:02d}" + except (ValueError, TypeError): + pass + + # Create annotations + annotations = [] + if duration_str: + annotations.append(duration_str) + if view_count: + # Simple format for views + try: + vc = int(view_count) + if vc >= 1000000: + views_str = f"{vc/1000000:.1f}M views" + elif vc >= 1000: + views_str = f"{vc/1000:.1f}K views" + else: + views_str = f"{vc} views" + annotations.append(views_str) + except (ValueError, TypeError): + pass + + annotations.append("youtube") + + # Create result + result = SearchResult( + origin="youtube", + title=title, + target=url, + detail=f"by {uploader}", + annotations=annotations, + media_kind="video", + full_metadata=data, + columns=[ + ("Title", title), + ("Uploader", uploader), + ("Duration", duration_str), + ("Views", str(view_count) if view_count else "") + ] + ) + results.append(result) + + except json.JSONDecodeError: + continue + + return results + + except Exception as e: + log(f"Error running yt-dlp: {e}", file=sys.stderr) + return [] + + def get_result_args(self) -> List[str]: + """YouTube results are URLs.""" + return ["-url"] + + def validate(self) -> bool: + """Check if yt-dlp is installed.""" + return shutil.which("yt-dlp") is not None + + +# Provider registry +_PROVIDERS = { + "local": LocalStorageProvider, + "libgen": LibGenProvider, + "soulseek": SoulSeekProvider, + "debrid": DebridProvider, + "openlibrary": OpenLibraryProvider, + "gog": GogGamesProvider, + "youtube": YoutubeSearchProvider, +} + + +def get_provider(name: str, config: Optional[Dict[str, Any]] = None) -> Optional[SearchProvider]: + """ + Get a search provider by name. + + Args: + name: Provider name (case-insensitive): "local", "libgen", "soulseek", "debrid", "openlibrary" + config: Optional configuration dictionary + + Returns: + SearchProvider instance or None if not found + """ + provider_class = _PROVIDERS.get(name.lower()) + + if provider_class is None: + log(f"Unknown search provider: {name}", file=sys.stderr) + return None + + try: + provider = provider_class(config) + if not provider.validate(): + log(f"Provider '{name}' is not properly configured or available", file=sys.stderr) + return None + return provider + + except Exception as e: + log(f"Error initializing provider '{name}': {e}", file=sys.stderr) + return None + + +def list_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str, bool]: + """ + List all available providers and whether they're available. 
+ + Args: + config: Optional configuration dictionary + + Returns: + Dictionary mapping provider names to availability (True/False) + """ + availability = {} + for name, provider_class in _PROVIDERS.items(): + try: + provider = provider_class(config) + availability[name] = provider.validate() + except Exception: + availability[name] = False + return availability + + +def register_provider(name: str, provider_class: type) -> None: + """ + Register a new search provider. + + Args: + name: Provider name (lowercase) + provider_class: Class that inherits from SearchProvider + """ + _PROVIDERS[name.lower()] = provider_class + + +class FileProvider(ABC): + """Abstract base class for file hosting providers.""" + + def __init__(self, config: Optional[Dict[str, Any]] = None): + self.config = config or {} + self.name = self.__class__.__name__.replace("FileProvider", "").lower() + + @abstractmethod + def upload(self, file_path: str) -> str: + """Upload a file and return the URL.""" + pass + + def validate(self) -> bool: + """Check if provider is available/configured.""" + return True + + +class ZeroXZeroFileProvider(FileProvider): + """File provider for 0x0.st.""" + + def __init__(self, config: Optional[Dict[str, Any]] = None): + super().__init__(config) + self.name = "0x0" + self.base_url = "https://0x0.st" + + def upload(self, file_path: str) -> str: + """Upload file to 0x0.st.""" + from helper.http_client import HTTPClient + import os + + if not os.path.exists(file_path): + raise FileNotFoundError(f"File not found: {file_path}") + + try: + # 0x0.st expects 'file' field in multipart/form-data + # Use a custom User-Agent to avoid 403 Forbidden + headers = {"User-Agent": "Medeia-Macina/1.0"} + with HTTPClient(headers=headers) as client: + with open(file_path, 'rb') as f: + files = {'file': f} + response = client.post(self.base_url, files=files) + + if response.status_code == 200: + return response.text.strip() + else: + raise Exception(f"Upload failed: {response.status_code} - {response.text}") + + except Exception as e: + log(f"[0x0] Upload error: {e}", file=sys.stderr) + raise + + def validate(self) -> bool: + return True + + +# File provider registry +_FILE_PROVIDERS = { + "0x0": ZeroXZeroFileProvider, +} + + +def get_file_provider(name: str, config: Optional[Dict[str, Any]] = None) -> Optional[FileProvider]: + """ + Get a file hosting provider by name. + + Args: + name: Provider name (case-insensitive): "0x0" + config: Optional configuration dictionary + + Returns: + FileProvider instance or None if not found + """ + provider_class = _FILE_PROVIDERS.get(name.lower()) + + if provider_class is None: + log(f"Unknown file provider: {name}", file=sys.stderr) + return None + + try: + provider = provider_class(config) + if not provider.validate(): + log(f"File provider '{name}' is not properly configured or available", file=sys.stderr) + return None + return provider + + except Exception as e: + log(f"Error initializing file provider '{name}': {e}", file=sys.stderr) + return None + + +def list_file_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str, bool]: + """ + List all available file hosting providers and whether they're available. 
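# Usage sketch for the registries above (illustrative only; the provider name, query,
# and file path are placeholder examples). get_provider() returns None when a name is
# unknown or the provider's validate() check fails, so callers should guard for that.
from helper.search_provider import get_provider, get_file_provider

provider = get_provider("openlibrary")
if provider is not None:
    for res in provider.search("dune", limit=5):
        print(res.title, res.target)

uploader = get_file_provider("0x0")
if uploader is not None:
    print(uploader.upload("example.txt"))  # returns the hosted URL on success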
+ + Args: + config: Optional configuration dictionary + + Returns: + Dictionary mapping provider names to availability (True/False) + """ + availability = {} + for name, provider_class in _FILE_PROVIDERS.items(): + try: + provider = provider_class(config) + availability[name] = provider.validate() + except Exception: + availability[name] = False + return availability + + +def register_file_provider(name: str, provider_class: type) -> None: + """ + Register a new file hosting provider. + + Args: + name: Provider name (lowercase) + provider_class: Class that inherits from FileProvider + """ + _FILE_PROVIDERS[name.lower()] = provider_class + + + + diff --git a/helper/tasks.py b/helper/tasks.py new file mode 100644 index 0000000..c5fd61c --- /dev/null +++ b/helper/tasks.py @@ -0,0 +1,155 @@ +"""Background task handling and IPC helpers for mpv integration.""" +from __future__ import annotations +import errno +import json +import os +import socket +import subprocess +import sys + +from helper.logger import log +import threading +import time +from typing import IO, Iterable +def connect_ipc(path: str, timeout: float = 5.0) -> IO[bytes] | None: + """Connect to the mpv IPC server located at *path*.""" + deadline = time.time() + timeout + if not path: + return None + if os.name == 'nt': + # mpv exposes a named pipe on Windows. Keep retrying until it is ready. + while True: + try: + return open(path, 'r+b', buffering=0) + except FileNotFoundError: + if time.time() > deadline: + return None + time.sleep(0.05) + except OSError as exc: # Pipe busy + if exc.errno not in (errno.ENOENT, errno.EPIPE, errno.EBUSY): + raise + if time.time() > deadline: + return None + time.sleep(0.05) + else: + sock = socket.socket(socket.AF_UNIX) + while True: + try: + sock.connect(path) + return sock.makefile('r+b', buffering=0) + except FileNotFoundError: + if time.time() > deadline: + return None + time.sleep(0.05) + except OSError as exc: + if exc.errno not in (errno.ENOENT, errno.ECONNREFUSED): + raise + if time.time() > deadline: + return None + time.sleep(0.05) +def ipc_sender(ipc: IO[bytes] | None): + """Create a helper function for sending script messages via IPC.""" + if ipc is None: + def _noop(_event: str, _payload: dict) -> None: + return None + return _noop + lock = threading.Lock() + def _send(event: str, payload: dict) -> None: + message = json.dumps({'command': ['script-message', event, json.dumps(payload)]}, ensure_ascii=False) + encoded = message.encode('utf-8') + b'\n' + with lock: + try: + ipc.write(encoded) + ipc.flush() + except OSError: + pass + return _send +def iter_stream(stream: Iterable[str]) -> Iterable[str]: + for raw in stream: + yield raw.rstrip('\r\n') +def _run_task(args, parser) -> int: + if not args.command: + parser.error('run-task requires a command to execute (use "--" before the command).') + env = os.environ.copy() + for entry in args.env: + key, sep, value = entry.partition('=') + if not sep: + parser.error(f'Invalid environment variable definition: {entry!r}') + env[key] = value + command = list(args.command) + if command and command[0] == '--': + command.pop(0) + notifier = ipc_sender(connect_ipc(args.ipc, timeout=args.ipc_timeout)) + if not command: + notifier('downlow-task-event', { + 'id': args.task_id, + 'event': 'error', + 'message': 'No command provided after separator', + }) + log('[downlow.py] No command provided for run-task', file=sys.stderr) + return 1 + if command and isinstance(command[0], str) and sys.executable: + first = command[0].lower() + if first in {'python', 
'python3', 'py', 'python.exe', 'python3.exe', 'py.exe'}: + command[0] = sys.executable + if os.environ.get('DOWNLOW_DEBUG'): + log(f"Launching command: {command}", file=sys.stderr) + notifier('downlow-task-event', { + 'id': args.task_id, + 'event': 'start', + 'command': command, + 'cwd': args.cwd or os.getcwd(), + }) + try: + process = subprocess.Popen( + command, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + cwd=args.cwd or None, + env=env, + text=True, + bufsize=1, + universal_newlines=True, + ) + except FileNotFoundError as exc: + notifier('downlow-task-event', { + 'id': args.task_id, + 'event': 'error', + 'message': f'Executable not found: {exc.filename}', + }) + log(f"{exc}", file=sys.stderr) + return 1 + stdout_lines: list[str] = [] + stderr_lines: list[str] = [] + def pump(stream: IO[str], label: str, sink: list[str]) -> None: + for line in iter_stream(stream): + sink.append(line) + notifier('downlow-task-event', { + 'id': args.task_id, + 'event': label, + 'line': line, + }) + threads = [] + if process.stdout: + t_out = threading.Thread(target=pump, args=(process.stdout, 'stdout', stdout_lines), daemon=True) + t_out.start() + threads.append(t_out) + if process.stderr: + t_err = threading.Thread(target=pump, args=(process.stderr, 'stderr', stderr_lines), daemon=True) + t_err.start() + threads.append(t_err) + return_code = process.wait() + for t in threads: + t.join(timeout=0.1) + notifier('downlow-task-event', { + 'id': args.task_id, + 'event': 'exit', + 'returncode': return_code, + 'success': return_code == 0, + }) + # Also mirror aggregated output to stdout/stderr for compatibility when IPC is unavailable. + if stdout_lines: + log('\n'.join(stdout_lines)) + if stderr_lines: + log('\n'.join(stderr_lines), file=sys.stderr) + return return_code diff --git a/helper/unified_book_downloader.py b/helper/unified_book_downloader.py new file mode 100644 index 0000000..1ce6f87 --- /dev/null +++ b/helper/unified_book_downloader.py @@ -0,0 +1,706 @@ +"""Unified book downloader - handles Archive.org borrowing and Libgen fallback. + +This module provides a single interface for downloading books from multiple sources: +1. Try Archive.org direct download (if available) +2. Try Archive.org borrowing (if user has credentials) +3. Fallback to Libgen search by ISBN +4. Attempt Libgen download + +All sources integrated with proper metadata scraping and error handling. +""" + +import logging +import asyncio +import requests +from typing import Optional, Dict, Any, Tuple, List, Callable, cast +from pathlib import Path + +from helper.logger import debug + +logger = logging.getLogger(__name__) + + +class UnifiedBookDownloader: + """Unified interface for downloading books from multiple sources.""" + + def __init__(self, config: Optional[Dict[str, Any]] = None, output_dir: Optional[str] = None): + """Initialize the unified book downloader. 
+ + Args: + config: Configuration dict with credentials + output_dir: Default output directory + """ + self.config = config or {} + self.output_dir = output_dir + self.session = requests.Session() + + # Import download functions from their modules + self._init_downloaders() + + def _init_downloaders(self) -> None: + """Initialize downloader functions from their modules.""" + try: + from helper.archive_client import ( + check_direct_download, + get_openlibrary_by_isbn, + loan + ) + self.check_direct_download = check_direct_download + self.get_openlibrary_by_isbn = get_openlibrary_by_isbn + self.loan_func = loan + logger.debug("[UnifiedBookDownloader] Loaded archive.org downloaders from archive_client") + except Exception as e: + logger.warning(f"[UnifiedBookDownloader] Failed to load archive.org functions: {e}") + self.check_direct_download = None + self.get_openlibrary_by_isbn = None + self.loan_func = None + + try: + from helper.libgen_service import ( + DEFAULT_LIMIT as _LIBGEN_DEFAULT_LIMIT, + download_from_mirror as _libgen_download, + search_libgen as _libgen_search, + ) + + def _log_info(message: str) -> None: + debug(f"[UnifiedBookDownloader] {message}") + + def _log_error(message: str) -> None: + logger.error(f"[UnifiedBookDownloader] {message}") + + self.search_libgen = lambda query, limit=_LIBGEN_DEFAULT_LIMIT: _libgen_search( + query, + limit=limit, + log_info=_log_info, + log_error=_log_error, + ) + self.download_from_mirror = lambda mirror_url, output_path: _libgen_download( + mirror_url, + output_path, + log_info=_log_info, + log_error=_log_error, + ) + logger.debug("[UnifiedBookDownloader] Loaded Libgen helpers") + except Exception as e: + logger.warning(f"[UnifiedBookDownloader] Failed to load Libgen helpers: {e}") + self.search_libgen = None + self.download_from_mirror = None + + def get_download_options(self, book_data: Dict[str, Any]) -> Dict[str, Any]: + """Get all available download options for a book. + + Checks in priority order: + 1. Archive.org direct download (public domain) + 2. Archive.org borrowing (if credentials available and book is borrowable) + 3. 
Libgen fallback (by ISBN) + + Args: + book_data: Book metadata dict with at least 'openlibrary_id' or 'isbn' + + Returns: + Dict with available download methods and metadata + """ + options = { + 'book_title': book_data.get('title', 'Unknown'), + 'book_author': book_data.get('author', 'Unknown'), + 'isbn': book_data.get('isbn', ''), + 'openlibrary_id': book_data.get('openlibrary_id', ''), + 'methods': [], # Will be sorted by priority + 'metadata': {} + } + + # Extract book ID from openlibrary_id (e.g., OL8513721M -> 8513721, OL8513721W -> 8513721) + ol_id = book_data.get('openlibrary_id', '') + book_id = None + + if ol_id.startswith('OL') and len(ol_id) > 2: + # Remove 'OL' prefix (keep everything after it including the suffix letter) + # The book_id is all digits after 'OL' + book_id = ''.join(c for c in ol_id[2:] if c.isdigit()) + + # PRIORITY 1: Check direct download (fastest, no auth needed) + if self.check_direct_download: + try: + can_download, pdf_url = self.check_direct_download(book_id) + if can_download: + options['methods'].append({ + 'type': 'archive.org_direct', + 'label': 'Archive.org Direct Download', + 'requires_auth': False, + 'pdf_url': pdf_url, + 'book_id': book_id, + 'priority': 1 # Highest priority + }) + logger.info(f"[UnifiedBookDownloader] Direct download available for {book_id}") + except Exception as e: + logger.debug(f"[UnifiedBookDownloader] Direct download check failed: {e}") + + # PRIORITY 2: Check borrowing option (requires auth, 14-day loan) + # First verify the book is actually lendable via OpenLibrary API + if self._has_archive_credentials(): + is_lendable, status = self._check_book_lendable_status(ol_id) + + if is_lendable: + options['methods'].append({ + 'type': 'archive.org_borrow', + 'label': 'Archive.org Borrow', + 'requires_auth': True, + 'book_id': book_id, + 'priority': 2 # Second priority + }) + logger.info(f"[UnifiedBookDownloader] Borrow option available for {book_id} (status: {status})") + else: + logger.debug(f"[UnifiedBookDownloader] Borrow not available for {book_id} (status: {status})") + + # PRIORITY 3: Check Libgen fallback (by ISBN, no auth needed, most reliable) + isbn = book_data.get('isbn', '') + title = book_data.get('title', '') + author = book_data.get('author', '') + + if self.search_libgen: + # Can use Libgen if we have ISBN OR title (or both) + if isbn or title: + options['methods'].append({ + 'type': 'libgen', + 'label': 'Libgen Search & Download', + 'requires_auth': False, + 'isbn': isbn, + 'title': title, + 'author': author, + 'priority': 3 # Third priority (fallback) + }) + logger.info(f"[UnifiedBookDownloader] Libgen fallback available (ISBN: {isbn if isbn else 'N/A'}, Title: {title})") + + # Sort by priority (higher priority first) + options['methods'].sort(key=lambda x: x.get('priority', 999)) + + return options + + def _has_archive_credentials(self) -> bool: + """Check if Archive.org credentials are available.""" + try: + from helper.archive_client import credential_openlibrary + email, password = credential_openlibrary(self.config) + return bool(email and password) + except Exception: + return False + + def _check_book_lendable_status(self, ol_id: str) -> Tuple[bool, Optional[str]]: + """Check if a book is lendable via OpenLibrary API. 
+ + Queries: https://openlibrary.org/api/volumes/brief/json/OLID:{ol_id} + Note: Only works with Edition IDs (OL...M), not Work IDs (OL...W) + + Args: + ol_id: OpenLibrary ID (e.g., OL8513721M for Edition or OL4801915W for Work) + + Returns: + Tuple of (is_lendable: bool, status_reason: Optional[str]) + """ + try: + if not ol_id.startswith('OL'): + return False, "Invalid OpenLibrary ID format" + + # If this is a Work ID (ends with W), we can't query Volumes API + # Work IDs are abstract umbrella records, not specific editions + if ol_id.endswith('W'): + logger.debug(f"[UnifiedBookDownloader] Work ID {ol_id} - skipping Volumes API (not lendable)") + return False, "Work ID not supported by Volumes API (not a specific edition)" + + # If it ends with M, it's an Edition ID - proceed with query + if not ol_id.endswith('M'): + logger.debug(f"[UnifiedBookDownloader] Unknown ID type {ol_id} (not M or W)") + return False, "Invalid OpenLibrary ID type" + + url = f"https://openlibrary.org/api/volumes/brief/json/OLID:{ol_id}" + response = self.session.get(url, timeout=10) + response.raise_for_status() + data = response.json() + + # Empty response means no records found + if not data: + logger.debug(f"[UnifiedBookDownloader] Empty response for {ol_id}") + return False, "No availability data found" + + # The response is wrapped in OLID key + olid_key = f"OLID:{ol_id}" + if olid_key not in data: + logger.debug(f"[UnifiedBookDownloader] OLID key not found in response") + return False, "No availability data found" + + olid_data = data[olid_key] + + # Check items array for lendable status + if 'items' in olid_data and olid_data['items'] and len(olid_data['items']) > 0: + items = olid_data['items'] + + # Check the first item for lending status + first_item = items[0] + + # Handle both dict and string representations (PowerShell converts to string) + if isinstance(first_item, dict): + status = first_item.get('status', '') + else: + # String representation - check if 'lendable' is in it + status = str(first_item).lower() + + is_lendable = 'lendable' in str(status).lower() + + if is_lendable: + logger.info(f"[UnifiedBookDownloader] Book {ol_id} is lendable") + return True, "LENDABLE" + else: + status_str = status.get('status', 'NOT_LENDABLE') if isinstance(status, dict) else 'NOT_LENDABLE' + logger.debug(f"[UnifiedBookDownloader] Book {ol_id} is not lendable (status: {status_str})") + return False, status_str + else: + # No items array or empty + logger.debug(f"[UnifiedBookDownloader] No items found for {ol_id}") + return False, "Not available for lending" + + except requests.exceptions.Timeout: + logger.warning(f"[UnifiedBookDownloader] OpenLibrary API timeout for {ol_id}") + return False, "API timeout" + except Exception as e: + logger.debug(f"[UnifiedBookDownloader] Failed to check lendable status for {ol_id}: {e}") + return False, f"API error" + + + async def download_book(self, method: Dict[str, Any], output_dir: Optional[str] = None) -> Tuple[bool, str]: + """Download a book using the specified method. 
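# Condensed, illustrative restatement of the lendable-status check implemented in
# _check_book_lendable_status() above: only Edition IDs (OL...M) are queried against
# the OpenLibrary Volumes "brief" endpoint; Work IDs (OL...W) are skipped. Logging and
# most error handling are omitted here.
import requests

def is_lendable(ol_edition_id: str) -> bool:
    if not (ol_edition_id.startswith("OL") and ol_edition_id.endswith("M")):
        return False
    url = f"https://openlibrary.org/api/volumes/brief/json/OLID:{ol_edition_id}"
    data = requests.get(url, timeout=10).json()
    items = (data.get(f"OLID:{ol_edition_id}") or {}).get("items") or []
    if not items:
        return False
    first = items[0]
    status = first.get("status", "") if isinstance(first, dict) else str(first)
    return "lendable" in str(status).lower()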
+ + Args: + method: Download method dict from get_download_options() + output_dir: Directory to save the book + + Returns: + Tuple of (success: bool, message: str) + """ + output_dir = output_dir or self.output_dir or str(Path.home() / "Downloads") + method_type = method.get('type', '') + + logger.info(f"[UnifiedBookDownloader] Starting download with method: {method_type}") + + try: + if method_type == 'archive.org_direct': + return await self._download_archive_direct(method, output_dir) + + elif method_type == 'archive.org_borrow': + return await self._download_archive_borrow(method, output_dir) + + elif method_type == 'libgen': + return await self._download_libgen(method, output_dir) + + else: + return False, f"Unknown download method: {method_type}" + + except Exception as e: + logger.error(f"[UnifiedBookDownloader] Download error: {e}", exc_info=True) + return False, f"Download failed: {str(e)}" + + async def _download_archive_direct(self, method: Dict[str, Any], output_dir: str) -> Tuple[bool, str]: + """Download directly from Archive.org.""" + try: + pdf_url = method.get('pdf_url', '') + book_id = method.get('book_id', '') + + if not pdf_url: + return False, "No PDF URL available" + + # Determine output filename + filename = f"{book_id}.pdf" + output_path = Path(output_dir) / filename + + logger.info(f"[UnifiedBookDownloader] Downloading PDF from: {pdf_url}") + + # Download in a thread to avoid blocking + loop = asyncio.get_event_loop() + success = await loop.run_in_executor( + None, + self._download_file, + pdf_url, + str(output_path) + ) + + if success: + logger.info(f"[UnifiedBookDownloader] Successfully downloaded to: {output_path}") + return True, f"Downloaded to: {output_path}" + else: + return False, "Failed to download PDF" + + except Exception as e: + logger.error(f"[UnifiedBookDownloader] Archive direct download error: {e}") + return False, f"Archive download failed: {str(e)}" + + async def _download_archive_borrow(self, method: Dict[str, Any], output_dir: str) -> Tuple[bool, str]: + """Download via Archive.org borrowing (requires credentials). + + Process (follows archive_client.py pattern): + 1. Login to Archive.org with credentials + 2. Call loan endpoint to borrow the book (14-day loan) + 3. Get book info (page links, metadata) + 4. Download all pages as images + 5. 
Merge images into PDF + + The loan function from archive_client.py handles: + - Checking if book needs borrowing (status 400 = "doesn't need to be borrowed") + - Creating borrow token for access + - Handling borrow failures + + get_book_infos() extracts page links from the borrowed book viewer + download() downloads all pages using thread pool + img2pdf merges pages into searchable PDF + """ + try: + from helper.archive_client import credential_openlibrary + + book_id = method.get('book_id', '') + + # Get credentials + email, password = credential_openlibrary(self.config) + if not email or not password: + return False, "Archive.org credentials not configured" + + logger.info(f"[UnifiedBookDownloader] Logging into Archive.org...") + + # Login and borrow (in thread, following download_book.py pattern) + loop = asyncio.get_event_loop() + borrow_result = await loop.run_in_executor( + None, + self._archive_borrow_and_download, + email, + password, + book_id, + output_dir + ) + + if borrow_result and isinstance(borrow_result, tuple): + success, filepath = borrow_result + if success: + logger.info(f"[UnifiedBookDownloader] Borrow succeeded: {filepath}") + return True, filepath + else: + logger.warning(f"[UnifiedBookDownloader] Borrow failed: {filepath}") + return False, filepath + else: + return False, "Failed to borrow book from Archive.org" + + except Exception as e: + logger.error(f"[UnifiedBookDownloader] Archive borrow error: {e}") + return False, f"Archive borrow failed: {str(e)}" + + async def _download_libgen(self, method: Dict[str, Any], output_dir: str) -> Tuple[bool, str]: + """Download via Libgen search and download with mirror fallback.""" + try: + isbn = method.get('isbn', '') + title = method.get('title', '') + + if not isbn and not title: + return False, "Need ISBN or title for Libgen search" + + if not self.search_libgen: + return False, "Libgen searcher not available" + + # Define wrapper functions to safely call the methods + search_func = self.search_libgen + if search_func is None: + return False, "Search function not available" + + preloaded_results = method.get('results') + loop = asyncio.get_event_loop() + + if preloaded_results: + results = list(preloaded_results) + if not results: + results = await loop.run_in_executor(None, lambda: search_func(isbn or title, 10)) + else: + results = await loop.run_in_executor(None, lambda: search_func(isbn or title, 10)) + + if not results: + logger.warning(f"[UnifiedBookDownloader] No Libgen results for: {isbn or title}") + return False, f"No Libgen results found for: {isbn or title}" + + logger.info(f"[UnifiedBookDownloader] Found {len(results)} Libgen results") + + # Determine output filename (use first result for naming) + first_result = results[0] + filename = f"{first_result.get('title', 'book')}" + filename = "".join(c for c in filename if c.isalnum() or c in (' ', '.', '-'))[:100] + + # Try each result's mirror until one succeeds + for idx, result in enumerate(results, 1): + mirror_url = result.get('mirror_url', '') + + if not mirror_url: + logger.debug(f"[UnifiedBookDownloader] Result {idx}: No mirror URL") + continue + + # Use extension from this result if available + extension = result.get('extension', 'pdf') + if extension and not extension.startswith('.'): + extension = f".{extension}" + elif not extension: + extension = '.pdf' + + output_path = Path(output_dir) / (filename + extension) + + logger.info(f"[UnifiedBookDownloader] Trying mirror {idx}/{len(results)}: {mirror_url}") + + download_func = self.download_from_mirror 
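# Minimal standalone sketch of the mirror-fallback pattern used in this loop: try each
# mirror in order and accept the first download that is not an HTML error page. The
# helper signature and marker strings here are illustrative assumptions, not the exact
# code of _download_libgen().
from pathlib import Path
from typing import Callable, Iterable, Optional

def try_mirrors(mirrors: Iterable[str], output_path: Path,
                download: Callable[[str, str], bool]) -> Optional[Path]:
    for url in mirrors:
        if not url or not download(url, str(output_path)):
            continue
        head = output_path.read_bytes()[:1024].decode("utf-8", errors="ignore").lower()
        if "<html" in head or "<!doctype" in head:
            output_path.unlink(missing_ok=True)  # error page served by the mirror; try the next one
            continue
        return output_path
    return None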
+ if download_func is None: + return False, "Download function not available" + + download_callable = cast(Callable[[str, str], bool], download_func) + + def download_wrapper(): + return download_callable(mirror_url, str(output_path)) + + # Download (in thread) + try: + success = await loop.run_in_executor(None, download_wrapper) + + if success: + # Validate downloaded file is not HTML (common Libgen issue) + if output_path.exists(): + try: + with open(output_path, 'rb') as f: + file_start = f.read(1024).decode('utf-8', errors='ignore').lower() + if ' Tuple[bool, str]: + """Download a specific Libgen result with optional fallbacks.""" + + if not isinstance(selected, dict): + return False, "Selected result must be a dictionary" + + ordered_results: List[Dict[str, Any]] = [selected] + if remaining: + for item in remaining: + if isinstance(item, dict) and item is not selected: + ordered_results.append(item) + + method: Dict[str, Any] = { + 'type': 'libgen', + 'isbn': selected.get('isbn', '') or '', + 'title': selected.get('title', '') or '', + 'author': selected.get('author', '') or '', + 'results': ordered_results, + } + + return await self.download_book(method, output_dir) + + def download_libgen_selection_sync( + self, + selected: Dict[str, Any], + remaining: Optional[List[Dict[str, Any]]] = None, + output_dir: Optional[str] = None, + ) -> Tuple[bool, str]: + """Synchronous helper for downloading a Libgen selection.""" + + async def _run() -> Tuple[bool, str]: + return await self.download_libgen_selection(selected, remaining, output_dir) + + loop = asyncio.new_event_loop() + try: + asyncio.set_event_loop(loop) + return loop.run_until_complete(_run()) + finally: + loop.close() + asyncio.set_event_loop(None) + + def _download_file(self, url: str, output_path: str) -> bool: + """Download a file from URL.""" + try: + response = requests.get(url, stream=True, timeout=30) + response.raise_for_status() + + with open(output_path, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + + return True + except Exception as e: + logger.error(f"[UnifiedBookDownloader] File download error: {e}") + return False + + def _archive_borrow_and_download(self, email: str, password: str, book_id: str, output_dir: str) -> Tuple[bool, str]: + """Borrow a book from Archive.org and download pages as PDF. + + This follows the exact process from archive_client.py: + 1. Login with credentials + 2. Call loan() to create 14-day borrow + 3. Get book info (extract page URLs) + 4. Download all pages as images + 5. 
Merge images into searchable PDF + + Returns tuple of (success: bool, filepath/message: str) + """ + try: + from helper.archive_client import login, loan, get_book_infos, download + import tempfile + import shutil + + logger.info(f"[UnifiedBookDownloader] Logging into Archive.org as {email}") + session = login(email, password) + + logger.info(f"[UnifiedBookDownloader] Attempting to borrow book: {book_id}") + # Call loan to create the 14-day borrow + session = loan(session, book_id, verbose=True) + + # If we get here, borrowing succeeded + logger.info(f"[UnifiedBookDownloader] Successfully borrowed book: {book_id}") + + # Now get the book info (page URLs and metadata) + logger.info(f"[UnifiedBookDownloader] Extracting book page information...") + # Try both URL formats: with /borrow and without + book_urls = [ + f"https://archive.org/borrow/{book_id}", # Try borrow page first (for borrowed books) + f"https://archive.org/details/{book_id}" # Fallback to details page + ] + + title = None + links = None + metadata = None + last_error = None + + for book_url in book_urls: + try: + logger.debug(f"[UnifiedBookDownloader] Trying to get book info from: {book_url}") + response = session.get(book_url, timeout=10) + + # Log response status + if response.status_code != 200: + logger.debug(f"[UnifiedBookDownloader] URL returned {response.status_code}: {book_url}") + # Continue to try next URL + continue + + # Try to parse the response + title, links, metadata = get_book_infos(session, book_url) + logger.info(f"[UnifiedBookDownloader] Successfully got info from: {book_url}") + logger.info(f"[UnifiedBookDownloader] Found {len(links)} pages to download") + break + except Exception as e: + logger.debug(f"[UnifiedBookDownloader] Failed with {book_url}: {e}") + last_error = e + continue + + if links is None: + logger.error(f"[UnifiedBookDownloader] Failed to get book info from all URLs: {last_error}") + # Borrow extraction failed - return False + return False, "Could not extract borrowed book pages" + + # Create temporary directory for images + temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=output_dir) + logger.info(f"[UnifiedBookDownloader] Downloading {len(links)} pages to temporary directory...") + + try: + # Download all pages (uses thread pool) + images = download( + session=session, + n_threads=10, + directory=temp_dir, + links=links, + scale=3, # Default resolution + book_id=book_id + ) + + logger.info(f"[UnifiedBookDownloader] Downloaded {len(images)} pages") + + # Try to merge pages into PDF + try: + import img2pdf + logger.info(f"[UnifiedBookDownloader] Merging pages into PDF...") + + # Prepare PDF metadata + pdfmeta = {} + if metadata: + if "title" in metadata: + pdfmeta["title"] = metadata["title"] + if "creator" in metadata: + pdfmeta["author"] = metadata["creator"] + pdfmeta["keywords"] = [f"https://archive.org/details/{book_id}"] + pdfmeta["creationdate"] = None # Avoid timezone issues + + # Convert images to PDF + pdf_content = img2pdf.convert(images, **pdfmeta) if images else None + if not pdf_content: + logger.error(f"[UnifiedBookDownloader] PDF conversion failed") + return False, "Failed to convert pages to PDF" + + # Save the PDF + pdf_filename = f"{title}.pdf" if title else "book.pdf" + pdf_path = Path(output_dir) / pdf_filename + + # Handle duplicate filenames + i = 1 + while pdf_path.exists(): + pdf_path = Path(output_dir) / f"{title or 'book'}({i}).pdf" + i += 1 + + with open(pdf_path, 'wb') as f: + f.write(pdf_content) + + logger.info(f"[UnifiedBookDownloader] Successfully 
created PDF: {pdf_path}") + + return True, str(pdf_path) + + except ImportError: + logger.warning(f"[UnifiedBookDownloader] img2pdf not available, saving as JPG collection instead") + + # Create JPG collection directory + if not title: + title = f"book_{book_id}" + jpg_dir = Path(output_dir) / title + i = 1 + while jpg_dir.exists(): + jpg_dir = Path(output_dir) / f"{title}({i})" + i += 1 + + # Move temporary directory to final location + shutil.move(temp_dir, str(jpg_dir)) + temp_dir = None # Mark as already moved + + logger.info(f"[UnifiedBookDownloader] Saved as JPG collection: {jpg_dir}") + return True, str(jpg_dir) + + finally: + # Clean up temporary directory if it still exists + if temp_dir and Path(temp_dir).exists(): + shutil.rmtree(temp_dir) + + except SystemExit: + # loan() function calls sys.exit on failure - catch it + logger.error(f"[UnifiedBookDownloader] Borrow process exited (book may not be borrowable)") + return False, "Book could not be borrowed (may not be available for borrowing)" + except Exception as e: + logger.error(f"[UnifiedBookDownloader] Archive borrow error: {e}") + return False, f"Borrow failed: {str(e)}" + + def close(self) -> None: + """Close the session.""" + self.session.close() diff --git a/helper/utils.py b/helper/utils.py new file mode 100644 index 0000000..385fbac --- /dev/null +++ b/helper/utils.py @@ -0,0 +1,492 @@ +"""General-purpose helpers used across the downlow CLI.""" +from __future__ import annotations + +import json +import hashlib +import ffmpeg +import base64 +import logging +import time +from pathlib import Path +from typing import Any, Iterable +from datetime import datetime +from dataclasses import dataclass, field +from fnmatch import fnmatch +from urllib.parse import urlparse + +import helper.utils_constant + +try: + import cbor2 +except ImportError: + cbor2 = None # type: ignore + +CHUNK_SIZE = 1024 * 1024 # 1 MiB +_format_logger = logging.getLogger(__name__) +def ensure_directory(path: Path) -> None: + """Ensure *path* exists as a directory.""" + try: + path.mkdir(parents=True, exist_ok=True) + except OSError as exc: # pragma: no cover - surfaced to caller + raise RuntimeError(f"Failed to create directory {path}: {exc}") from exc +def unique_path(path: Path) -> Path: + """Return a unique path by appending " (n)" if needed.""" + if not path.exists(): + return path + stem = path.stem + suffix = path.suffix + parent = path.parent + counter = 1 + while True: + candidate = parent / f"{stem} ({counter}){suffix}" + if not candidate.exists(): + return candidate + counter += 1 + +def sanitize_metadata_value(value: Any) -> str | None: + if value is None: + return None + if not isinstance(value, str): + value = str(value) + value = value.replace('\x00', ' ').replace('\r', ' ').replace('\n', ' ').strip() + if not value: + return None + return value +def unique_preserve_order(values: Iterable[str]) -> list[str]: + seen: set[str] = set() + ordered: list[str] = [] + for value in values: + if value not in seen: + seen.add(value) + ordered.append(value) + return ordered +def sha256_file(file_path: Path) -> str: + """Return the SHA-256 hex digest of *path*.""" + hasher = hashlib.sha256() + with file_path.open('rb') as handle: + for chunk in iter(lambda: handle.read(CHUNK_SIZE), b''): + hasher.update(chunk) + return hasher.hexdigest() + + +def create_metadata_sidecar(file_path: Path, metadata: dict) -> None: + """Create a .metadata sidecar file with JSON metadata. + + The metadata dict should contain title. 
If not present, it will be derived from + the filename. This ensures the .metadata file can be matched during batch import. + + Args: + file_path: Path to the exported file + metadata: Dictionary of metadata to save + """ + if not metadata: + return + file_name = file_path.stem + file_ext = file_path.suffix.lower() + # Ensure metadata has a title field that matches the filename (without extension) + # This allows the sidecar to be matched and imported properly during batch import + if 'title' not in metadata or not metadata.get('title'): + metadata['title'] = file_name + metadata['hash'] = sha256_file(file_path) + metadata['size'] = Path(file_path).stat().st_size + format_found = False + for mime_type, ext_map in helper.utils_constant.mime_maps.items(): + for key, info in ext_map.items(): + if info.get("ext") == file_ext: + metadata['type'] = mime_type + format_found = True + break + if format_found: + break + else: + metadata['type'] = 'unknown' + metadata.update(ffprobe(str(file_path))) + + + metadata_path = file_path.with_suffix(file_path.suffix + '.metadata') + try: + with open(metadata_path, 'w', encoding='utf-8') as f: + json.dump(metadata, f, ensure_ascii=False, indent=2) + except OSError as exc: + raise RuntimeError(f"Failed to write metadata sidecar {metadata_path}: {exc}") from exc + +def create_tags_sidecar(file_path: Path, tags: set) -> None: + """Create a .tags sidecar file with tags (one per line). + + Args: + file_path: Path to the exported file + tags: Set of tag strings + """ + if not tags: + return + + tags_path = file_path.with_suffix(file_path.suffix + '.tags') + try: + with open(tags_path, 'w', encoding='utf-8') as f: + for tag in sorted(tags): + f.write(f"{tag}\n") + except Exception as e: + raise RuntimeError(f"Failed to create tags sidecar {tags_path}: {e}") from e + + +def ffprobe(file_path: str) -> dict: + probe = ffmpeg.probe(file_path) + metadata = {} + + # Format-level info + fmt = probe.get("format", {}) + metadata["duration"] = float(fmt.get("duration", 0)) if "duration" in fmt else None + metadata["size"] = int(fmt.get("size", 0)) if "size" in fmt else None + metadata["format_name"] = fmt.get("format_name", None) + + # Stream-level info + for stream in probe.get("streams", []): + codec_type = stream.get("codec_type") + if codec_type == "audio": + metadata["audio_codec"] = stream.get("codec_name") + metadata["bitrate"] = int(stream.get("bit_rate", 0)) if "bit_rate" in stream else None + metadata["samplerate"] = int(stream.get("sample_rate", 0)) if "sample_rate" in stream else None + metadata["channels"] = int(stream.get("channels", 0)) if "channels" in stream else None + elif codec_type == "video": + metadata["video_codec"] = stream.get("codec_name") + metadata["width"] = int(stream.get("width", 0)) if "width" in stream else None + metadata["height"] = int(stream.get("height", 0)) if "height" in stream else None + elif codec_type == "image": + metadata["image_codec"] = stream.get("codec_name") + metadata["width"] = int(stream.get("width", 0)) if "width" in stream else None + metadata["height"] = int(stream.get("height", 0)) if "height" in stream else None + + return metadata + + +# ============================================================================ +# CBOR Utilities - Consolidated from cbor.py +# ============================================================================ +"""CBOR utilities backed by the `cbor2` library.""" + + +def decode_cbor(data: bytes) -> Any: + """Decode *data* from CBOR into native Python objects.""" + if not data: + return 
None + if cbor2 is None: + raise ImportError("cbor2 library is required for CBOR decoding") + return cbor2.loads(data) + + +def jsonify(value: Any) -> Any: + """Convert *value* into a JSON-friendly structure.""" + if isinstance(value, dict): + return {str(key): jsonify(val) for key, val in value.items()} + if isinstance(value, list): + return [jsonify(item) for item in value] + if isinstance(value, bytes): + return {"__bytes__": base64.b64encode(value).decode("ascii")} + return value + + +# ============================================================================ +# Format Utilities - Consolidated from format_utils.py +# ============================================================================ +"""Formatting utilities for displaying metadata consistently across the application.""" + + +def format_bytes(bytes_value) -> str: + """Format bytes to human-readable format (e.g., '1.5 MB', '250 KB'). + + Args: + bytes_value: Size in bytes (int or float) + + Returns: + Formatted string like '1.5 MB' or '756 MB' + """ + if bytes_value is None or bytes_value <= 0: + return "0 B" + + if isinstance(bytes_value, (int, float)): + for unit in ("B", "KB", "MB", "GB", "TB"): + if bytes_value < 1024: + if unit == "B": + return f"{int(bytes_value)} {unit}" + return f"{bytes_value:.1f} {unit}" + bytes_value /= 1024 + return f"{bytes_value:.1f} PB" + return str(bytes_value) + + +def format_duration(seconds) -> str: + """Format duration in seconds to human-readable format (e.g., '1h 23m 5s', '5m 30s'). + + Args: + seconds: Duration in seconds (int or float) + + Returns: + Formatted string like '1:23:45' or '5:30' + """ + if seconds is None or seconds == '': + return "N/A" + + if isinstance(seconds, str): + try: + seconds = float(seconds) + except ValueError: + return str(seconds) + + if not isinstance(seconds, (int, float)): + return str(seconds) + + total_seconds = int(seconds) + if total_seconds < 0: + return "N/A" + + hours = total_seconds // 3600 + minutes = (total_seconds % 3600) // 60 + secs = total_seconds % 60 + + if hours > 0: + return f"{hours}:{minutes:02d}:{secs:02d}" + elif minutes > 0: + return f"{minutes}:{secs:02d}" + else: + return f"{secs}s" + + +def format_timestamp(timestamp_str) -> str: + """Format ISO timestamp to readable format. + + Args: + timestamp_str: ISO format timestamp string or None + + Returns: + Formatted string like "2025-10-28 19:36:01" or original string if parsing fails + """ + if not timestamp_str: + return "N/A" + + try: + # Handle ISO format timestamps + if isinstance(timestamp_str, str): + # Try parsing ISO format + if 'T' in timestamp_str: + dt = datetime.fromisoformat(timestamp_str.replace('Z', '+00:00')) + else: + # Try other common formats + dt = datetime.fromisoformat(timestamp_str) + return dt.strftime("%Y-%m-%d %H:%M:%S") + except Exception as e: + _format_logger.debug(f"Could not parse timestamp '{timestamp_str}': {e}") + + return str(timestamp_str) + + +def format_metadata_value(key: str, value) -> str: + """Format a metadata value based on its key for display. + + This is the central formatting rule for all metadata display. 
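+
+    Example (illustrative only; the sample values are hypothetical):
+
+        format_metadata_value('size', 1536000)    # -> '1.5 MB'  (via format_bytes)
+        format_metadata_value('duration', 3725)   # -> '1:02:05' (via format_duration)
+        format_metadata_value('title', 'My Clip') # -> 'My Clip' (falls through to str())
+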
+ + Args: + key: Metadata field name + value: Value to format + + Returns: + Formatted string for display + """ + if value is None or value == '': + return "N/A" + + # Apply field-specific formatting + if key in ('size', 'file_size'): + return format_bytes(value) + elif key in ('duration', 'length'): + return format_duration(value) + elif key in ('time_modified', 'time_imported', 'created_at', 'updated_at', 'indexed_at', 'timestamp'): + return format_timestamp(value) + else: + return str(value) + + +# ============================================================================ +# Link Utilities - Consolidated from link_utils.py +# ============================================================================ +"""Link utilities - Extract and process URLs from various sources.""" + + +def extract_link_from_args(args: Iterable[str]) -> Any | None: + """Extract HTTP/HTTPS URL from command arguments. + + Args: + args: Command arguments + + Returns: + URL string if found, None otherwise + """ + args_list = list(args) if not isinstance(args, (list, tuple)) else args + if not args_list or len(args_list) == 0: + return None + + potential_link = str(args_list[0]) + if potential_link.startswith(('http://', 'https://')): + return potential_link + + return None + + +def extract_link_from_result(result: Any) -> Any | None: + """Extract URL from a result object (dict or object with attributes). + + Args: + result: Result object from pipeline (dict or object) + + Returns: + URL string if found, None otherwise + """ + if isinstance(result, dict): + return result.get('url') or result.get('link') or result.get('href') + + return ( + getattr(result, 'url', None) or + getattr(result, 'link', None) or + getattr(result, 'href', None) + ) + + +def extract_link(result: Any, args: Iterable[str]) -> Any | None: + """Extract link from args or result (args take priority). + + Args: + result: Pipeline result object + args: Command arguments + + Returns: + URL string if found, None otherwise + """ + # Try args first + link = extract_link_from_args(args) + if link: + return link + + # Fall back to result + return extract_link_from_result(result) + + +def get_api_key(config: dict[str, Any], service: str, key_path: str) -> str | None: + """Get API key from config with fallback support. + + Args: + config: Configuration dictionary + service: Service name for logging + key_path: Dot-notation path to key (e.g., "Debrid.All-debrid") + + Returns: + API key if found and not empty, None otherwise + """ + try: + parts = key_path.split('.') + value = config + for part in parts: + if isinstance(value, dict): + value = value.get(part) + else: + return None + + if isinstance(value, str): + return value.strip() or None + + return None + except Exception: + return None + + +def add_direct_link_to_result(result: Any, direct_link: str, original_link: str) -> None: + """Add direct link information to result object. 
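+
+    Example (minimal sketch; the result dict and URLs are hypothetical):
+
+        result = {'title': 'Some file', 'url': 'https://host/locked'}
+        add_direct_link_to_result(result, 'https://cdn.example/direct', 'https://host/locked')
+        # result now also carries 'direct_link' and 'original_link'
+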
+ + Args: + result: Result object to modify (dict or object) + direct_link: The unlocked/direct URL + original_link: The original restricted URL + """ + if isinstance(result, dict): + result['direct_link'] = direct_link + result['original_link'] = original_link + else: + setattr(result, 'direct_link', direct_link) + setattr(result, 'original_link', original_link) + + +# ============================================================================ +# URL Policy Resolution - Consolidated from url_parser.py +# ============================================================================ +"""URL policy resolution for downlow workflows.""" + + +@dataclass(slots=True) +class UrlPolicy: + """Describe how a URL should be handled by download and screenshot flows.""" + + skip_download: bool = False + skip_metadata: bool = False + force_screenshot: bool = False + extra_tags: list[str] = field(default_factory=list) + + def apply_tags(self, sources: Iterable[str]) -> list[str]: + tags = [tag.strip() for tag in self.extra_tags if tag and tag.strip()] + for value in sources: + text = str(value).strip() + if text: + tags.append(text) + return tags + + +def _normalise_rule(rule: dict[str, Any]) -> dict[str, Any] | None: + pattern = str(rule.get("pattern") or rule.get("host") or "").strip() + if not pattern: + return None + skip_download = bool(rule.get("skip_download")) + skip_metadata = bool(rule.get("skip_metadata")) + force_screenshot = bool(rule.get("force_screenshot")) + extra_tags_raw = rule.get("extra_tags") + if isinstance(extra_tags_raw, str): + extra_tags = [part.strip() for part in extra_tags_raw.split(",") if part.strip()] + elif isinstance(extra_tags_raw, (list, tuple, set)): + extra_tags = [str(item).strip() for item in extra_tags_raw if str(item).strip()] + else: + extra_tags = [] + return { + "pattern": pattern, + "skip_download": skip_download, + "skip_metadata": skip_metadata, + "force_screenshot": force_screenshot, + "extra_tags": extra_tags, + } + + +def resolve_url_policy(config: dict[str, Any], url: str) -> UrlPolicy: + policies_raw = config.get("url_policies") + if not policies_raw: + return UrlPolicy() + if not isinstance(policies_raw, list): + return UrlPolicy() + parsed = urlparse(url) + subject = f"{parsed.netloc}{parsed.path}" + host = parsed.netloc + resolved = UrlPolicy() + for rule_raw in policies_raw: + if not isinstance(rule_raw, dict): + continue + rule = _normalise_rule(rule_raw) + if rule is None: + continue + pattern = rule["pattern"] + if not (fnmatch(host, pattern) or fnmatch(subject, pattern)): + continue + if rule["skip_download"]: + resolved.skip_download = True + if rule["skip_metadata"]: + resolved.skip_metadata = True + if rule["force_screenshot"]: + resolved.force_screenshot = True + if rule["extra_tags"]: + for tag in rule["extra_tags"]: + if tag not in resolved.extra_tags: + resolved.extra_tags.append(tag) + return resolved \ No newline at end of file diff --git a/helper/utils_constant.py b/helper/utils_constant.py new file mode 100644 index 0000000..b6cc1c0 --- /dev/null +++ b/helper/utils_constant.py @@ -0,0 +1,79 @@ +mime_maps = { + "image": { + "jpg": { "ext": ".jpg", "mimes": ["image/jpeg", "image/jpg"] }, + "png": { "ext": ".png", "mimes": ["image/png"] }, + "gif": { "ext": ".gif", "mimes": ["image/gif"] }, + "webp": { "ext": ".webp", "mimes": ["image/webp"] }, + "avif": { "ext": ".avif", "mimes": ["image/avif"] }, + "jxl": { "ext": ".jxl", "mimes": ["image/jxl"] }, + "bmp": { "ext": ".bmp", "mimes": ["image/bmp"] }, + "heic": { "ext": ".heic", "mimes": 
["image/heic"] }, + "heif": { "ext": ".heif", "mimes": ["image/heif"] }, + "ico": { "ext": ".ico", "mimes": ["image/x-icon", "image/vnd.microsoft.icon"] }, + "qoi": { "ext": ".qoi", "mimes": ["image/qoi"] }, + "tiff": { "ext": ".tiff", "mimes": ["image/tiff", "image/x-tiff"] }, + "svg": { "ext": ".svg", "mimes": ["image/svg+xml"] } + }, + "image_sequence": { + "apng": { "ext": ".apng", "mimes": ["image/apng"], "sequence": True }, + "avifs": { "ext": ".avifs", "mimes": ["image/avif-sequence"], "sequence": True }, + "heics": { "ext": ".heics", "mimes": ["image/heic-sequence"], "sequence": True }, + "heifs": { "ext": ".heifs", "mimes": ["image/heif-sequence"], "sequence": True } + }, + "video": { + "mp4": { "ext": ".mp4", "mimes": ["video/mp4", "audio/mp4"] }, + "webm": { "ext": ".webm", "mimes": ["video/webm", "audio/webm"] }, + "mov": { "ext": ".mov", "mimes": ["video/quicktime"] }, + "ogv": { "ext": ".ogv", "mimes": ["video/ogg"] }, + "mpeg": { "ext": ".mpeg", "mimes": ["video/mpeg"] }, + "avi": { "ext": ".avi", "mimes": ["video/x-msvideo", "video/avi"] }, + "flv": { "ext": ".flv", "mimes": ["video/x-flv"] }, + "mkv": { "ext": ".mkv", "mimes": ["video/x-matroska", "application/x-matroska"], "audio_only_ext": ".mka" }, + "wmv": { "ext": ".wmv", "mimes": ["video/x-ms-wmv"] }, + "rv": { "ext": ".rv", "mimes": ["video/vnd.rn-realvideo"] } + }, + "audio": { + "mp3": { "ext": ".mp3", "mimes": ["audio/mpeg", "audio/mp3"] }, + "m4a": { "ext": ".m4a", "mimes": ["audio/mp4", "audio/x-m4a"] }, + "ogg": { "ext": ".ogg", "mimes": ["audio/ogg"] }, + "flac": { "ext": ".flac", "mimes": ["audio/flac"] }, + "wav": { "ext": ".wav", "mimes": ["audio/wav", "audio/x-wav", "audio/vnd.wave"] }, + "wma": { "ext": ".wma", "mimes": ["audio/x-ms-wma"] }, + "tta": { "ext": ".tta", "mimes": ["audio/x-tta"] }, + "wv": { "ext": ".wv", "mimes": ["audio/x-wavpack", "audio/wavpack"] }, + "mka": { "ext": ".mka", "mimes": ["audio/x-matroska", "video/x-matroska"] } + }, + "document": { + "pdf": { "ext": ".pdf", "mimes": ["application/pdf"] }, + "epub": { "ext": ".epub", "mimes": ["application/epub+zip"] }, + "djvu": { "ext": ".djvu", "mimes": ["application/vnd.djvu"] }, + "rtf": { "ext": ".rtf", "mimes": ["application/rtf"] }, + "docx": { "ext": ".docx", "mimes": ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"] }, + "xlsx": { "ext": ".xlsx", "mimes": ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"] }, + "pptx": { "ext": ".pptx", "mimes": ["application/vnd.openxmlformats-officedocument.presentationml.presentation"] }, + "doc": { "ext": ".doc", "mimes": ["application/msword"] }, + "xls": { "ext": ".xls", "mimes": ["application/vnd.ms-excel"] }, + "ppt": { "ext": ".ppt", "mimes": ["application/vnd.ms-powerpoint"] } + }, + "archive": { + "zip": { "ext": ".zip", "mimes": ["application/zip"] }, + "7z": { "ext": ".7z", "mimes": ["application/x-7z-compressed"] }, + "rar": { "ext": ".rar", "mimes": ["application/x-rar-compressed", "application/vnd.rar"] }, + "gz": { "ext": ".gz", "mimes": ["application/gzip", "application/x-gzip"] }, + "tar": { "ext": ".tar", "mimes": ["application/x-tar"] }, + "cbz": { "ext": ".cbz", "mimes": ["application/zip"], "note": "zip archive of images; prefer extension-based detection for comics" } + }, + "project": { + "clip": { "ext": ".clip", "mimes": ["application/clip"] }, + "kra": { "ext": ".kra", "mimes": ["application/x-krita"] }, + "procreate": { "ext": ".procreate", "mimes": ["application/x-procreate"] }, + "psd": { "ext": ".psd", "mimes": 
["image/vnd.adobe.photoshop"] }, + "swf": { "ext": ".swf", "mimes": ["application/x-shockwave-flash"] } + }, + "other": { + "octet-stream": { "ext": "", "mimes": ["application/octet-stream"] }, + "json": { "ext": ".json", "mimes": ["application/json"] }, + "xml": { "ext": ".xml", "mimes": ["application/xml", "text/xml"] }, + "csv": { "ext": ".csv", "mimes": ["text/csv"] } + } +} diff --git a/helper/worker_manager.py b/helper/worker_manager.py new file mode 100644 index 0000000..18b987f --- /dev/null +++ b/helper/worker_manager.py @@ -0,0 +1,655 @@ +"""Worker task management with persistent database storage. + +Manages worker tasks for downloads, searches, imports, etc. with automatic +persistence to database and optional auto-refresh callbacks. +""" + +import logging +from pathlib import Path +from typing import Optional, Dict, Any, List, Callable +from datetime import datetime +from threading import Thread, Lock +import time + +from .local_library import LocalLibraryDB +from helper.logger import log + +logger = logging.getLogger(__name__) + + +class Worker: + """Represents a single worker task with state management.""" + + def __init__(self, worker_id: str, worker_type: str, title: str = "", + description: str = "", manager: Optional['WorkerManager'] = None): + """Initialize a worker. + + Args: + worker_id: Unique identifier for this worker + worker_type: Type of work (e.g., 'download', 'search', 'import') + title: Human-readable title + description: Detailed description + manager: Reference to parent WorkerManager for state updates + """ + self.id = worker_id + self.type = worker_type + self.title = title or worker_type + self.description = description + self.manager = manager + self.status = "running" + self.progress = "" + self.details = "" + self.error_message = "" + self.result = "pending" + self._stdout_buffer = [] + self._steps_buffer = [] + + def log_step(self, step_text: str) -> None: + """Log a step for this worker. + + Args: + step_text: Text describing the step + """ + try: + if self.manager: + self.manager.log_step(self.id, step_text) + else: + logger.info(f"[{self.id}] {step_text}") + except Exception as e: + logger.error(f"Error logging step for worker {self.id}: {e}") + + def append_stdout(self, text: str) -> None: + """Append text to stdout log. + + Args: + text: Text to append + """ + try: + if self.manager: + self.manager.append_worker_stdout(self.id, text) + else: + self._stdout_buffer.append(text) + except Exception as e: + logger.error(f"Error appending stdout for worker {self.id}: {e}") + + def get_stdout(self) -> str: + """Get all stdout for this worker. + + Returns: + Complete stdout text + """ + try: + if self.manager: + return self.manager.get_stdout(self.id) + else: + return "\n".join(self._stdout_buffer) + except Exception as e: + logger.error(f"Error getting stdout for worker {self.id}: {e}") + return "" + + def get_steps(self) -> str: + """Get all steps for this worker. + + Returns: + Complete steps text + """ + try: + if self.manager: + return self.manager.get_steps(self.id) + else: + return "\n".join(self._steps_buffer) + except Exception as e: + logger.error(f"Error getting steps for worker {self.id}: {e}") + return "" + + def update_progress(self, progress: str = "", details: str = "") -> None: + """Update worker progress. 
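+
+        Example (illustrative; assumes the worker was created through WorkerManager):
+
+            worker.update_progress(progress="50%", details="downloading page 5/10")
+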
+ + Args: + progress: Progress string (e.g., "50%") + details: Additional details + """ + self.progress = progress + self.details = details + try: + if self.manager: + self.manager.update_worker(self.id, progress, details) + except Exception as e: + logger.error(f"Error updating worker {self.id}: {e}") + + def finish(self, result: str = "completed", message: str = "") -> None: + """Mark worker as finished. + + Args: + result: Result status ('completed', 'error', 'cancelled') + message: Result message/error details + """ + self.result = result + self.status = "finished" + self.error_message = message + try: + if self.manager: + # Flush and disable logging handler before marking finished + self.manager.disable_logging_for_worker(self.id) + # Then mark as finished in database + self.manager.finish_worker(self.id, result, message) + except Exception as e: + logger.error(f"Error finishing worker {self.id}: {e}") + + +class WorkerLoggingHandler(logging.StreamHandler): + """Custom logging handler that captures logs for a worker.""" + + def __init__(self, worker_id: str, db: LocalLibraryDB, + manager: Optional['WorkerManager'] = None, + buffer_size: int = 50): + """Initialize the handler. + + Args: + worker_id: ID of the worker to capture logs for + db: Reference to LocalLibraryDB for storing logs + buffer_size: Number of logs to buffer before flushing to DB + """ + super().__init__() + self.worker_id = worker_id + self.db = db + self.manager = manager + self.buffer_size = buffer_size + self.buffer = [] + self._lock = Lock() + + # Set a format that includes timestamp and level + formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + self.setFormatter(formatter) + + def emit(self, record): + """Emit a log record.""" + try: + # Try to format the record normally + try: + msg = self.format(record) + except (TypeError, ValueError): + # If formatting fails (e.g., %d format with non-int arg), + # build message manually without calling getMessage() + try: + # Try to format with args if possible + if record.args: + msg = record.msg % record.args + else: + msg = record.msg + except (TypeError, ValueError): + # If that fails too, just use the raw message string + msg = str(record.msg) + + # Add timestamp and level if not already in message + import time + timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(record.created)) + msg = f"{timestamp} - {record.name} - {record.levelname} - {msg}" + + with self._lock: + self.buffer.append(msg) + + # Flush to DB when buffer reaches size + if len(self.buffer) >= self.buffer_size: + self._flush() + except Exception: + self.handleError(record) + + def _flush(self): + """Flush buffered logs to database.""" + if self.buffer: + log_text = '\n'.join(self.buffer) + try: + if self.manager: + self.manager.append_worker_stdout(self.worker_id, log_text, channel='log') + else: + self.db.append_worker_stdout(self.worker_id, log_text, channel='log') + except Exception as e: + # If we can't write to DB, at least log it + log(f"Error flushing worker logs: {e}") + self.buffer = [] + + def flush(self): + """Flush any buffered records.""" + with self._lock: + self._flush() + super().flush() + + def close(self): + """Close the handler.""" + self.flush() + super().close() + + +class WorkerManager: + """Manages persistent worker tasks with auto-refresh capability.""" + + def __init__(self, library_root: Path, auto_refresh_interval: float = 2.0): + """Initialize the worker manager. 
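+
+        Example (minimal sketch; the library path and worker id are hypothetical):
+
+            manager = WorkerManager(Path("/tmp/library"), auto_refresh_interval=2.0)
+            manager.track_worker("dl_001", "download", title="Fetch video")
+            manager.update_worker("dl_001", progress=25.0, current_step="downloading")
+            manager.finish_worker("dl_001", result="completed")
+            manager.close()
+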
+ + Args: + library_root: Root directory for the local library database + auto_refresh_interval: Seconds between auto-refresh checks (0 = disabled) + """ + self.library_root = Path(library_root) + self.db = LocalLibraryDB(library_root) + self.auto_refresh_interval = auto_refresh_interval + self.refresh_callbacks: List[Callable] = [] + self.refresh_thread: Optional[Thread] = None + self._stop_refresh = False + self._lock = Lock() + self.worker_handlers: Dict[str, WorkerLoggingHandler] = {} # Track active handlers + self._worker_last_step: Dict[str, str] = {} + + def add_refresh_callback(self, callback: Callable[[List[Dict[str, Any]]], None]) -> None: + """Register a callback to be called on worker updates. + + Args: + callback: Function that receives list of active workers + """ + with self._lock: + self.refresh_callbacks.append(callback) + + def expire_running_workers( + self, + older_than_seconds: int = 300, + worker_id_prefix: Optional[str] = None, + reason: Optional[str] = None, + status: str = "error", + ) -> int: + """Mark stale running workers as finished. + + Args: + older_than_seconds: Idle threshold before expiring. + worker_id_prefix: Optional wildcard filter (e.g., 'cli_%'). + reason: Error message if none already exists. + status: New status to apply. + + Returns: + Count of workers updated. + """ + try: + return self.db.expire_running_workers( + older_than_seconds=older_than_seconds, + status=status, + reason=reason, + worker_id_prefix=worker_id_prefix, + ) + except Exception as exc: + logger.error(f"Failed to expire stale workers: {exc}", exc_info=True) + return 0 + + def remove_refresh_callback(self, callback: Callable) -> None: + """Remove a refresh callback. + + Args: + callback: The callback function to remove + """ + with self._lock: + if callback in self.refresh_callbacks: + self.refresh_callbacks.remove(callback) + + def enable_logging_for_worker(self, worker_id: str) -> Optional[WorkerLoggingHandler]: + """Enable logging capture for a worker. + + Creates a logging handler that captures all logs for this worker. + + Args: + worker_id: ID of the worker to capture logs for + + Returns: + The logging handler that was created, or None if there was an error + """ + try: + handler = WorkerLoggingHandler(worker_id, self.db, manager=self) + with self._lock: + self.worker_handlers[worker_id] = handler + + # Add the handler to the root logger so it captures all logs + root_logger = logging.getLogger() + root_logger.addHandler(handler) + root_logger.setLevel(logging.DEBUG) # Capture all levels + + logger.debug(f"[WorkerManager] Enabled logging for worker: {worker_id}") + return handler + except Exception as e: + logger.error(f"[WorkerManager] Error enabling logging for worker {worker_id}: {e}", exc_info=True) + return None + + def disable_logging_for_worker(self, worker_id: str) -> None: + """Disable logging capture for a worker and flush any pending logs. 
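+
+        Example (illustrative pairing with enable_logging_for_worker; the worker
+        id is hypothetical):
+
+            handler = manager.enable_logging_for_worker("dl_001")
+            try:
+                ...  # run the task; log records are captured for the worker
+            finally:
+                manager.disable_logging_for_worker("dl_001")
+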
+ + Args: + worker_id: ID of the worker to stop capturing logs for + """ + try: + with self._lock: + handler = self.worker_handlers.pop(worker_id, None) + + if handler: + # Flush and close the handler + handler.flush() + handler.close() + + # Remove from root logger + root_logger = logging.getLogger() + root_logger.removeHandler(handler) + + logger.debug(f"[WorkerManager] Disabled logging for worker: {worker_id}") + except Exception as e: + logger.error(f"[WorkerManager] Error disabling logging for worker {worker_id}: {e}", exc_info=True) + + def track_worker(self, worker_id: str, worker_type: str, title: str = "", + description: str = "", total_steps: int = 0, + pipe: Optional[str] = None) -> bool: + """Start tracking a new worker. + + Args: + worker_id: Unique identifier for the worker + worker_type: Type of worker (e.g., 'download', 'search', 'import') + title: Worker title/name + description: Worker description + total_steps: Total number of steps for progress tracking + pipe: Text of the originating pipe/prompt, if any + + Returns: + True if worker was inserted successfully + """ + try: + result = self.db.insert_worker(worker_id, worker_type, title, description, total_steps, pipe=pipe) + if result > 0: + logger.debug(f"[WorkerManager] Tracking worker: {worker_id} ({worker_type})") + self._start_refresh_if_needed() + return True + return False + except Exception as e: + logger.error(f"[WorkerManager] Error tracking worker: {e}", exc_info=True) + return False + + def update_worker(self, worker_id: str, progress: float = 0.0, current_step: str = "", + details: str = "", error: str = "") -> bool: + """Update worker progress and status. + + Args: + worker_id: Unique identifier for the worker + progress: Progress percentage (0-100) + current_step: Current step description + details: Additional details + error: Error message if any + + Returns: + True if update was successful + """ + try: + kwargs = {} + if progress > 0: + kwargs['progress'] = progress + if current_step: + kwargs['current_step'] = current_step + if details: + kwargs['description'] = details + if error: + kwargs['error_message'] = error + + if kwargs: + kwargs['last_updated'] = datetime.now().isoformat() + if 'current_step' in kwargs and kwargs['current_step']: + self._worker_last_step[worker_id] = str(kwargs['current_step']) + return self.db.update_worker(worker_id, **kwargs) + return True + except Exception as e: + logger.error(f"[WorkerManager] Error updating worker {worker_id}: {e}", exc_info=True) + return False + + def finish_worker(self, worker_id: str, result: str = "completed", + error_msg: str = "", result_data: str = "") -> bool: + """Mark a worker as finished. + + Args: + worker_id: Unique identifier for the worker + result: Result status ('completed', 'error', 'cancelled') + error_msg: Error message if any + result_data: Result data as JSON string + + Returns: + True if update was successful + """ + try: + kwargs = { + 'status': result, + 'completed_at': datetime.now().isoformat() + } + if error_msg: + kwargs['error_message'] = error_msg + if result_data: + kwargs['result_data'] = result_data + + success = self.db.update_worker(worker_id, **kwargs) + logger.info(f"[WorkerManager] Worker finished: {worker_id} ({result})") + self._worker_last_step.pop(worker_id, None) + return success + except Exception as e: + logger.error(f"[WorkerManager] Error finishing worker {worker_id}: {e}", exc_info=True) + return False + + def get_active_workers(self) -> List[Dict[str, Any]]: + """Get all active (running) workers. 
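+
+        Example (illustrative; field names in each dict follow the LocalLibraryDB
+        workers schema, so only the count is relied on here):
+
+            active = manager.get_active_workers()
+            print(f"{len(active)} worker(s) still running")
+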
+ + Returns: + List of active worker dictionaries + """ + try: + return self.db.get_active_workers() + except Exception as e: + logger.error(f"[WorkerManager] Error getting active workers: {e}", exc_info=True) + return [] + + def get_finished_workers(self, limit: int = 100) -> List[Dict[str, Any]]: + """Get all finished workers (completed, errored, or cancelled). + + Args: + limit: Maximum number of workers to retrieve + + Returns: + List of finished worker dictionaries + """ + try: + all_workers = self.db.get_all_workers(limit=limit) + # Filter to only finished workers + finished = [w for w in all_workers if w.get('status') in ['completed', 'error', 'cancelled']] + return finished + except Exception as e: + logger.error(f"[WorkerManager] Error getting finished workers: {e}", exc_info=True) + return [] + + def get_worker(self, worker_id: str) -> Optional[Dict[str, Any]]: + """Get a specific worker's data. + + Args: + worker_id: Unique identifier for the worker + + Returns: + Worker data or None if not found + """ + try: + return self.db.get_worker(worker_id) + except Exception as e: + logger.error(f"[WorkerManager] Error getting worker {worker_id}: {e}", exc_info=True) + return None + + def get_worker_events(self, worker_id: str, limit: int = 500) -> List[Dict[str, Any]]: + """Fetch recorded worker timeline events.""" + return self.db.get_worker_events(worker_id, limit) + + def log_step(self, worker_id: str, step_text: str) -> bool: + """Log a step to a worker's step history. + + Args: + worker_id: Unique identifier for the worker + step_text: Step description to log + + Returns: + True if successful + """ + try: + success = self.db.append_worker_steps(worker_id, step_text) + if success: + self._worker_last_step[worker_id] = step_text + return success + except Exception as e: + logger.error(f"[WorkerManager] Error logging step for worker {worker_id}: {e}", exc_info=True) + return False + + def _get_last_step(self, worker_id: str) -> Optional[str]: + """Return the most recent step description for a worker.""" + return self._worker_last_step.get(worker_id) + + def get_steps(self, worker_id: str) -> str: + """Get step logs for a worker. 
+ + Args: + worker_id: Unique identifier for the worker + + Returns: + Steps text or empty string if not found + """ + try: + return self.db.get_worker_steps(worker_id) + except Exception as e: + logger.error(f"[WorkerManager] Error getting steps for worker {worker_id}: {e}", exc_info=True) + return '' + + def start_auto_refresh(self) -> None: + """Start the auto-refresh thread for periodic worker updates.""" + if self.auto_refresh_interval <= 0: + logger.debug("[WorkerManager] Auto-refresh disabled (interval <= 0)") + return + + if self.refresh_thread and self.refresh_thread.is_alive(): + logger.debug("[WorkerManager] Auto-refresh already running") + return + + logger.info(f"[WorkerManager] Starting auto-refresh with {self.auto_refresh_interval}s interval") + self._stop_refresh = False + self.refresh_thread = Thread(target=self._auto_refresh_loop, daemon=True) + self.refresh_thread.start() + + def stop_auto_refresh(self) -> None: + """Stop the auto-refresh thread.""" + logger.info("[WorkerManager] Stopping auto-refresh") + self._stop_refresh = True + if self.refresh_thread: + self.refresh_thread.join(timeout=5) + self.refresh_thread = None + + def _start_refresh_if_needed(self) -> None: + """Start auto-refresh if we have active workers and callbacks.""" + active = self.get_active_workers() + if active and self.refresh_callbacks and not self._stop_refresh: + self.start_auto_refresh() + + def _auto_refresh_loop(self) -> None: + """Main auto-refresh loop that periodically queries and notifies.""" + try: + while not self._stop_refresh: + time.sleep(self.auto_refresh_interval) + + # Check if there are active workers + active = self.get_active_workers() + + if not active: + # No more active workers, stop refreshing + logger.debug("[WorkerManager] No active workers, stopping auto-refresh") + break + + # Call all registered callbacks with the active workers + with self._lock: + for callback in self.refresh_callbacks: + try: + callback(active) + except Exception as e: + logger.error(f"[WorkerManager] Error in refresh callback: {e}", exc_info=True) + + except Exception as e: + logger.error(f"[WorkerManager] Error in auto-refresh loop: {e}", exc_info=True) + finally: + logger.debug("[WorkerManager] Auto-refresh loop ended") + + def cleanup_old_workers(self, days: int = 7) -> int: + """Clean up completed/errored workers older than specified days. + + Args: + days: Delete workers completed more than this many days ago + + Returns: + Number of workers deleted + """ + try: + count = self.db.cleanup_old_workers(days) + if count > 0: + logger.info(f"[WorkerManager] Cleaned up {count} old workers") + return count + except Exception as e: + logger.error(f"[WorkerManager] Error cleaning up old workers: {e}", exc_info=True) + return 0 + + def append_stdout(self, worker_id: str, text: str, channel: str = "stdout") -> bool: + """Append text to a worker's stdout log. + + Args: + worker_id: Unique identifier for the worker + text: Text to append + channel: Logical channel (stdout, stderr, log, etc.) + + Returns: + True if append was successful + """ + try: + step_label = self._get_last_step(worker_id) + return self.db.append_worker_stdout(worker_id, text, step=step_label, channel=channel) + except Exception as e: + logger.error(f"[WorkerManager] Error appending stdout: {e}", exc_info=True) + return False + + def get_stdout(self, worker_id: str) -> str: + """Get stdout logs for a worker. 
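+
+        Example (illustrative round trip with append_stdout; the worker id is
+        hypothetical):
+
+            manager.append_stdout("dl_001", "fetched 10 items", channel="log")
+            text = manager.get_stdout("dl_001")
+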
+ + Args: + worker_id: Unique identifier for the worker + + Returns: + Worker's stdout or empty string + """ + try: + return self.db.get_worker_stdout(worker_id) + except Exception as e: + logger.error(f"[WorkerManager] Error getting stdout: {e}", exc_info=True) + return "" + + def append_worker_stdout(self, worker_id: str, text: str, channel: str = "stdout") -> bool: + """Compatibility wrapper for append_stdout.""" + return self.append_stdout(worker_id, text, channel=channel) + + def clear_stdout(self, worker_id: str) -> bool: + """Clear stdout logs for a worker. + + Args: + worker_id: Unique identifier for the worker + + Returns: + True if clear was successful + """ + try: + return self.db.clear_worker_stdout(worker_id) + except Exception as e: + logger.error(f"[WorkerManager] Error clearing stdout: {e}", exc_info=True) + return False + + def close(self) -> None: + """Close the worker manager and database connection.""" + self.stop_auto_refresh() + self.db.close() + logger.info("[WorkerManager] Closed") diff --git a/hydrus_health_check.py b/hydrus_health_check.py new file mode 100644 index 0000000..c8c2ddd --- /dev/null +++ b/hydrus_health_check.py @@ -0,0 +1,425 @@ +"""Hydrus API health check and initialization. + +Provides startup health checks for Hydrus API availability and gracefully +disables Hydrus features if the API is unavailable. +""" + +import logging +import sys + +from helper.logger import log +from typing import Tuple, Optional, Dict, Any +from pathlib import Path + +logger = logging.getLogger(__name__) + +# Global state for Hydrus availability +_HYDRUS_AVAILABLE: Optional[bool] = None +_HYDRUS_UNAVAILABLE_REASON: Optional[str] = None +_HYDRUS_CHECK_COMPLETE = False + +# Global state for Debrid availability +_DEBRID_AVAILABLE: Optional[bool] = None +_DEBRID_UNAVAILABLE_REASON: Optional[str] = None +_DEBRID_CHECK_COMPLETE = False + +# Global state for MPV availability +_MPV_AVAILABLE: Optional[bool] = None +_MPV_UNAVAILABLE_REASON: Optional[str] = None +_MPV_CHECK_COMPLETE = False + + +def check_hydrus_availability(config: Dict[str, Any]) -> Tuple[bool, Optional[str]]: + """Check if Hydrus API is available by pinging it. + + Args: + config: Application configuration dictionary + + Returns: + Tuple of (is_available: bool, reason: Optional[str]) + - (True, None) if Hydrus is available + - (False, reason) if Hydrus is unavailable with reason + """ + try: + from helper.hydrus import is_available as _is_hydrus_available + + logger.info("[Hydrus Health Check] Pinging Hydrus API...") + is_available, reason = _is_hydrus_available(config, use_cache=False) + + if is_available: + logger.info("[Hydrus Health Check] ✅ Hydrus API is AVAILABLE") + return True, None + else: + reason_str = f": {reason}" if reason else "" + logger.warning(f"[Hydrus Health Check] ❌ Hydrus API is UNAVAILABLE{reason_str}") + return False, reason + + except Exception as e: + error_msg = str(e) + logger.error(f"[Hydrus Health Check] ❌ Error checking Hydrus availability: {error_msg}") + return False, error_msg + + +def initialize_hydrus_health_check(config: Dict[str, Any]) -> None: + """Initialize Hydrus health check at startup. + + This should be called once at application startup to determine if Hydrus + features should be enabled or disabled. 
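+
+    Example (minimal startup sketch; assumes config.load_config() returns the
+    application configuration dict):
+
+        from config import load_config
+
+        config = load_config()
+        initialize_hydrus_health_check(config)
+        if is_hydrus_available():
+            ...  # enable Hydrus-backed features
+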
+ + Args: + config: Application configuration dictionary + """ + global _HYDRUS_AVAILABLE, _HYDRUS_UNAVAILABLE_REASON, _HYDRUS_CHECK_COMPLETE + + logger.info("[Startup] Starting Hydrus health check...") + + try: + is_available, reason = check_hydrus_availability(config) + _HYDRUS_AVAILABLE = is_available + _HYDRUS_UNAVAILABLE_REASON = reason + _HYDRUS_CHECK_COMPLETE = True + + if is_available: + log("✅ Hydrus: ENABLED - All Hydrus features available", file=sys.stderr) + else: + log(f"⚠️ Hydrus: DISABLED - {reason or 'Connection failed'}", file=sys.stderr) + log("- Export functionality disabled", file=sys.stderr) + log("- Hydrus library features disabled", file=sys.stderr) + log("- Hydrus tag operations disabled", file=sys.stderr) + log("→ Local storage and All-Debrid features still available", file=sys.stderr) + + except Exception as e: + logger.error(f"[Startup] Failed to initialize Hydrus health check: {e}", exc_info=True) + _HYDRUS_AVAILABLE = False + _HYDRUS_UNAVAILABLE_REASON = str(e) + _HYDRUS_CHECK_COMPLETE = True + log(f"⚠️ Hydrus: DISABLED - Error during health check: {e}", file=sys.stderr) + + +def check_debrid_availability(config: Dict[str, Any]) -> Tuple[bool, Optional[str]]: + """Check if Debrid API is available. + + Args: + config: Application configuration dictionary + + Returns: + Tuple of (is_available: bool, reason: Optional[str]) + - (True, None) if Debrid API is available + - (False, reason) if Debrid API is unavailable with reason + """ + try: + from helper.http_client import HTTPClient + + logger.info("[Debrid Health Check] Pinging Debrid API at https://api.alldebrid.com/v4/ping...") + + try: + # Use the public ping endpoint to check API availability + # This endpoint doesn't require authentication + with HTTPClient(timeout=10.0, verify_ssl=True) as client: + response = client.get('https://api.alldebrid.com/v4/ping') + logger.debug(f"[Debrid Health Check] Response status: {response.status_code}") + + # Read response text first (handles gzip decompression) + try: + response_text = response.text + logger.debug(f"[Debrid Health Check] Response text: {response_text}") + except Exception as e: + logger.error(f"[Debrid Health Check] ❌ Failed to read response text: {e}") + return False, f"Failed to read response: {e}" + + # Parse JSON + try: + result = response.json() + logger.debug(f"[Debrid Health Check] Response JSON: {result}") + except Exception as e: + logger.error(f"[Debrid Health Check] ❌ Failed to parse JSON: {e}") + logger.error(f"[Debrid Health Check] Response was: {response_text}") + return False, f"Failed to parse response: {e}" + + # Validate response format + if result.get('status') == 'success' and result.get('data', {}).get('ping') == 'pong': + logger.info("[Debrid Health Check] ✅ Debrid API is AVAILABLE") + return True, None + else: + logger.warning(f"[Debrid Health Check] ❌ Debrid API returned unexpected response: {result}") + return False, "Invalid API response" + except Exception as e: + error_msg = str(e) + logger.warning(f"[Debrid Health Check] ❌ Debrid API error: {error_msg}") + import traceback + logger.debug(f"[Debrid Health Check] Traceback: {traceback.format_exc()}") + return False, error_msg + + except Exception as e: + error_msg = str(e) + logger.error(f"[Debrid Health Check] ❌ Error checking Debrid availability: {error_msg}") + return False, error_msg + + +def initialize_debrid_health_check(config: Dict[str, Any]) -> None: + """Initialize Debrid health check at startup. 
+ + This should be called once at application startup to determine if Debrid + features should be enabled or disabled. + + Args: + config: Application configuration dictionary + """ + global _DEBRID_AVAILABLE, _DEBRID_UNAVAILABLE_REASON, _DEBRID_CHECK_COMPLETE + + logger.info("[Startup] Starting Debrid health check...") + + try: + is_available, reason = check_debrid_availability(config) + _DEBRID_AVAILABLE = is_available + _DEBRID_UNAVAILABLE_REASON = reason + _DEBRID_CHECK_COMPLETE = True + + if is_available: + log("✅ Debrid: ENABLED - All Debrid features available", file=sys.stderr) + logger.info("[Startup] Debrid health check PASSED") + else: + log(f"⚠️ Debrid: DISABLED - {reason or 'Connection failed'}", file=sys.stderr) + log("- Debrid export disabled", file=sys.stderr) + log("- Debrid library features disabled", file=sys.stderr) + log("→ Local storage and Hydrus features still available", file=sys.stderr) + logger.warning(f"[Startup] Debrid health check FAILED: {reason}") + + except Exception as e: + logger.error(f"[Startup] Failed to initialize Debrid health check: {e}", exc_info=True) + _DEBRID_AVAILABLE = False + _DEBRID_UNAVAILABLE_REASON = str(e) + _DEBRID_CHECK_COMPLETE = True + log(f"⚠️ Debrid: DISABLED - Error during health check: {e}", file=sys.stderr) + + +def check_mpv_availability() -> Tuple[bool, Optional[str]]: + """Check if MPV is available (installed and runnable). + + Returns: + Tuple of (is_available: bool, reason: Optional[str]) + """ + global _MPV_AVAILABLE, _MPV_UNAVAILABLE_REASON, _MPV_CHECK_COMPLETE + + if _MPV_CHECK_COMPLETE and _MPV_AVAILABLE is not None: + return _MPV_AVAILABLE, _MPV_UNAVAILABLE_REASON + + import shutil + import subprocess + + logger.info("[MPV Health Check] Checking for MPV executable...") + + mpv_path = shutil.which("mpv") + if not mpv_path: + _MPV_AVAILABLE = False + _MPV_UNAVAILABLE_REASON = "Executable 'mpv' not found in PATH" + _MPV_CHECK_COMPLETE = True + logger.warning(f"[MPV Health Check] ❌ MPV is UNAVAILABLE: {_MPV_UNAVAILABLE_REASON}") + return False, _MPV_UNAVAILABLE_REASON + + # Try to get version to confirm it works + try: + result = subprocess.run( + [mpv_path, "--version"], + capture_output=True, + text=True, + timeout=2 + ) + if result.returncode == 0: + version_line = result.stdout.split('\n')[0] + _MPV_AVAILABLE = True + _MPV_UNAVAILABLE_REASON = None + _MPV_CHECK_COMPLETE = True + logger.info(f"[MPV Health Check] ✅ MPV is AVAILABLE ({version_line})") + return True, None + else: + _MPV_AVAILABLE = False + _MPV_UNAVAILABLE_REASON = f"MPV returned non-zero exit code: {result.returncode}" + _MPV_CHECK_COMPLETE = True + logger.warning(f"[MPV Health Check] ❌ MPV is UNAVAILABLE: {_MPV_UNAVAILABLE_REASON}") + return False, _MPV_UNAVAILABLE_REASON + except Exception as e: + _MPV_AVAILABLE = False + _MPV_UNAVAILABLE_REASON = f"Error running MPV: {e}" + _MPV_CHECK_COMPLETE = True + logger.warning(f"[MPV Health Check] ❌ MPV is UNAVAILABLE: {_MPV_UNAVAILABLE_REASON}") + return False, _MPV_UNAVAILABLE_REASON + + +def initialize_mpv_health_check() -> None: + """Initialize MPV health check at startup. + + This should be called once at application startup to determine if MPV + features should be enabled or disabled. 
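+
+    Example (illustrative; no configuration is needed because the check only
+    looks up `mpv` on PATH and runs `mpv --version`):
+
+        initialize_mpv_health_check()
+        if not is_mpv_available():
+            print(get_mpv_unavailable_reason())
+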
+ """ + global _MPV_AVAILABLE, _MPV_UNAVAILABLE_REASON, _MPV_CHECK_COMPLETE + + logger.info("[Startup] Starting MPV health check...") + + try: + is_available, reason = check_mpv_availability() + _MPV_AVAILABLE = is_available + _MPV_UNAVAILABLE_REASON = reason + _MPV_CHECK_COMPLETE = True + + if is_available: + log("✅ MPV: ENABLED - All MPV features available", file=sys.stderr) + logger.info("[Startup] MPV health check PASSED") + else: + log(f"⚠️ MPV: DISABLED - {reason or 'Connection failed'}", file=sys.stderr) + log("→ Hydrus features still available", file=sys.stderr) + logger.warning(f"[Startup] MPV health check FAILED: {reason}") + + except Exception as e: + logger.error(f"[Startup] Failed to initialize MPV health check: {e}", exc_info=True) + _MPV_AVAILABLE = False + _MPV_UNAVAILABLE_REASON = str(e) + _MPV_CHECK_COMPLETE = True + log(f"⚠️ MPV: DISABLED - Error during health check: {e}", file=sys.stderr) + + +def is_hydrus_available() -> bool: + """Check if Hydrus is available (from cached health check). + + Returns: + True if Hydrus API is available, False otherwise + """ + return _HYDRUS_AVAILABLE is True + + +def get_hydrus_unavailable_reason() -> Optional[str]: + """Get the reason why Hydrus is unavailable. + + Returns: + String explaining why Hydrus is unavailable, or None if available + """ + return _HYDRUS_UNAVAILABLE_REASON if not is_hydrus_available() else None + + +def is_hydrus_check_complete() -> bool: + """Check if the Hydrus health check has been completed. + + Returns: + True if health check has run, False if still pending + """ + return _HYDRUS_CHECK_COMPLETE + + +def disable_hydrus_features() -> None: + """Manually disable all Hydrus features (for testing/fallback). + + This can be called if Hydrus connectivity is lost after startup. + """ + global _HYDRUS_AVAILABLE, _HYDRUS_UNAVAILABLE_REASON + _HYDRUS_AVAILABLE = False + _HYDRUS_UNAVAILABLE_REASON = "Manually disabled or lost connection" + logger.warning("[Hydrus] Features manually disabled") + + +def enable_hydrus_features() -> None: + """Manually enable Hydrus features (for testing/fallback). + + This can be called if Hydrus connectivity is restored after startup. + """ + global _HYDRUS_AVAILABLE, _HYDRUS_UNAVAILABLE_REASON + _HYDRUS_AVAILABLE = True + _HYDRUS_UNAVAILABLE_REASON = None + logger.info("[Hydrus] Features manually enabled") + + +def is_debrid_available() -> bool: + """Check if Debrid is available (from cached health check). + + Returns: + True if Debrid API is available, False otherwise + """ + return _DEBRID_AVAILABLE is True + + +def get_debrid_unavailable_reason() -> Optional[str]: + """Get the reason why Debrid is unavailable. + + Returns: + String explaining why Debrid is unavailable, or None if available + """ + return _DEBRID_UNAVAILABLE_REASON if not is_debrid_available() else None + + +def is_debrid_check_complete() -> bool: + """Check if the Debrid health check has been completed. + + Returns: + True if health check has run, False if still pending + """ + return _DEBRID_CHECK_COMPLETE + + +def disable_debrid_features() -> None: + """Manually disable all Debrid features (for testing/fallback). + + This can be called if Debrid connectivity is lost after startup. + """ + global _DEBRID_AVAILABLE, _DEBRID_UNAVAILABLE_REASON + _DEBRID_AVAILABLE = False + _DEBRID_UNAVAILABLE_REASON = "Manually disabled or lost connection" + logger.warning("[Debrid] Features manually disabled") + + +def enable_debrid_features() -> None: + """Manually enable Debrid features (for testing/fallback). 
+ + This can be called if Debrid connectivity is restored after startup. + """ + global _DEBRID_AVAILABLE, _DEBRID_UNAVAILABLE_REASON + _DEBRID_AVAILABLE = True + _DEBRID_UNAVAILABLE_REASON = None + logger.info("[Debrid] Features manually enabled") + + +def is_mpv_available() -> bool: + """Check if MPV is available (from cached health check). + + Returns: + True if MPV is available, False otherwise + """ + return _MPV_AVAILABLE is True + + +def get_mpv_unavailable_reason() -> Optional[str]: + """Get the reason why MPV is unavailable. + + Returns: + String explaining why MPV is unavailable, or None if available + """ + return _MPV_UNAVAILABLE_REASON if not is_mpv_available() else None + + +def is_mpv_check_complete() -> bool: + """Check if the MPV health check has been completed. + + Returns: + True if health check has run, False if still pending + """ + return _MPV_CHECK_COMPLETE + + +def disable_mpv_features() -> None: + """Manually disable all MPV features (for testing/fallback). + + This can be called if MPV connectivity is lost after startup. + """ + global _MPV_AVAILABLE, _MPV_UNAVAILABLE_REASON + _MPV_AVAILABLE = False + _MPV_UNAVAILABLE_REASON = "Manually disabled or lost connection" + logger.warning("[MPV] Features manually disabled") + + +def enable_mpv_features() -> None: + """Manually enable MPV features (for testing/fallback). + + This can be called if MPV connectivity is restored after startup. + """ + global _MPV_AVAILABLE, _MPV_UNAVAILABLE_REASON + _MPV_AVAILABLE = True + _MPV_UNAVAILABLE_REASON = None + logger.info("[MPV] Features manually enabled") diff --git a/medeia_entry.py b/medeia_entry.py new file mode 100644 index 0000000..7e9df76 --- /dev/null +++ b/medeia_entry.py @@ -0,0 +1,13 @@ +"""Entry point wrapper for Medeia-Macina CLI.""" +import sys +from pathlib import Path + +# Add the current directory to sys.path so we can import CLI +root_dir = Path(__file__).parent +if str(root_dir) not in sys.path: + sys.path.insert(0, str(root_dir)) + +from CLI import main + +if __name__ == "__main__": + main() diff --git a/medeia_macina/__init__.py b/medeia_macina/__init__.py new file mode 100644 index 0000000..db1dde0 --- /dev/null +++ b/medeia_macina/__init__.py @@ -0,0 +1,2 @@ +"""Medeia-Macina package - Media management system.""" +__version__ = "0.1.0" diff --git a/medeia_macina/cli_entry.py b/medeia_macina/cli_entry.py new file mode 100644 index 0000000..72a5d08 --- /dev/null +++ b/medeia_macina/cli_entry.py @@ -0,0 +1,13 @@ +"""Entry point wrapper for Medeia-Macina CLI.""" +import sys +from pathlib import Path + +# Add the parent directory to sys.path so we can import CLI +root_dir = Path(__file__).parent.parent +if str(root_dir) not in sys.path: + sys.path.insert(0, str(root_dir)) + +from CLI import main + +if __name__ == "__main__": + main() diff --git a/metadata.py b/metadata.py new file mode 100644 index 0000000..591d4d9 --- /dev/null +++ b/metadata.py @@ -0,0 +1,3199 @@ +import json +import re +import subprocess +import sys +import shutil +import sqlite3 +import requests +from helper.logger import log +from urllib.parse import urlsplit, urlunsplit, unquote +from collections import deque +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple +from models import PipeObject, FileRelationshipTracker, _get_file_hash +try: + import musicbrainzngs # type: ignore +except ImportError: # pragma: no cover + musicbrainzngs = None + +from imdbinfo.services import get_movie # type: ignore + +try: + import yt_dlp # type: ignore 
+except ImportError: # pragma: no cover + yt_dlp = None +try: + from config import load_config, resolve_output_dir # type: ignore +except ImportError: # pragma: no cover + load_config = None # type: ignore[assignment] + resolve_output_dir = None # type: ignore[assignment] + +try: + from helpers.hydrus import HydrusClient, HydrusRequestError, HydrusRequestSpec # type: ignore +except ImportError: # pragma: no cover + HydrusClient = None # type: ignore[assignment] + HydrusRequestError = RuntimeError # type: ignore[assignment] + HydrusRequestSpec = None # type: ignore[assignment] +if musicbrainzngs: # pragma: no branch + musicbrainzngs.set_useragent("DownlowScript", "0.1", "admin@example.com") + MusicBrainzRequestError = getattr(musicbrainzngs, "MusicBrainzRequestError", Exception) +else: # pragma: no cover + MusicBrainzRequestError = Exception + + +# Global relationship tracker for the current session +_CURRENT_RELATIONSHIP_TRACKER = FileRelationshipTracker() + + +def _generate_hydrus_url_variants(url: str) -> List[str]: + seen: Set[str] = set() + variants: List[str] = [] + + def push(candidate: Optional[str]) -> None: + if not candidate: + return + text = candidate.strip() + if not text or text in seen: + return + seen.add(text) + variants.append(text) + + push(url) + try: + parsed = urlsplit(url) + except Exception: + return variants + + if parsed.scheme in {"http", "https"}: + alternate_scheme = "https" if parsed.scheme == "http" else "http" + push(urlunsplit((alternate_scheme, parsed.netloc, parsed.path, parsed.query, parsed.fragment))) + + normalised_netloc = parsed.netloc.lower() + if normalised_netloc and normalised_netloc != parsed.netloc: + push(urlunsplit((parsed.scheme, normalised_netloc, parsed.path, parsed.query, parsed.fragment))) + + if parsed.path: + trimmed_path = parsed.path.rstrip('/') + if trimmed_path != parsed.path: + push(urlunsplit((parsed.scheme, parsed.netloc, trimmed_path, parsed.query, parsed.fragment))) + else: + push(urlunsplit((parsed.scheme, parsed.netloc, parsed.path + '/', parsed.query, parsed.fragment))) + unquoted_path = unquote(parsed.path) + if unquoted_path != parsed.path: + push(urlunsplit((parsed.scheme, parsed.netloc, unquoted_path, parsed.query, parsed.fragment))) + + if parsed.query or parsed.fragment: + push(urlunsplit((parsed.scheme, parsed.netloc, parsed.path, '', ''))) + if parsed.path: + unquoted_path = unquote(parsed.path) + push(urlunsplit((parsed.scheme, parsed.netloc, unquoted_path, '', ''))) + + return variants + + +def value_normalize(value: str) -> str: + """Normalize whitespace: collapse internal spaces, strip, remove newlines.""" + value = value.replace("\n", " ").replace("\r", " ") + value = re.sub(r"\s+", " ", value).strip() + return value + + +def import_pending_sidecars(db_root: Path, db: Any) -> None: + """Import any .tags or .metadata sidecars that exist in the filesystem. + + Scans for sidecar files (.tags, .metadata, .notes) and imports their contents + into the database as tags and metadata for the associated files. 
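+
+    Example layout (illustrative; the file names are hypothetical):
+
+        library/video.mp4
+        library/video.mp4.tags       # one tag per line (or comma-separated)
+        library/video.mp4.metadata   # JSON dict, as written by create_metadata_sidecar
+
+    .tags and .metadata sidecars are removed once their contents are imported.
+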
+ + Args: + db_root: Root directory to search for sidecar files + db: LocalLibraryDB instance to import metadata into + """ + try: + sidecar_patterns = ['**/*.tags', '**/*.metadata', '**/*.notes'] + + for pattern in sidecar_patterns: + for sidecar_path in db_root.glob(pattern): + if '.downlow' in sidecar_path.parts: + continue + + if sidecar_path.suffix == '.tags': + orig_path = sidecar_path.parent / sidecar_path.name[:-5] + elif sidecar_path.suffix == '.metadata': + orig_path = sidecar_path.parent / sidecar_path.name[:-9] + elif sidecar_path.suffix == '.notes': + orig_path = sidecar_path.parent / sidecar_path.name[:-6] + else: + continue + + if not orig_path.exists(): + continue + + try: + cursor = db.connection.cursor() if db.connection else None + if cursor: + cursor.execute('SELECT id FROM files WHERE file_path = ?', (str(orig_path),)) + result = cursor.fetchone() + file_id = result[0] if result else None + except Exception: + file_id = None + + if not file_id: + try: + cursor = db.connection.cursor() if db.connection else None + if cursor: + cursor.execute( + 'INSERT INTO files (file_path, indexed_at, updated_at) VALUES (?, datetime("now"), datetime("now"))', + (str(orig_path),) + ) + db.connection.commit() + file_id = cursor.lastrowid + except Exception: + continue + + if sidecar_path.suffix == '.tags' and file_id: + try: + with open(sidecar_path, 'r', encoding='utf-8') as f: + content = f.read().strip() + + if content: + if '\n' in content: + tags = [tag.strip() for tag in content.split('\n') if tag.strip()] + else: + tags = [tag.strip() for tag in content.split(',') if tag.strip()] + + cursor = db.connection.cursor() if db.connection else None + if cursor: + for tag in tags: + cursor.execute( + 'INSERT OR IGNORE INTO tags (file_id, tag, tag_type) VALUES (?, ?, ?)', + (file_id, tag, 'sidecar_import') + ) + db.connection.commit() + + sidecar_path.unlink() + except Exception: + pass + + elif sidecar_path.suffix == '.metadata' and file_id: + try: + with open(sidecar_path, 'r', encoding='utf-8') as f: + metadata_dict = json.load(f) + + cursor = db.connection.cursor() if db.connection else None + if cursor and metadata_dict: + cursor.execute( + 'INSERT OR REPLACE INTO metadata (file_id, hash, size, ext, duration, media_type, time_imported, time_modified) VALUES (?, ?, ?, ?, ?, ?, datetime("now"), datetime("now"))', + ( + file_id, + metadata_dict.get('hash'), + metadata_dict.get('size'), + metadata_dict.get('ext'), + metadata_dict.get('duration'), + metadata_dict.get('media_type'), + ) + ) + db.connection.commit() + + sidecar_path.unlink() + except Exception: + pass + + except Exception: + pass + + +def _extract_from_sequence(values: Sequence) -> Iterable[str]: + """Extract string values from a sequence of mixed types (dicts, strings, etc.).""" + seen = set() + for item in values: + candidate = None + if isinstance(item, dict): + candidate = item.get("name") or item.get("title") or item.get("value") or item.get("text") or item.get("id") or item.get("imdb_id") + else: + candidate = str(item) + if candidate: + normalized = value_normalize(str(candidate)) + if normalized and normalized not in seen: + seen.add(normalized) + yield normalized + + +def _add_tag(tags: List[str], namespace: str, value: Optional[str]) -> None: + """Add a single namespaced tag (e.g., 'artist:Beatles').""" + if not value: + return + value = value_normalize(str(value)) + if not value: + return + tags.append(f"{namespace}:{value}") + + +def _extend_tags(tags: List[str], namespace: str, values) -> None: + """Extend tags 
from a single value or sequence, with optional namespace.""" + if not values: + return + if isinstance(values, set): + values = list(values) + if isinstance(values, (list, tuple)): + for candidate in _extract_from_sequence(values): + _add_tag(tags, namespace, candidate) + else: + _add_tag(tags, namespace, values) + +def imdb_tag(imdb_id: str) -> Dict[str, object]: + movie = get_movie(imdb_id) + if movie is None: + raise ValueError(f"IMDb title not found: {imdb_id}") + if hasattr(movie, "model_dump"): + info = movie.model_dump() + elif hasattr(movie, "dict"): + info = movie.dict() + else: + info = {} + tags: List[str] = [] + canonical_id = getattr(movie, "imdb_id", None) or info.get("imdb_id") or imdb_id + if canonical_id: + canonical_id = str(canonical_id).strip().lower() + if not canonical_id.startswith("tt"): + canonical_id = f"tt{canonical_id}" + else: + canonical_id = imdb_id.lower() + if not canonical_id.startswith("tt"): + canonical_id = f"tt{canonical_id}" + _add_tag(tags, "imdb", canonical_id) + _add_tag(tags, "title", info.get("title") or getattr(movie, "title", None)) + _add_tag(tags, "year", info.get("year") or info.get("start_year") or getattr(movie, "year", None)) + _add_tag(tags, "rating", info.get("rating")) + runtime_value = None + if isinstance(info.get("runtime"), (str, int)): + runtime_value = info["runtime"] + elif isinstance(info.get("runtimes"), (list, tuple)) and info["runtimes"]: + runtime_value = info["runtimes"][0] + elif info.get("duration"): + runtime_value = info["duration"] + _add_tag(tags, "runtime", runtime_value) + kind = None + if hasattr(movie, "is_series") and movie.is_series(): + kind = "series" + elif hasattr(movie, "is_episode") and movie.is_episode(): + kind = "episode" + else: + kind = info.get("kind") or "movie" + _add_tag(tags, "kind", kind) + _extend_tags(tags, "genre", info.get("genres") or info.get("genre")) + _extend_tags(tags, "language", info.get("languages")) + _extend_tags(tags, "country", info.get("countries")) + creators = info.get("directors") or info.get("director") or info.get("producers") or info.get("writers") + if creators: + _extend_tags(tags, "creator", creators) + info_episode = getattr(movie, "info_episode", None) + series_title = None + season = info.get("season") or info.get("series_season") + episode = info.get("episode") or info.get("series_episode") + if info_episode: + if hasattr(info_episode, "model_dump"): + episode_meta = info_episode.model_dump() + elif hasattr(info_episode, "dict"): + episode_meta = info_episode.dict() + else: + episode_meta = getattr(info_episode, "__dict__", {}) or {} + season = season or episode_meta.get("season") or episode_meta.get("season_n") + episode = episode or episode_meta.get("episode") or episode_meta.get("episode_n") + series_title = episode_meta.get("series_title") + if not series_title: + series_title = getattr(getattr(movie, "series_info", None), "title", None) + if kind == "episode" and not season: + season = getattr(getattr(movie, "series_info", None), "season", None) + if season: + _add_tag(tags, "season", season) + if episode: + _add_tag(tags, "episode", episode) + series_title = series_title or info.get("series_title") or info.get("series") or getattr(getattr(movie, "series_info", None), "title", None) + if series_title: + _add_tag(tags, "series", series_title) + summary = info.get("plot outline") or info.get("plot_outline") or info.get("plot") + if isinstance(summary, (list, tuple)): + summary = summary[0] if summary else None + if not summary and hasattr(movie, 
"plot_outline"): + summary = getattr(movie, "plot_outline") + if not summary: + summaries = info.get("summaries") + if isinstance(summaries, (list, tuple)) and summaries: + summary = summaries[0] + if summary: + _add_tag(tags, "summary", summary) + cast_sources = info.get("cast") or info.get("actors") or info.get("cast_members") or info.get("stars") + cast_names: List[str] = [] + if cast_sources: + for name in _extract_from_sequence(cast_sources): + if name: + cast_names.append(name) + if len(cast_names) >= 10: + break + if cast_names: + _extend_tags(tags, "cast", cast_names) + return PipeObject("imdb", canonical_id, tags=tags).to_dict() +def fetch_musicbrainz_tags(mbid: str, entity: str) -> Dict[str, object]: + if not musicbrainzngs: + raise RuntimeError("musicbrainzngs package is not available") + entity = entity.lower() + if entity not in {"release", "recording", "artist"}: + raise ValueError("Unsupported MusicBrainz entity: %s" % entity) + def _fetch_with_fallback(getter, key: str, includes: List[str]): + try: + return getter(mbid, includes=includes)[key] + except MusicBrainzRequestError as exc: + if "Bad includes" in str(exc) and "genres" in includes: + fallback = [inc for inc in includes if inc != "genres"] + return getter(mbid, includes=fallback)[key] + raise + include = ["tags", "genres"] + match entity: + case "release": + include.extend(["artist-credits", "release-groups"]) + data = _fetch_with_fallback(musicbrainzngs.get_release_by_id, "release", include) + case "recording": + include.extend(["artists", "releases"]) + data = _fetch_with_fallback(musicbrainzngs.get_recording_by_id, "recording", include) + case _: + include.extend(["release-groups", "aliases"]) + data = _fetch_with_fallback(musicbrainzngs.get_artist_by_id, "artist", include) + tags: List[str] = [] + _add_tag(tags, "musicbrainz", mbid) + _add_tag(tags, "entity", entity) + _add_tag(tags, "title", data.get("title")) + if entity != "artist": + date = data.get("date") or data.get("first-release-date") + if date: + _add_tag(tags, "date", date) + _add_tag(tags, "year", date[:4]) + if data.get("country"): + _add_tag(tags, "country", data["country"]) + if data.get("status"): + _add_tag(tags, "status", data["status"]) + artist_credit = data.get("artist-credit") or data.get("artists") + if artist_credit: + names = [] + for item in artist_credit: + if isinstance(item, dict): + name = item.get("name") or item.get("artist", {}).get("name") + if name: + names.append(name) + _extend_tags(tags, "artist", names) + tag_list = data.get("tag-list") or data.get("tags") or [] + for tag in tag_list: + if isinstance(tag, dict) and tag.get("name"): + _add_tag(tags, "tag", tag["name"]) + genre_list = data.get("genre-list") or data.get("genres") or [] + for genre in genre_list: + if isinstance(genre, dict) and genre.get("name"): + _add_tag(tags, "genre", genre["name"]) + return PipeObject("musicbrainz", mbid, tags=tags, extra={"entity": entity}).to_dict() + + +def fetch_openlibrary_tags(ol_id: str) -> Dict[str, object]: + """Fetch metadata tags from OpenLibrary. 
+ + Args: + ol_id: OpenLibrary ID (e.g., 'OL123456M' for a book) + + Returns: + Dictionary with 'tags' key containing list of extracted tags + """ + import urllib.request + + # Normalize OL ID + ol_id = ol_id.strip().upper() + if not ol_id.startswith('OL'): + ol_id = f'OL{ol_id}' + + # Fetch from OpenLibrary API + url = f"https://openlibrary.org/books/{ol_id}.json" + tags: List[str] = [] + + try: + with urllib.request.urlopen(url, timeout=10) as response: + data = json.loads(response.read().decode('utf-8')) + except Exception as e: + raise ValueError(f"Failed to fetch OpenLibrary data for {ol_id}: {e}") + + # Add OpenLibrary ID tag + _add_tag(tags, "openlibrary", ol_id) + + # Extract title + _add_tag(tags, "title", data.get("title")) + + # Extract subtitle if present + if data.get("subtitle"): + _add_tag(tags, "subtitle", data["subtitle"]) + + # Extract authors + authors = data.get("authors", []) + author_names: List[str] = [] + for author in authors: + if isinstance(author, dict): + name = author.get("name") + else: + name = str(author) + if name: + author_names.append(name) + if author_names: + _extend_tags(tags, "author", author_names) + + # Extract publication details + if data.get("publish_date"): + _add_tag(tags, "publish_date", data["publish_date"]) + # Extract year if present + year_match = re.search(r'\b(\d{4})\b', str(data.get("publish_date", ""))) + if year_match: + _add_tag(tags, "year", year_match.group(1)) + + # Extract publishers + publishers = data.get("publishers", []) + if publishers: + publisher_names = [] + for pub in publishers: + if isinstance(pub, dict): + name = pub.get("name") + else: + name = str(pub) + if name: + publisher_names.append(name) + if publisher_names: + _extend_tags(tags, "publisher", publisher_names) + + # Extract languages + languages = data.get("languages", []) + if languages: + lang_codes = [] + for lang in languages: + if isinstance(lang, dict): + code = lang.get("key", "").split("/")[-1] + else: + code = str(lang).split("/")[-1] + if code and code != "": + lang_codes.append(code) + if lang_codes: + _extend_tags(tags, "language", lang_codes) + + # Extract ISBN + isbns = data.get("isbn_10", []) + data.get("isbn_13", []) + if isbns: + for isbn in isbns[:1]: # Just take first one + if len(str(isbn)) == 10: + _add_tag(tags, "isbn_10", isbn) + elif len(str(isbn)) == 13: + _add_tag(tags, "isbn_13", isbn) + + # Extract page count + _add_tag(tags, "pages", data.get("number_of_pages")) + + # Extract genres/subjects (OpenLibrary calls them subjects) + # Subjects are added as plain freeform tags (no namespace prefix) + subjects = data.get("subjects", []) + if subjects: + for subject in subjects[:10]: # Limit to 10 subjects + if isinstance(subject, dict): + name = subject.get("name") + else: + name = str(subject) + if name: + # Add subject as plain tag without "subject:" prefix + normalized = value_normalize(str(name)) + if normalized: + tags.append(normalized) + + # Extract OpenLibrary description + description = data.get("description") + if description: + if isinstance(description, dict): + description = description.get("value") + _add_tag(tags, "summary", description) + + return PipeObject("openlibrary", ol_id, tags=tags).to_dict() + + +def _append_unique(target: List[str], seen: Set[str], value: Optional[str]) -> None: + """Append a single value if not already in seen set (deduplication).""" + if value is None: + return + normalized = value_normalize(str(value)) + if not normalized or normalized in seen: + return + seen.add(normalized) + 
target.append(normalized) + + +def _extend_namespaced(target: List[str], seen: Set[str], namespace: str, values: Iterable[Optional[str]]) -> None: + """Append namespaced values if not already in seen set.""" + for val in values: + if val: + _append_unique(target, seen, f"{namespace}:{val}") + + +def _coerce_duration(metadata: Dict[str, Any]) -> Optional[float]: + for key in ("duration", "duration_seconds", "length", "duration_sec"): + value = metadata.get(key) + if value is None: + continue + if isinstance(value, (int, float)): + if value > 0: + return float(value) + elif isinstance(value, str): + try: + candidate = float(value.strip()) + except ValueError: + continue + if candidate > 0: + return candidate + return None +def _sanitize_url(value: Optional[str]) -> Optional[str]: + """Sanitize URL: normalize and remove ytdl:// prefix.""" + if value is None: + return None + cleaned = value_normalize(str(value)) + if not cleaned: + return None + if cleaned.lower().startswith("ytdl://"): + cleaned = cleaned[7:] + return cleaned + + +def _clean_existing_tags(existing: Any) -> List[str]: + tags: List[str] = [] + seen: Set[str] = set() + if isinstance(existing, (list, tuple, set)): + iterable = existing + elif existing is None: + iterable = [] + else: + iterable = [existing] + for tag in iterable: + _append_unique(tags, seen, tag) + return tags +def _should_fetch_url(url: Optional[str]) -> bool: + if not url or not isinstance(url, str): + return False + return url.lower().startswith(('http://', 'https://')) +def fetch_remote_metadata(url: str, options: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[str]]: + warnings: List[str] = [] + info: Optional[Dict[str, Any]] = None + if yt_dlp is not None: + try: # pragma: no cover - depends on runtime availability + ydl_opts = { + 'quiet': True, + 'no_warnings': True, + 'skip_download': True, + 'noplaylist': True, + } + with yt_dlp.YoutubeDL(ydl_opts) as ydl: # type: ignore[attr-defined] + info_dict = ydl.extract_info(url, download=False) + if info_dict is not None: + info = dict(info_dict) + except Exception as exc: # pragma: no cover - best effort + warnings.append(f"yt_dlp extract failed: {exc}") + if info is None: + executable = str(options.get('ytdlp_path') or 'yt-dlp') + extra_args = options.get('ytdlp_args') or [] + if isinstance(extra_args, (str, bytes)): + extra_args = [extra_args] + cmd = [executable, '--dump-single-json', '--no-playlist', '--skip-download', '--no-warnings'] + cmd.extend(str(arg) for arg in extra_args) + cmd.append(url) + timeout = float(options.get('timeout') or 45.0) + try: + completed = subprocess.run(cmd, capture_output=True, text=True, check=False, timeout=timeout) + except Exception as exc: # pragma: no cover - subprocess failure + warnings.append(f"yt-dlp invocation failed: {exc}") + return None, warnings + if completed.returncode != 0: + message = completed.stderr.strip() or completed.stdout.strip() or f"status {completed.returncode}" + warnings.append(message) + return None, warnings + try: + info = json.loads(completed.stdout) + except json.JSONDecodeError as exc: # pragma: no cover - parse failure + warnings.append(f"invalid JSON from yt-dlp: {exc}") + return None, warnings + if isinstance(info, dict) and 'entries' in info: + entries = info.get('entries') + if isinstance(entries, list) and entries: + info = entries[0] + if isinstance(info, dict): + info.setdefault('source_url', url) + return info if isinstance(info, dict) else None, warnings +def resolve_remote_metadata(payload: Dict[str, Any]) -> Dict[str, 
Any]: + options_raw = payload.get('options') + options: Dict[str, Any] = options_raw if isinstance(options_raw, dict) else {} + source_url = payload.get('source_url') + sanitized = _sanitize_url(source_url) or source_url + existing_tags = _clean_existing_tags(payload.get('existing_tags')) + metadata_sources: List[Dict[str, Any]] = [] + for key in ('metadata', 'mpv_metadata', 'remote_metadata', 'info'): + candidate = payload.get(key) + if isinstance(candidate, dict): + metadata_sources.append(candidate) + remote_info: Optional[Dict[str, Any]] = None + warnings: List[str] = [] + if not options.get('no_fetch'): + fetch_url = sanitized + if _should_fetch_url(fetch_url): + remote_info, fetch_warnings = fetch_remote_metadata(fetch_url or '', options) + warnings.extend(fetch_warnings) + if remote_info: + metadata_sources.append(remote_info) + combined_metadata = {} + for source in metadata_sources: + if isinstance(source, dict): + combined_metadata.update(source) + context = {'source_url': sanitized} + bundle = build_remote_bundle(combined_metadata, existing_tags, context) + merged_metadata = {**combined_metadata, **(bundle.get('metadata') or {})} + bundle['metadata'] = merged_metadata + if not bundle.get('source_url'): + bundle['source_url'] = sanitized + mpv_meta_candidate = payload.get('mpv_metadata') + mpv_metadata = mpv_meta_candidate if isinstance(mpv_meta_candidate, dict) else None + result_tags = bundle.get('tags') or existing_tags + result = PipeObject( + source='remote-metadata', + identifier=sanitized or 'unknown', + tags=result_tags, + title=bundle.get('title'), + source_url=bundle.get('source_url') or sanitized, + duration=bundle.get('duration'), + metadata=merged_metadata, + remote_metadata=remote_info, + warnings=warnings, + mpv_metadata=mpv_metadata, + ) + return result.to_serializable() + + +def _ensure_hydrus_client() -> None: + if HydrusClient is None or HydrusRequestSpec is None: # pragma: no cover - depends on optional module + raise RuntimeError("Hydrus helpers are unavailable") + + +def _normalize_hash(value: Any) -> str: + candidate = str(value or '').strip().lower() + if not candidate: + raise ValueError("Hydrus hash is required") + if len(candidate) != 64 or any(ch not in '0123456789abcdef' for ch in candidate): + raise ValueError("Hydrus hash must be a 64-character hex string") + return candidate + + +def _normalize_tag(tag: Any) -> Optional[str]: + if tag is None: + return None + if isinstance(tag, str): + candidate = tag.strip() + else: + candidate = str(tag).strip() + return candidate or None + + +def _extract_tag_services(entry: Dict[str, Any]) -> List[Dict[str, Any]]: + tags_section = entry.get('tags') + services: List[Dict[str, Any]] = [] + if not isinstance(tags_section, dict): + return services + names_map = tags_section.get('service_keys_to_names') + if not isinstance(names_map, dict): + names_map = {} + + def get_record(service_key: Optional[str], service_name: Optional[str]) -> Dict[str, Any]: + key_lower = service_key.lower() if isinstance(service_key, str) else None + name_lower = service_name.lower() if isinstance(service_name, str) else None + for record in services: + existing_key = record.get('service_key') + if key_lower and isinstance(existing_key, str) and existing_key.lower() == key_lower: + if service_name and not record.get('service_name'): + record['service_name'] = service_name + return record + existing_name = record.get('service_name') + if name_lower and isinstance(existing_name, str) and existing_name.lower() == name_lower: + if service_key 
and not record.get('service_key'): + record['service_key'] = service_key + return record + record = { + 'service_key': service_key, + 'service_name': service_name, + 'tags': [], + } + services.append(record) + return record + + def _iter_current_status_lists(container: Any) -> Iterable[List[Any]]: + if isinstance(container, dict): + for status_key, tags_list in container.items(): + if str(status_key) != '0': + continue + if isinstance(tags_list, list): + yield tags_list + elif isinstance(container, list): + yield container + + statuses_map = tags_section.get('service_keys_to_statuses_to_tags') + if isinstance(statuses_map, dict): + for service_key, status_map in statuses_map.items(): + record = get_record(service_key if isinstance(service_key, str) else None, names_map.get(service_key)) + for tags_list in _iter_current_status_lists(status_map): + for tag in tags_list: + normalized = _normalize_tag(tag) + if normalized: + record['tags'].append(normalized) + + ignored_keys = { + 'service_keys_to_statuses_to_tags', + 'service_keys_to_statuses_to_display_tags', + 'service_keys_to_display_friendly_tags', + 'service_keys_to_names', + 'tag_display_types_to_namespaces', + 'namespace_display_string_lookup', + 'tag_display_decoration_colour_lookup', + } + + for key, service in tags_section.items(): + if key in ignored_keys: + continue + if isinstance(service, dict): + service_key = service.get('service_key') or (key if isinstance(key, str) else None) + service_name = service.get('service_name') or service.get('name') or names_map.get(service_key) + record = get_record(service_key if isinstance(service_key, str) else None, service_name) + storage = service.get('storage_tags') or service.get('statuses_to_tags') or service.get('tags') + if isinstance(storage, dict): + for tags_list in _iter_current_status_lists(storage): + for tag in tags_list: + normalized = _normalize_tag(tag) + if normalized: + record['tags'].append(normalized) + elif isinstance(storage, list): + for tag in storage: + normalized = _normalize_tag(tag) + if normalized: + record['tags'].append(normalized) + + # Use canonical dedup function + for record in services: + record['tags'] = dedup_tags_by_namespace(record['tags'], keep_first=True) + return services + + +def _select_primary_tags(services: List[Dict[str, Any]], aggregated: List[str], prefer_service: Optional[str]) -> Tuple[Optional[str], List[str]]: + prefer_lower = prefer_service.lower() if isinstance(prefer_service, str) else None + if prefer_lower: + for record in services: + name = record.get('service_name') + if isinstance(name, str) and name.lower() == prefer_lower and record['tags']: + return record.get('service_key'), record['tags'] + for record in services: + if record['tags']: + return record.get('service_key'), record['tags'] + return None, aggregated + + +def _derive_title(tags_primary: List[str], tags_aggregated: List[str], entry: Dict[str, Any]) -> Optional[str]: + for source in (tags_primary, tags_aggregated): + for tag in source: + namespace, sep, value = tag.partition(':') + if sep and namespace and namespace.lower() == 'title': + cleaned = value.strip() + if cleaned: + return cleaned + for key in ('title', 'display_name', 'pretty_name', 'original_display_filename', 'original_filename'): + value = entry.get(key) + if isinstance(value, str): + cleaned = value.strip() + if cleaned: + return cleaned + return None + + +def _derive_clip_time(tags_primary: List[str], tags_aggregated: List[str], entry: Dict[str, Any]) -> Optional[str]: + namespaces = {'clip', 
'clip_time', 'cliptime'} + for source in (tags_primary, tags_aggregated): + for tag in source: + namespace, sep, value = tag.partition(':') + if sep and namespace and namespace.lower() in namespaces: + cleaned = value.strip() + if cleaned: + return cleaned + clip_value = entry.get('clip_time') + if isinstance(clip_value, str): + cleaned_clip = clip_value.strip() + if cleaned_clip: + return cleaned_clip + return None + + +def _summarize_hydrus_entry(entry: Dict[str, Any], prefer_service: Optional[str]) -> Tuple[Dict[str, Any], List[str], Optional[str], Optional[str], Optional[str]]: + services = _extract_tag_services(entry) + aggregated: List[str] = [] + seen: Set[str] = set() + for record in services: + for tag in record['tags']: + if tag not in seen: + seen.add(tag) + aggregated.append(tag) + service_key, primary_tags = _select_primary_tags(services, aggregated, prefer_service) + title = _derive_title(primary_tags, aggregated, entry) + clip_time = _derive_clip_time(primary_tags, aggregated, entry) + summary = dict(entry) + if title and not summary.get('title'): + summary['title'] = title + if clip_time and not summary.get('clip_time'): + summary['clip_time'] = clip_time + summary['tag_service_key'] = service_key + summary['has_current_file_service'] = _has_current_file_service(entry) + if 'is_local' not in summary: + summary['is_local'] = bool(entry.get('is_local')) + return summary, primary_tags, service_key, title, clip_time + + +def _looks_like_hash(value: Any) -> bool: + if not isinstance(value, str): + return False + candidate = value.strip().lower() + return len(candidate) == 64 and all(ch in '0123456789abcdef' for ch in candidate) + + +def _collect_relationship_hashes(payload: Any, accumulator: Set[str]) -> None: + if isinstance(payload, dict): + for value in payload.values(): + _collect_relationship_hashes(value, accumulator) + elif isinstance(payload, (list, tuple, set)): + for value in payload: + _collect_relationship_hashes(value, accumulator) + elif isinstance(payload, str) and _looks_like_hash(payload): + accumulator.add(payload) + + +def _build_hydrus_query( + hashes: Optional[Sequence[str]], + file_ids: Optional[Sequence[int]], + include_relationships: bool, + minimal: bool, +) -> Dict[str, str]: + query: Dict[str, str] = {} + if hashes: + query['hashes'] = json.dumps(list(hashes)) + if file_ids: + query['file_ids'] = json.dumps([int(value) for value in file_ids]) + if not query: + raise ValueError('hashes or file_ids must be provided') + query['include_service_keys_to_tags'] = json.dumps(True) + query['include_tag_services'] = json.dumps(True) + query['include_file_services'] = json.dumps(True) + if include_relationships: + query['include_file_relationships'] = json.dumps(True) + if not minimal: + extras = ( + 'include_known_urls', + 'include_size', + 'include_width', + 'include_height', + 'include_duration', + 'include_mime', + 'include_has_audio', + 'include_is_trashed', + ) + for key in extras: + query[key] = json.dumps(True) + return query + + +def _fetch_hydrus_entries( + client: Any, + hashes: Optional[Sequence[str]], + file_ids: Optional[Sequence[int]], + include_relationships: bool, + minimal: bool, +) -> List[Dict[str, Any]]: + if not hashes and not file_ids: + return [] + assert HydrusRequestSpec is not None + spec = HydrusRequestSpec( + method='GET', + endpoint='/get_files/file_metadata', + query=_build_hydrus_query(hashes, file_ids, include_relationships, minimal), + ) + response = client._perform_request(spec) # type: ignore[attr-defined] + metadata = 
response.get('metadata') if isinstance(response, dict) else None + if isinstance(metadata, list): + return [entry for entry in metadata if isinstance(entry, dict)] + return [] + + +def _has_current_file_service(entry: Dict[str, Any]) -> bool: + services = entry.get('file_services') + if not isinstance(services, dict): + return False + current = services.get('current') + if isinstance(current, dict): + for value in current.values(): + if value: + return True + return False + if isinstance(current, list): + return len(current) > 0 + return False + + +def _compute_file_flags(entry: Dict[str, Any]) -> Tuple[bool, bool, bool]: + mime = entry.get('mime') + mime_lower = mime.lower() if isinstance(mime, str) else '' + is_video = mime_lower.startswith('video/') + is_audio = mime_lower.startswith('audio/') + is_deleted = False + if entry.get('is_trashed'): + is_deleted = True + file_services = entry.get('file_services') + if not is_deleted and isinstance(file_services, dict): + deleted = file_services.get('deleted') + if isinstance(deleted, dict) and deleted: + is_deleted = True + return is_video, is_audio, is_deleted + + +def fetch_hydrus_metadata(payload: Dict[str, Any]) -> Dict[str, Any]: + _ensure_hydrus_client() + assert HydrusClient is not None + hash_hex = None + raw_hash_value = payload.get('hash') + if raw_hash_value is not None: + hash_hex = _normalize_hash(raw_hash_value) + file_ids: List[int] = [] + raw_file_ids = payload.get('file_ids') + if isinstance(raw_file_ids, (list, tuple, set)): + for value in raw_file_ids: + try: + file_ids.append(int(value)) + except (TypeError, ValueError): + continue + elif raw_file_ids is not None: + try: + file_ids.append(int(raw_file_ids)) + except (TypeError, ValueError): + file_ids = [] + raw_file_id = payload.get('file_id') + if raw_file_id is not None: + try: + coerced = int(raw_file_id) + except (TypeError, ValueError): + coerced = None + if coerced is not None and coerced not in file_ids: + file_ids.append(coerced) + base_url = str(payload.get('api_url') or '').strip() + if not base_url: + raise ValueError('Hydrus api_url is required') + access_key = str(payload.get('access_key') or '').strip() + options_raw = payload.get('options') + options = options_raw if isinstance(options_raw, dict) else {} + prefer_service = options.get('prefer_service_name') + if isinstance(prefer_service, str): + prefer_service = prefer_service.strip() + else: + prefer_service = None + include_relationships = bool(options.get('include_relationships')) + minimal = bool(options.get('minimal')) + timeout = float(options.get('timeout') or 60.0) + client = HydrusClient(base_url, access_key, timeout) + hashes: Optional[List[str]] = None + if hash_hex: + hashes = [hash_hex] + if not hashes and not file_ids: + raise ValueError('Hydrus hash or file id is required') + try: + entries = _fetch_hydrus_entries(client, hashes, file_ids or None, include_relationships, minimal) + except HydrusRequestError as exc: # type: ignore[misc] + raise RuntimeError(str(exc)) + if not entries: + response: Dict[str, Any] = { + 'hash': hash_hex, + 'metadata': {}, + 'tags': [], + 'warnings': [f'No Hydrus metadata for {hash_hex or file_ids}'], + 'error': 'not_found', + } + if file_ids: + response['file_id'] = file_ids[0] + return response + entry = entries[0] + if not hash_hex: + entry_hash = entry.get('hash') + if isinstance(entry_hash, str) and entry_hash: + hash_hex = entry_hash + hashes = [hash_hex] + summary, primary_tags, service_key, title, clip_time = _summarize_hydrus_entry(entry, prefer_service) 
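+    # At this point `summary` is the Hydrus entry enriched with derived title /
+    # clip_time fields and service info, and `primary_tags` holds the current
+    # tags of the preferred (or first non-empty) tag service, falling back to
+    # the aggregated tag list.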
+ is_video, is_audio, is_deleted = _compute_file_flags(entry) + has_current_file_service = _has_current_file_service(entry) + is_local = bool(entry.get('is_local')) + size_bytes = entry.get('size') or entry.get('file_size') + filesize_mb = None + if isinstance(size_bytes, (int, float)) and size_bytes > 0: + filesize_mb = float(size_bytes) / (1024.0 * 1024.0) + duration = entry.get('duration') + if duration is None and isinstance(entry.get('duration_ms'), (int, float)): + duration = float(entry['duration_ms']) / 1000.0 + warnings: List[str] = [] + if not primary_tags: + warnings.append('No tags returned for preferred service') + relationships = None + relationship_metadata: Dict[str, Dict[str, Any]] = {} + if include_relationships and hash_hex: + try: + assert HydrusRequestSpec is not None + rel_spec = HydrusRequestSpec( + method='GET', + endpoint='/manage_file_relationships/get_file_relationships', + query={'hash': hash_hex}, + ) + relationships = client._perform_request(rel_spec) # type: ignore[attr-defined] + except HydrusRequestError as exc: # type: ignore[misc] + warnings.append(f'Relationship lookup failed: {exc}') + relationships = None + if isinstance(relationships, dict): + related_hashes: Set[str] = set() + _collect_relationship_hashes(relationships, related_hashes) + related_hashes.discard(hash_hex) + if related_hashes: + try: + related_entries = _fetch_hydrus_entries(client, sorted(related_hashes), None, False, True) + except HydrusRequestError as exc: # type: ignore[misc] + warnings.append(f'Relationship metadata fetch failed: {exc}') + else: + for rel_entry in related_entries: + rel_hash = rel_entry.get('hash') + if not isinstance(rel_hash, str): + continue + rel_summary, rel_tags, _, rel_title, rel_clip = _summarize_hydrus_entry(rel_entry, prefer_service) + rel_summary['tags'] = rel_tags + if rel_title: + rel_summary['title'] = rel_title + if rel_clip: + rel_summary['clip_time'] = rel_clip + relationship_metadata[rel_hash] = rel_summary + result: Dict[str, Any] = { + 'hash': entry.get('hash') or hash_hex, + 'metadata': summary, + 'tags': primary_tags, + 'tag_service_key': service_key, + 'title': title, + 'clip_time': clip_time, + 'duration': duration, + 'filesize_mb': filesize_mb, + 'is_video': is_video, + 'is_audio': is_audio, + 'is_deleted': is_deleted, + 'is_local': is_local, + 'has_current_file_service': has_current_file_service, + 'matched_hash': entry.get('hash') or hash_hex, + 'swap_recommended': False, + } + file_id_value = entry.get('file_id') + if isinstance(file_id_value, (int, float)): + result['file_id'] = int(file_id_value) + if relationships is not None: + result['relationships'] = relationships + if relationship_metadata: + result['relationship_metadata'] = relationship_metadata + if warnings: + result['warnings'] = warnings + return result + + +def fetch_hydrus_metadata_by_url(payload: Dict[str, Any]) -> Dict[str, Any]: + _ensure_hydrus_client() + assert HydrusClient is not None + raw_url = payload.get('url') or payload.get('source_url') + url = str(raw_url or '').strip() + if not url: + raise ValueError('URL is required to fetch Hydrus metadata by URL') + base_url = str(payload.get('api_url') or '').strip() + if not base_url: + raise ValueError('Hydrus api_url is required') + access_key = str(payload.get('access_key') or '').strip() + options_raw = payload.get('options') + options = options_raw if isinstance(options_raw, dict) else {} + timeout = float(options.get('timeout') or 60.0) + client = HydrusClient(base_url, access_key, timeout) + hashes: 
Optional[List[str]] = None + file_ids: Optional[List[int]] = None + matched_url = None + normalised_reported = None + seen: Set[str] = set() + queue = deque() + for variant in _generate_hydrus_url_variants(url): + queue.append(variant) + if not queue: + queue.append(url) + tried_variants: List[str] = [] + while queue: + candidate = queue.popleft() + candidate = str(candidate or '').strip() + if not candidate or candidate in seen: + continue + seen.add(candidate) + tried_variants.append(candidate) + assert HydrusRequestSpec is not None + spec = HydrusRequestSpec( + method='GET', + endpoint='/add_urls/get_url_files', + query={'url': candidate}, + ) + try: + response = client._perform_request(spec) # type: ignore[attr-defined] + except HydrusRequestError as exc: # type: ignore[misc] + raise RuntimeError(str(exc)) + response_hashes_list: List[str] = [] + response_file_ids_list: List[int] = [] + if isinstance(response, dict): + normalised_value = response.get('normalised_url') + if isinstance(normalised_value, str): + trimmed = normalised_value.strip() + if trimmed: + normalised_reported = normalised_reported or trimmed + if trimmed not in seen: + queue.append(trimmed) + for redirect_key in ('redirect_url', 'url'): + redirect_value = response.get(redirect_key) + if isinstance(redirect_value, str): + redirect_trimmed = redirect_value.strip() + if redirect_trimmed and redirect_trimmed not in seen: + queue.append(redirect_trimmed) + raw_hashes = response.get('hashes') or response.get('file_hashes') + if isinstance(raw_hashes, list): + for item in raw_hashes: + try: + normalized = _normalize_hash(item) + except ValueError: + continue + if normalized: + response_hashes_list.append(normalized) + raw_ids = response.get('file_ids') or response.get('file_id') + if isinstance(raw_ids, list): + for item in raw_ids: + try: + response_file_ids_list.append(int(item)) + except (TypeError, ValueError): + continue + elif raw_ids is not None: + try: + response_file_ids_list.append(int(raw_ids)) + except (TypeError, ValueError): + pass + statuses = response.get('url_file_statuses') + if isinstance(statuses, list): + for entry in statuses: + if not isinstance(entry, dict): + continue + status_hash = entry.get('hash') or entry.get('file_hash') + if status_hash: + try: + normalized = _normalize_hash(status_hash) + except ValueError: + normalized = None + if normalized: + response_hashes_list.append(normalized) + status_id = entry.get('file_id') or entry.get('fileid') + if status_id is not None: + try: + response_file_ids_list.append(int(status_id)) + except (TypeError, ValueError): + continue + if response_hashes_list: + hashes = response_hashes_list + if response_file_ids_list: + file_ids = response_file_ids_list + if hashes or file_ids: + matched_url = candidate + break + if not hashes and not file_ids: + result = { + 'found': False, + 'url': url, + 'variants': tried_variants, + 'metadata': {}, + 'tags': [], + 'warnings': [f'No Hydrus file found for {url}'], + 'error': 'not_found', + } + if normalised_reported: + result['normalised_url'] = normalised_reported + return result + hash_value = str(hashes[0]) if hashes else None + followup_payload: Dict[str, Any] = { + 'api_url': base_url, + 'access_key': access_key, + 'options': options, + } + if hash_value: + followup_payload['hash'] = hash_value + if file_ids: + followup_payload['file_id'] = file_ids[0] + result = fetch_hydrus_metadata(followup_payload) + result['found'] = True + result['url'] = url + if matched_url and matched_url != url: + result['matched_url'] = 
matched_url + if file_ids: + result['file_id'] = file_ids[0] + if normalised_reported: + result['normalised_url'] = normalised_reported + result['variants'] = tried_variants + return result + + +def _normalise_string_list(values: Optional[Iterable[Any]]) -> List[str]: + if not values: + return [] + seen: Set[str] = set() + items: List[str] = [] + for value in values: + if value is None: + continue + text = str(value).strip() + if not text: + continue + if text in seen: + continue + seen.add(text) + items.append(text) + return items + + +def _derive_sidecar_path(media_path: Path) -> Path: + try: + return media_path.parent / (media_path.name + '.tags') + except ValueError: + return media_path.with_name(media_path.name + '.tags') + + +def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str], List[str]]: + """Read hash, tags, and known_urls from .tags sidecar file. + + Consolidated with read_tags_from_file - this extracts extra metadata (hash, urls). + """ + if not sidecar_path.exists(): + return None, [], [] + try: + raw = sidecar_path.read_text(encoding='utf-8') + except OSError: + return None, [], [] + + hash_value: Optional[str] = None + tags: List[str] = [] + known_urls: List[str] = [] + + for raw_line in raw.splitlines(): + line = raw_line.strip() + if not line or line.startswith('#'): + continue + + lower = line.lower() + if lower.startswith('hash:'): + hash_value = line.split(':', 1)[1].strip() if ':' in line else '' + elif lower.startswith('known_url:') or lower.startswith('url:'): + # Parse URLs (handle legacy 'url:' format) + urls_part = line.split(':', 1)[1].strip() if ':' in line else '' + if urls_part: + for url_segment in urls_part.split(','): + for url in url_segment.split(): + url_clean = url.strip() + if url_clean and url_clean not in known_urls: + known_urls.append(url_clean) + else: + # Everything else is a tag (including relationship: lines) + tags.append(line) + + return hash_value, tags, known_urls + + + +def rename_by_metadata(file_path: Path, tags: Iterable[str]) -> Optional[Path]: + """Rename a file based on title: tag in the tags list. + + If a title: tag is present, renames the file and any .tags/.metadata sidecars. 
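+    The new name is the title plus the original extension; an existing file at
+    the target name is replaced.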
+
+    Args:
+        file_path: Path to the file to potentially rename
+        tags: Iterable of tag strings (should contain title: tag if rename needed)
+
+    Returns:
+        New path if renamed, None if not renamed or error occurred
+    """
+    # Extract title from tags
+    new_title = None
+    for tag in tags:
+        if isinstance(tag, str) and tag.lower().startswith('title:'):
+            new_title = tag.split(':', 1)[1].strip()
+            break
+
+    if not new_title or not file_path.exists():
+        return None
+
+    try:
+        old_name = file_path.name
+        old_suffix = file_path.suffix
+
+        # Create new filename: title + extension
+        new_name = f"{new_title}{old_suffix}"
+        new_path = file_path.parent / new_name
+
+        # Don't rename if already the same name
+        if new_path == file_path:
+            return None
+
+        # If target exists, delete it first (replace mode)
+        if new_path.exists():
+            try:
+                new_path.unlink()
+                log(f"[rename_by_metadata] Replaced existing file: {new_name}", file=sys.stderr)
+            except Exception as e:
+                log(f"[rename_by_metadata] Warning: Could not replace target file {new_name}: {e}", file=sys.stderr)
+                return None
+
+        file_path.rename(new_path)
+        log(f"[rename_by_metadata] Renamed file: {old_name} → {new_name}", file=sys.stderr)
+
+        # Rename the .tags sidecar if it exists (replace any existing target first)
+        old_tags_path = file_path.parent / (old_name + '.tags')
+        if old_tags_path.exists():
+            new_tags_path = file_path.parent / (new_name + '.tags')
+            if new_tags_path.exists():
+                try:
+                    new_tags_path.unlink()
+                except Exception:
+                    pass
+            if not new_tags_path.exists():
+                old_tags_path.rename(new_tags_path)
+                log(f"[rename_by_metadata] Renamed sidecar: {old_tags_path.name} → {new_tags_path.name}", file=sys.stderr)
+
+        # Rename the .metadata sidecar if it exists
+        old_metadata_path = file_path.parent / (old_name + '.metadata')
+        if old_metadata_path.exists():
+            new_metadata_path = file_path.parent / (new_name + '.metadata')
+            if new_metadata_path.exists():
+                log(f"[rename_by_metadata] Warning: Target metadata already exists: {new_metadata_path.name}", file=sys.stderr)
+            else:
+                old_metadata_path.rename(new_metadata_path)
+                log(f"[rename_by_metadata] Renamed metadata: {old_metadata_path.name} → {new_metadata_path.name}", file=sys.stderr)
+
+        return new_path
+    except Exception as exc:
+        log(f"[rename_by_metadata] Warning: Failed to rename file: {exc}", file=sys.stderr)
+        return None
+
+
+def write_tags(media_path: Path, tags: Iterable[str], known_urls: Iterable[str], hash_value: Optional[str] = None, db=None) -> None:
+    """Write tags and metadata to database or sidecar file.
+
+    If db is provided, inserts into LocalLibraryDB and skips sidecar file creation.
+    Otherwise, creates .tags sidecar file with name: media.ext.tags (e.g., song.mp3.tags)
+
+    Args:
+        media_path: Path to the media file
+        tags: Iterable of tag strings
+        known_urls: Iterable of known URL strings
+        hash_value: Optional hash value for the file
+        db: Optional LocalLibraryDB instance. If provided, skips sidecar creation.
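+
+    Example (illustrative; filenames and tags are hypothetical):
+        >>> write_tags(Path('song.mp3'), ['artist:Beatles', 'title:Help!'],
+        ...            known_urls=['https://example.com/help'])
+        # with db=None this writes a 'song.mp3.tags' sidecar next to the file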
+ """ + if media_path.exists() and media_path.is_dir(): + raise ValueError(f"write_tags_sidecar: media_path is a directory: {media_path}") + + # Prepare tags lines and convert to list if needed + tag_list = list(tags) if not isinstance(tags, list) else tags + url_list = list(known_urls) if not isinstance(known_urls, list) else known_urls + + # If database provided, insert directly and skip sidecar + if db is not None: + try: + # Build tag list with hash and known_urls + db_tags = [] + if hash_value: + db_tags.append(f"hash:{hash_value}") + db_tags.extend(str(tag).strip() for tag in tag_list if str(tag).strip()) + db_tags.extend(f"known_url:{str(url).strip()}" for url in url_list if str(url).strip()) + + if db_tags: + db.add_tags(media_path, db_tags) + log(f"Added tags to database for {media_path.name}") + return + except Exception as e: + log(f"Failed to add tags to database: {e}", file=sys.stderr) + # Fall through to sidecar creation as fallback + + # Create sidecar path + try: + sidecar = media_path.parent / (media_path.name + '.tags') + except Exception: + sidecar = media_path.with_name(media_path.name + '.tags') + + # Handle edge case: empty/invalid base name + try: + if not sidecar.stem or sidecar.name in {'.tags', '-.tags', '_.tags'}: + fallback_base = media_path.stem or _sanitize_title_for_filename(extract_title(tag_list) or '') or 'untitled' + sidecar = media_path.parent / f"{fallback_base}.tags" + except Exception: + pass + + # Write via consolidated function + try: + lines = [] + if hash_value: + lines.append(f"hash:{hash_value}") + lines.extend(str(tag).strip() for tag in tag_list if str(tag).strip()) + lines.extend(f"known_url:{str(url).strip()}" for url in url_list if str(url).strip()) + + if lines: + sidecar.write_text("\n".join(lines) + "\n", encoding="utf-8") + log(f"Wrote tags to {sidecar}") + # Clean up legacy files + for legacy_path in [media_path.with_name(media_path.name + '.tags'), + media_path.with_name(media_path.name + '.tags.txt')]: + if legacy_path.exists() and legacy_path != sidecar: + try: + legacy_path.unlink() + except OSError: + pass + else: + try: + sidecar.unlink() + except FileNotFoundError: + pass + except OSError as exc: + log(f"Failed to write tag sidecar {sidecar}: {exc}", file=sys.stderr) + + +def write_metadata(media_path: Path, hash_value: Optional[str] = None, known_urls: Optional[Iterable[str]] = None, relationships: Optional[Iterable[str]] = None, db=None) -> None: + """Write metadata to database or sidecar file. + + If db is provided, inserts into LocalLibraryDB and skips sidecar file creation. + Otherwise, creates .metadata sidecar file with hash, URLs, and relationships. + + Args: + media_path: Path to the media file + hash_value: Optional hash value for the file + known_urls: Optional iterable of known URL strings + relationships: Optional iterable of relationship strings + db: Optional LocalLibraryDB instance. If provided, skips sidecar creation. 
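+
+    Example (illustrative; values are hypothetical):
+        >>> write_metadata(Path('song.mp3'), hash_value='abc123',
+        ...                known_urls=['https://example.com/help'])
+        # with db=None this writes a 'song.mp3.metadata' sidecar containing
+        # 'hash:abc123' and 'known_url:https://example.com/help' lines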
+ """ + if media_path.exists() and media_path.is_dir(): + raise ValueError(f"write_metadata_sidecar: media_path is a directory: {media_path}") + + # Prepare metadata lines + url_list = list(known_urls) if known_urls else [] + rel_list = list(relationships) if relationships else [] + + # If database provided, insert directly and skip sidecar + if db is not None: + try: + # Build metadata tag list + db_tags = [] + if hash_value: + db_tags.append(f"hash:{hash_value}") + for url in url_list: + if str(url).strip(): + db_tags.append(f"known_url:{str(url).strip()}") + for rel in rel_list: + if str(rel).strip(): + db_tags.append(f"relationship:{str(rel).strip()}") + + if db_tags: + db.add_tags(media_path, db_tags) + log(f"Added metadata to database for {media_path.name}") + return + except Exception as e: + log(f"Failed to add metadata to database: {e}", file=sys.stderr) + # Fall through to sidecar creation as fallback + + # Create sidecar path + try: + sidecar = media_path.parent / (media_path.name + '.metadata') + except Exception: + sidecar = media_path.with_name(media_path.name + '.metadata') + + try: + lines = [] + + # Add hash if available + if hash_value: + lines.append(f"hash:{hash_value}") + + # Add known URLs + for url in url_list: + if str(url).strip(): + lines.append(f"known_url:{str(url).strip()}") + + # Add relationships + for rel in rel_list: + if str(rel).strip(): + lines.append(f"relationship:{str(rel).strip()}") + + # Write metadata file + if lines: + sidecar.write_text("\n".join(lines) + "\n", encoding="utf-8") + log(f"Wrote metadata to {sidecar}") + else: + # Remove if no content + try: + sidecar.unlink() + except FileNotFoundError: + pass + except OSError as exc: + log(f"Failed to write metadata sidecar {sidecar}: {exc}", file=sys.stderr) + + +def extract_title(tags: Iterable[str]) -> Optional[str]: + """ + Extracts a title from a list of tags (looks for 'title:...'). + """ + for tag in tags: + + tag = tag.strip() + + if tag.lower().startswith("title:"): + title_tag = tag.split(":", 1)[1].strip() + if title_tag: + return title_tag + return None + +def _sanitize_title_for_filename(title: str) -> str: + # Allow alnum, hyphen, underscore, and space; replace other chars with space + temp = [] + for ch in title: + if ch.isalnum() or ch in {"-", "_", " "}: + temp.append(ch) + else: + temp.append(" ") + # Collapse whitespace and trim hyphens/underscores around words + rough = "".join(temp) + tokens = [] + for seg in rough.split(): + cleaned = seg.strip("-_ ") + if cleaned: + tokens.append(cleaned) + sanitized = "_".join(tokens) + sanitized = sanitized.strip("-_") + return sanitized or "untitled" + +def apply_title_to_path(media_path: Path, tags: Iterable[str]) -> Path: + """ + If a title tag is present, returns a new Path with the title as filename; else returns original path. 
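+
+    Example (illustrative; POSIX paths assumed):
+        >>> apply_title_to_path(Path('/tmp/clip.mp4'), ['title:My Song!'])
+        PosixPath('/tmp/My_Song.mp4')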
+ """ + title = extract_title(tags) + if not title: + return media_path + parent = media_path.parent + sanitized = _sanitize_title_for_filename(title) + destination = parent / f"{sanitized}{media_path.suffix}" + return destination + + +def _collect_search_roots(payload: Dict[str, Any]) -> List[Path]: + roots: List[Path] = [] + for key in ('paths', 'search_paths', 'roots', 'directories'): + raw = payload.get(key) + if not raw: + continue + entries = raw if isinstance(raw, (list, tuple, set)) else [raw] + for entry in entries: + if not entry: + continue + try: + candidate = Path(str(entry)).expanduser() + except Exception: + continue + roots.append(candidate) + if load_config is not None and resolve_output_dir is not None: + try: + config = load_config() + except Exception: + config = None + if isinstance(config, dict) and config: + try: + default_root = resolve_output_dir(config) + except Exception: + default_root = None + if default_root is not None: + roots.append(default_root) + return roots + + +def _locate_sidecar_by_hash(hash_value: str, roots: Iterable[Path]) -> Optional[Path]: + target = f'hash:{hash_value.strip().lower()}' + for root in roots: + try: + root_path = root.expanduser() + except Exception: + continue + if not root_path.exists() or not root_path.is_dir(): + continue + for pattern in ('*.tags', '*.tags.txt'): + try: + iterator = root_path.rglob(pattern) + except OSError: + continue + for candidate in iterator: + if not candidate.is_file(): + continue + try: + with candidate.open('r', encoding='utf-8', errors='ignore') as handle: + for line in handle: + if line.strip().lower() == target: + return candidate + except OSError: + continue + return None + + +def sync_sidecar(payload: Dict[str, Any]) -> Dict[str, Any]: + path_value = payload.get('path') + sidecar_path: Optional[Path] = None + media_path: Optional[Path] = None + if path_value: + candidate = Path(str(path_value)).expanduser() + if candidate.suffix.lower() in {'.tags', '.tags.txt'}: + sidecar_path = candidate + else: + media_path = candidate + hash_input = payload.get('hash') + hash_value = None + if hash_input: + hash_value = _normalize_hash(hash_input) + tags = _normalise_string_list(payload.get('tags')) + known_urls = _normalise_string_list(payload.get('known_urls')) + if media_path is not None: + sidecar_path = _derive_sidecar_path(media_path) + search_roots = _collect_search_roots(payload) + if sidecar_path is None and hash_value: + located = _locate_sidecar_by_hash(hash_value, search_roots) + if located is not None: + sidecar_path = located + if sidecar_path is None: + if media_path is not None: + sidecar_path = _derive_sidecar_path(media_path) + elif hash_value: + return { + 'error': 'not_found', + 'hash': hash_value, + 'tags': tags, + 'known_urls': known_urls, + } + else: + raise ValueError('path or hash is required to synchronise sidecar') + existing_hash, existing_tags, existing_known = _read_sidecar_metadata(sidecar_path) + if not tags: + tags = existing_tags + if not known_urls: + known_urls = existing_known + hash_line = hash_value or existing_hash + title_value: Optional[str] = None + for tag in tags: + if isinstance(tag, str): + if tag.lower().startswith('title:'): + title_value = tag.split(':', 1)[1].strip() if ':' in tag else '' + if title_value == '': + title_value = None + break + lines: List[str] = [] + if hash_line: + lines.append(f'hash:{hash_line}') + lines.extend(tags) + lines.extend(f'known_url:{url}' for url in known_urls) + sidecar_path.parent.mkdir(parents=True, exist_ok=True) + if 
lines: + sidecar_path.write_text('\n'.join(lines) + '\n', encoding='utf-8') + else: + try: + sidecar_path.unlink() + except FileNotFoundError: + pass + return { + 'path': str(sidecar_path), + 'hash': hash_line, + 'tags': [], + 'known_urls': [], + 'deleted': True, + 'title': title_value, + } + return { + 'path': str(sidecar_path), + 'hash': hash_line, + 'tags': tags, + 'known_urls': known_urls, + 'title': title_value, + } + + +def _build_hydrus_context(payload: Dict[str, Any]) -> Tuple[Any, str, str, float, Optional[str]]: + _ensure_hydrus_client() + assert HydrusClient is not None + base_url = str(payload.get('api_url') or '').strip() + if not base_url: + raise ValueError('Hydrus api_url is required') + access_key = str(payload.get('access_key') or '').strip() + options_raw = payload.get('options') + options = options_raw if isinstance(options_raw, dict) else {} + timeout = float(options.get('timeout') or payload.get('timeout') or 60.0) + prefer_service = payload.get('prefer_service_name') or options.get('prefer_service_name') + if isinstance(prefer_service, str): + prefer_service = prefer_service.strip() or None + else: + prefer_service = None + client = HydrusClient(base_url, access_key, timeout) + return client, base_url, access_key, timeout, prefer_service + + +def _refetch_hydrus_summary(base_url: str, access_key: str, hash_hex: str, timeout: float, prefer_service: Optional[str]) -> Dict[str, Any]: + payload: Dict[str, Any] = { + 'hash': hash_hex, + 'api_url': base_url, + 'access_key': access_key, + 'options': { + 'minimal': True, + 'include_relationships': False, + 'timeout': timeout, + }, + } + if prefer_service: + payload['options']['prefer_service_name'] = prefer_service + return fetch_hydrus_metadata(payload) + + +def _apply_hydrus_tag_mutation(payload: Dict[str, Any], add: Iterable[Any], remove: Iterable[Any]) -> Dict[str, Any]: + client, base_url, access_key, timeout, prefer_service = _build_hydrus_context(payload) + hash_hex = _normalize_hash(payload.get('hash')) + add_list = [_normalize_tag(tag) for tag in add if _normalize_tag(tag)] + remove_list = [_normalize_tag(tag) for tag in remove if _normalize_tag(tag)] + if not add_list and not remove_list: + raise ValueError('No tag changes supplied') + service_key = payload.get('service_key') or payload.get('tag_service_key') + summary = None + if not service_key: + summary = _refetch_hydrus_summary(base_url, access_key, hash_hex, timeout, prefer_service) + service_key = summary.get('tag_service_key') + if not isinstance(service_key, str) or not service_key: + raise RuntimeError('Unable to determine Hydrus tag service key') + actions: Dict[str, List[str]] = {} + if add_list: + actions['0'] = [tag for tag in add_list if tag] + if remove_list: + actions['1'] = [tag for tag in remove_list if tag] + if not actions: + raise ValueError('Tag mutation produced no actionable changes') + request_payload = { + 'hashes': [hash_hex], + 'service_keys_to_actions_to_tags': { + service_key: actions, + }, + } + try: + assert HydrusRequestSpec is not None + tag_spec = HydrusRequestSpec( + method='POST', + endpoint='/add_tags/add_tags', + data=request_payload, + ) + client._perform_request(tag_spec) + except HydrusRequestError as exc: # type: ignore[misc] + raise RuntimeError(str(exc)) + summary_after = _refetch_hydrus_summary(base_url, access_key, hash_hex, timeout, prefer_service) + result = dict(summary_after) + result['added_tags'] = actions.get('0', []) + result['removed_tags'] = actions.get('1', []) + result['tag_service_key'] = 
summary_after.get('tag_service_key') + return result + + +def apply_tag_mutation(payload: Dict[str, Any], operation: str = 'add') -> Dict[str, Any]: + """Unified tag mutation for add and update operations (Hydrus and local). + + Consolidates: add_tag, update_tag, _add_local_tag, _update_local_tag + + Args: + payload: Mutation payload with type, tags, old_tag, new_tag + operation: 'add' or 'update' + + Returns: + Dict with tags and operation result + """ + file_type = str(payload.get('type', 'local')).lower() + + if file_type == 'hydrus': + if operation == 'add': + new_tag = _normalize_tag(payload.get('new_tag')) + if not new_tag: + raise ValueError('new_tag is required') + result = _apply_hydrus_tag_mutation(payload, [new_tag], []) + result['added'] = True + return result + else: # update + old_tag = _normalize_tag(payload.get('old_tag')) + new_tag = _normalize_tag(payload.get('new_tag')) + result = _apply_hydrus_tag_mutation( + payload, + [new_tag] if new_tag else [], + [old_tag] if old_tag else [] + ) + result['updated'] = True + return result + else: # local + tags = _clean_existing_tags(payload.get('tags')) + + if operation == 'add': + new_tag = _normalize_tag(payload.get('new_tag')) + if not new_tag: + raise ValueError('new_tag is required') + added = new_tag not in tags + if added: + tags.append(new_tag) + return {'tags': tags, 'added': added} + + else: # update + old_tag = _normalize_tag(payload.get('old_tag')) + new_tag = _normalize_tag(payload.get('new_tag')) + if not old_tag: + raise ValueError('old_tag is required') + + remaining = [] + removed_count = 0 + for tag in tags: + if tag == old_tag: + removed_count += 1 + else: + remaining.append(tag) + + if new_tag and removed_count > 0: + remaining.extend([new_tag] * removed_count) + + updated = removed_count > 0 or (bool(new_tag) and new_tag not in tags) + return {'tags': remaining, 'updated': updated, 'removed_count': removed_count} + + +def extract_ytdlp_tags(entry: Dict[str, Any]) -> List[str]: + """Extract meaningful metadata tags from yt-dlp entry. + + This is the UNIFIED API for extracting tags from yt-dlp metadata. + All modules (download_data, merge_file, etc.) should use this function + instead of implementing their own extraction logic. + + Extracts meaningful tags (artist, album, creator, genre, track, etc.) + while excluding technical fields (filesize, duration, format, etc.). + + Args: + entry: yt-dlp entry metadata dictionary from download + + Returns: + List of normalized tag strings in format "namespace:value" + + Example: + >>> entry = {'artist': 'The Beatles', 'album': 'Abbey Road', 'duration': 5247} + >>> tags = extract_ytdlp_tags(entry) + >>> log(tags) + ['artist:The Beatles', 'album:Abbey Road'] + """ + tags: List[str] = [] + seen_namespaces: Set[str] = set() + + # Meaningful yt-dlp fields that should become tags + # This mapping excludes technical fields: filesize, duration, format_id, vcodec, acodec, ext, etc. 
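+    # Illustrative (hypothetical entry): {'artist': 'The Beatles',
+    # 'uploader': 'beatlesVEVO', 'duration': 5247} yields tags like
+    # ['artist:The Beatles', 'creator:beatlesVEVO'], while 'duration' is ignored.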
+ field_to_namespace = { + 'artist': 'artist', + 'album': 'album', + 'creator': 'creator', + 'uploader': 'creator', # Map uploader to creator (deduplicate) + 'uploader_id': 'creator', + 'channel': 'channel', + 'genre': 'genre', + 'track': 'track', + 'track_number': 'track_number', + 'release_date': 'release_date', + 'upload_date': 'upload_date', + 'title': 'title', + 'license': 'license', + 'location': 'location', + } + + # Extract simple field mappings + for yt_field, namespace in field_to_namespace.items(): + value = entry.get(yt_field) + if value is not None: + value_str = value_normalize(str(value)) + if value_str: + # Prevent duplicate creator tags (only use first creator) + if namespace == 'creator': + if 'creator' in seen_namespaces: + continue + seen_namespaces.add('creator') + + _add_tag(tags, namespace, value_str) + + # Handle tags field specially (could be list, dict, or string) + # For list/sequence tags, capture as freeform (no namespace prefix) + tags_field = entry.get('tags') + if tags_field is not None: + if isinstance(tags_field, list): + # Tags is list: ["tag1", "tag2", ...] → capture as freeform tags (no "tag:" prefix) + # These are typically genre/category tags from the source (BandCamp genres, etc.) + for tag_value in tags_field: + if tag_value: + normalized = value_normalize(str(tag_value)) + if normalized and normalized not in tags: + tags.append(normalized) + elif isinstance(tags_field, dict): + # Tags is dict: {"key": "val"} → tag:key:val + for key, val in tags_field.items(): + if key and val: + key_normalized = value_normalize(str(key)) + val_normalized = value_normalize(str(val)) + if key_normalized and val_normalized: + _add_tag(tags, f'tag:{key_normalized}', val_normalized) + else: + # Tags is string or other: add as freeform + if tags_field: + normalized = value_normalize(str(tags_field)) + if normalized and normalized not in tags: + tags.append(normalized) + + return tags + + +def dedup_tags_by_namespace(tags: List[str], keep_first: bool = True) -> List[str]: + """Deduplicate tags by namespace, keeping consistent order. + + This is the UNIFIED API for tag deduplication used across all cmdlets. + Replaces custom deduplication logic in merge_file.py and other modules. + + Groups tags by namespace (e.g., "artist", "album", "tag") and keeps + either the first or last occurrence of each namespace, then preserves + order based on first appearance. + + Args: + tags: List of tags (with or without namespace prefixes) + keep_first: If True, keep first occurrence per namespace (default). + If False, keep last occurrence per namespace. + + Returns: + Deduplicated tag list with consistent order + + Example: + >>> tags = [ + ... 'artist:Beatles', 'album:Abbey Road', + ... 'artist:Beatles', 'tag:rock', + ... 'album:Abbey Road', 'artist:Beatles' + ... ] + >>> dedup = dedup_tags_by_namespace(tags) + >>> log(dedup) + ['artist:Beatles', 'album:Abbey Road', 'tag:rock'] + """ + if not tags: + return [] + + # Group tags by namespace + namespace_to_tags: Dict[Optional[str], List[Tuple[int, str]]] = {} # namespace → [(index, full_tag), ...] 
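+    # e.g. ['artist:A', 'tag:rock', 'artist:B'] groups as
+    # {'artist': [(0, 'artist:A'), (2, 'artist:B')], 'tag': [(1, 'tag:rock')]}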
+ first_appearance: Dict[Optional[str], int] = {} # namespace → first_index + + for idx, tag in enumerate(tags): + # Extract namespace (part before ':') + if ':' in tag: + namespace: Optional[str] = tag.split(':', 1)[0] + else: + namespace = None # No namespace + + # Track first appearance + if namespace not in first_appearance: + first_appearance[namespace] = idx + + # Store tag with its index + if namespace not in namespace_to_tags: + namespace_to_tags[namespace] = [] + namespace_to_tags[namespace].append((idx, tag)) + + # Build result: keep first or last occurrence per namespace + result: List[Tuple[int, str]] = [] # (first_appearance_index, tag) + + for namespace, tag_list in namespace_to_tags.items(): + if keep_first: + chosen_tag = tag_list[0][1] # First occurrence + else: + chosen_tag = tag_list[-1][1] # Last occurrence + + result.append((first_appearance[namespace], chosen_tag)) + + # Sort by first appearance order, then extract tags + result.sort(key=lambda x: x[0]) + return [tag for _, tag in result] + + +def merge_multiple_tag_lists( + sources: List[List[str]], + strategy: str = 'first' +) -> List[str]: + """Intelligently merge multiple tag lists with smart deduplication. + + This is the UNIFIED API for merging tags from multiple sources + (e.g., when merging multiple files or combining metadata sources). + + Strategies: + - 'first': Keep first occurrence of each namespace (default) + - 'all': Keep all different values (different artists possible) + - 'combine': For non-namespace tags, combine all unique values + + Args: + sources: List of tag lists to merge + strategy: Merge strategy - 'first', 'all', or 'combine' + + Returns: + Merged and deduplicated tag list + + Example: + >>> list1 = ['artist:Beatles', 'album:Abbey Road'] + >>> list2 = ['artist:Beatles', 'album:Abbey Road', 'tag:rock'] + >>> merged = merge_multiple_tag_lists([list1, list2]) + >>> log(merged) + ['artist:Beatles', 'album:Abbey Road', 'tag:rock'] + """ + if not sources: + return [] + + if strategy == 'first': + # Concatenate all lists and deduplicate by namespace + all_tags = [] + for tag_list in sources: + all_tags.extend(tag_list or []) + return dedup_tags_by_namespace(all_tags, keep_first=True) + + elif strategy == 'all': + # Keep all different values per namespace + namespace_to_values: Dict[Optional[str], Set[str]] = {} + order: List[Tuple[int, str, str]] = [] # (first_index, namespace, value) + global_index = 0 + + for source in sources: + if not source: + continue + for tag in source: + if ':' in tag: + namespace: Optional[str] = tag.split(':', 1)[0] + value = tag.split(':', 1)[1] + else: + namespace = None + value = tag + + if namespace not in namespace_to_values: + namespace_to_values[namespace] = set() + order.append((global_index, namespace or '', tag)) + elif value not in namespace_to_values[namespace]: + order.append((global_index, namespace or '', tag)) + + namespace_to_values[namespace].add(value) + global_index += 1 + + # Sort by order of first appearance and extract + order.sort(key=lambda x: x[0]) + return [tag for _, _, tag in order] + + elif strategy == 'combine': + # Combine all unique plain (non-namespace) tags + all_tags = [] + namespaced: Dict[str, str] = {} # namespace → tag (first occurrence) + + for source in sources: + if not source: + continue + for tag in source: + if ':' in tag: + namespace = tag.split(':', 1)[0] + if namespace not in namespaced: + namespaced[namespace] = tag + all_tags.append(tag) + else: + if tag not in all_tags: + all_tags.append(tag) + + return all_tags + + 
else: + raise ValueError(f"Unknown merge strategy: {strategy}") + + +def read_tags_from_file(file_path: Path) -> List[str]: + """Read and normalize tags from .tags sidecar file. + + This is the UNIFIED API for reading .tags files across all cmdlets. + Handles normalization, deduplication, and format validation. + + Args: + file_path: Path to .tags sidecar file + + Returns: + List of normalized tag strings + + Raises: + FileNotFoundError: If file doesn't exist + + Example: + >>> tags = read_tags_from_file(Path('file.txt.tags')) + >>> log(tags) + ['artist:Beatles', 'album:Abbey Road'] + """ + file_path = Path(file_path) + if not file_path.exists(): + raise FileNotFoundError(f"Tag file not found: {file_path}") + + tags: List[str] = [] + seen: Set[str] = set() + + try: + with open(file_path, 'r', encoding='utf-8') as f: + for line in f: + # Strip whitespace and skip empty lines + line = line.strip() + if not line: + continue + + # Skip comment lines + if line.startswith('#'): + continue + + # Normalize the tag + normalized = value_normalize(line) + if normalized and normalized not in seen: + seen.add(normalized) + tags.append(normalized) + except Exception as exc: + raise ValueError(f"Error reading tag file {file_path}: {exc}") + + return tags + + +def embed_metadata_in_file( + file_path: Path, + tags: List[str], + file_kind: str = '' +) -> bool: + """Embed metadata tags into a media file using FFmpeg. + + Extracts metadata from tags (namespace:value format) and writes to the file's + metadata using FFmpeg with -c copy (no re-encoding). + + Supported tag namespaces: + - title, artist, album, track/track_number, date/year, genre, composer, comment + + For audio files, applies sensible defaults: + - If no album, uses title as album + - If no track, defaults to 1 + - album_artist is set to artist value + + Args: + file_path: Path to media file + tags: List of tags in format ['namespace:value', ...] 
(e.g., ['artist:Beatles', 'album:Abbey Road']) + file_kind: Type of file: 'audio', 'video', or '' for auto-detect (optional) + + Returns: + True if successful, False otherwise + + Raises: + None (logs errors to stderr) + + Example: + >>> tags = ['artist:Beatles', 'album:Abbey Road', 'track:1'] + >>> success = embed_metadata_in_file(Path('song.mp3'), tags, file_kind='audio') + """ + if not tags: + return True + + file_path = Path(file_path) + + # Tag namespace to FFmpeg metadata key mapping + tag_map = { + 'title': 'title', + 'artist': 'artist', + 'album': 'album', + 'track': 'track', + 'track_number': 'track', + 'date': 'date', + 'year': 'date', + 'genre': 'genre', + 'composer': 'composer', + 'comment': 'comment', + 'known_url': 'comment', # Embed known URLs in comment field + 'creator': 'artist', # Map creator to artist + 'channel': 'album_artist', # Map channel to album_artist + } + + # Extract metadata from tags + metadata = {} + comments = [] # Collect comments (including URLs) + for tag in tags: + tag_str = str(tag).strip() + if ':' in tag_str: + namespace, value = tag_str.split(':', 1) + namespace = namespace.lower().strip() + value = value.strip() + if namespace in tag_map and value: + ffmpeg_key = tag_map[namespace] + if namespace == 'known_url': + # Collect URLs as comments + comments.append(f"URL: {value}") + elif ffmpeg_key == 'comment': + # Collect other comment-type tags + comments.append(value) + elif ffmpeg_key not in metadata: + # Don't overwrite if already set from earlier tag + metadata[ffmpeg_key] = value + + # Add collected comments to metadata + if comments: + if 'comment' in metadata: + metadata['comment'] = metadata['comment'] + ' | ' + ' | '.join(comments) + else: + metadata['comment'] = ' | '.join(comments) + + # Apply sensible defaults for audio files + if file_kind == 'audio' or (not file_kind and file_path.suffix.lower() in {'.mp3', '.flac', '.wav', '.m4a', '.aac', '.ogg', '.opus', '.mka'}): + # If no album, use title as album + if 'album' not in metadata and 'title' in metadata: + metadata['album'] = metadata['title'] + # If no track, default to 1 + if 'track' not in metadata: + metadata['track'] = '1' + # If no album_artist, use artist + if 'artist' in metadata: + metadata['album_artist'] = metadata['artist'] + + if not metadata: + return True + + # Check if FFmpeg is available + ffmpeg_path = shutil.which('ffmpeg') + if not ffmpeg_path: + log(f"⚠️ FFmpeg not found; cannot embed metadata in {file_path.name}", file=sys.stderr) + return False + + # Create temporary file for output + temp_file = file_path.parent / f"{file_path.stem}.ffmpeg_tmp{file_path.suffix}" + try: + cmd = [ffmpeg_path, '-y', '-i', str(file_path)] + for key, value in metadata.items(): + cmd.extend(['-metadata', f'{key}={value}']) + cmd.extend(['-c', 'copy', str(temp_file)]) + + # Run ffmpeg with error handling for non-UTF8 output + result = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=False, # Don't decode as text - ffmpeg may output binary data + timeout=30 + ) + if result.returncode == 0 and temp_file.exists(): + # Replace original with temp file + file_path.unlink() + temp_file.rename(file_path) + log(f"✅ Embedded metadata in file: {file_path.name}", file=sys.stderr) + return True + else: + # Clean up temp file if it exists + if temp_file.exists(): + temp_file.unlink() + log(f"❌ FFmpeg metadata embedding failed for {file_path.name}", file=sys.stderr) + if result.stderr: + # Safely decode stderr, ignoring invalid UTF-8 bytes + try: + stderr_text = 
result.stderr.decode('utf-8', errors='replace')[:200] + log(f"FFmpeg stderr: {stderr_text}", file=sys.stderr) + except Exception: + pass + return False + except Exception as exc: + if temp_file.exists(): + try: + temp_file.unlink() + except Exception: + pass + log(f"❌ Error embedding metadata: {exc}", file=sys.stderr) + return False + + +def write_tags_to_file( + file_path: Path, + tags: List[str], + source_hashes: Optional[List[str]] = None, + known_urls: Optional[List[str]] = None, + append: bool = False +) -> bool: + """Write tags to .tags sidecar file. + + This is the UNIFIED API for writing .tags files across all cmdlets. + Uses consistent format and handles file creation/overwriting. + + Args: + file_path: Path to .tags file (will be created if doesn't exist) + tags: List of tags to write + source_hashes: Optional source file hashes (written as source:hash1,hash2) + known_urls: Optional known URLs (each written on separate line as known_url:url) + append: If True, append to existing file; if False, overwrite (default) + + Returns: + True if successful + + Raises: + Exception: If file write fails + + Example: + >>> tags = ['artist:Beatles', 'album:Abbey Road'] + >>> write_tags_to_file(Path('file.txt.tags'), tags) + True + """ + file_path = Path(file_path) + + try: + # Prepare content + content_lines: List[str] = [] + + # Add source hashes if provided + if source_hashes: + content_lines.append(f"source:{','.join(source_hashes)}") + + # Add known URLs if provided - each on separate line to prevent corruption + if known_urls: + for url in known_urls: + content_lines.append(f"known_url:{url}") + + # Add tags + if tags: + content_lines.extend(tags) + + # Write to file + mode = 'a' if (append and file_path.exists()) else 'w' + with open(file_path, mode, encoding='utf-8') as f: + for line in content_lines: + f.write(line + '\n') + + return True + except Exception as exc: + raise ValueError(f"Error writing tag file {file_path}: {exc}") + + +def normalize_tags_from_source( + source_data: Any, + source_type: str = 'auto' +) -> List[str]: + """Normalize tags from any source format. + + Universal function to normalize tags from different sources: + - yt-dlp entry dicts + - Raw tag lists + - .tags file content strings + - Metadata dictionaries + + Args: + source_data: Source data (type determined by source_type or auto-detected) + source_type: One of 'auto', 'ytdlp', 'list', 'text', 'dict' + 'auto' attempts to auto-detect the type + + Returns: + Normalized, deduplicated tag list + + Example: + >>> entry = {'artist': 'Beatles', 'album': 'Abbey Road'} + >>> tags = normalize_tags_from_source(entry, 'ytdlp') + >>> log(tags) + ['artist:Beatles', 'album:Abbey Road'] + """ + if source_type == 'auto': + # Auto-detect source type + if isinstance(source_data, dict): + # Check if it looks like a yt-dlp entry (has id, title, url, etc.) 
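+            # e.g. {'title': 'Abbey Road', 'uploader': 'EMI'} is treated as 'ytdlp',
+            # while a dict with none of these keys (say {'isbn': '...'}) falls back to 'dict'.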
+            if 'id' in source_data or 'title' in source_data or 'uploader' in source_data:
+                source_type = 'ytdlp'
+            else:
+                source_type = 'dict'
+        elif isinstance(source_data, list):
+            source_type = 'list'
+        elif isinstance(source_data, str):
+            source_type = 'text'
+        else:
+            source_type = 'dict'
+
+    # Process based on detected/specified type
+    if source_type == 'ytdlp':
+        if not isinstance(source_data, dict):
+            raise ValueError("ytdlp source must be a dict")
+        return extract_ytdlp_tags(source_data)
+
+    elif source_type == 'list':
+        if not isinstance(source_data, (list, tuple)):
+            raise ValueError("list source must be a list or tuple")
+        # Normalize each tag in the list
+        result = []
+        for tag in source_data:
+            normalized = value_normalize(str(tag))
+            if normalized:
+                result.append(normalized)
+        return result
+
+    elif source_type == 'text':
+        if not isinstance(source_data, str):
+            raise ValueError("text source must be a string")
+        # Split by lines and normalize
+        lines = source_data.split('\n')
+        result = []
+        seen = set()
+        for line in lines:
+            line = line.strip()
+            if line and not line.startswith('#'):
+                normalized = value_normalize(line)
+                if normalized and normalized not in seen:
+                    seen.add(normalized)
+                    result.append(normalized)
+        return result
+
+    elif source_type == 'dict':
+        if not isinstance(source_data, dict):
+            raise ValueError("dict source must be a dict")
+        # Extract as generic metadata (similar to yt-dlp but from any dict)
+        return extract_ytdlp_tags(source_data)
+
+    else:
+        raise ValueError(f"Unknown source type: {source_type}")
+
+
+def detect_metadata_request(tag: str) -> Optional[Dict[str, str]]:
+    trimmed = value_normalize(tag)
+    if not trimmed:
+        return None
+    lower = trimmed.lower()
+    imdb_match = re.match(r'^imdb:\s*(tt[\w]+)$', lower)
+    if imdb_match:
+        imdb_id = imdb_match.group(1)
+        return {
+            'source': 'imdb',
+            'id': imdb_id,
+            'base': f'imdb:{imdb_id}',
+        }
+    remainder = re.match(r'^musicbrainz:\s*(.+)$', lower)
+    if remainder:
+        raw = remainder.group(1)
+        entity = 'release'
+        identifier = raw
+        # Named groups are required: they are read back via group('entity') / group('id') below.
+        specific = re.match(r'^(?P<entity>[a-zA-Z]+)\s*:\s*(?P<id>[\w-]+)$', raw)
+        if specific:
+            entity = specific.group('entity')
+            identifier = specific.group('id')
+        identifier = identifier.replace(' ', '')
+        if identifier:
+            return {
+                'source': 'musicbrainz',
+                'entity': entity.lower(),
+                'id': identifier,
+                'base': f'musicbrainz:{identifier}',
+            }
+    return None
+
+
+def expand_metadata_tag(payload: Dict[str, Any]) -> Dict[str, Any]:
+    tag = payload.get('tag')
+    if not isinstance(tag, str):
+        return {'tags': []}
+    trimmed = value_normalize(tag)
+    if not trimmed:
+        return {'tags': []}
+    request = detect_metadata_request(trimmed)
+    tags: List[str] = []
+    seen: Set[str] = set()
+    if request:
+        _append_unique(tags, seen, request['base'])
+    else:
+        # Not an imdb:/musicbrainz: reference; return the tag itself unchanged
+        # (the fetch below would dereference a None request otherwise).
+        _append_unique(tags, seen, trimmed)
+        return {'tags': tags}
+    try:
+        if request['source'] == 'imdb':
+            data = imdb_tag(request['id'])
+        else:
+            data = fetch_musicbrainz_tags(request['id'], request['entity'])
+    except Exception as exc:  # pragma: no cover - network/service errors
+        return {'tags': tags, 'error': str(exc)}
+    # Add tags from fetched data (no namespace, just unique append)
+    for tag in (data.get('tags') or []):
+        _append_unique(tags, seen, tag)
+    result = {
+        'tags': tags,
+        'source': request['source'],
+        'id': request['id'],
+    }
+    if request['source'] == 'musicbrainz':
+        result['entity'] = request['entity']
+    return result
+
+
+def build_remote_bundle(metadata: Optional[Dict[str, Any]], existing: Optional[Sequence[str]] = None, context:
Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + metadata = metadata or {} + context = context or {} + tags: List[str] = [] + seen: Set[str] = set() + if existing: + for tag in existing: + _append_unique(tags, seen, tag) + + # Add tags from various sources + for tag in (metadata.get("tags") or []): + _append_unique(tags, seen, tag) + for tag in (metadata.get("categories") or []): + _append_unique(tags, seen, tag) + + # Extract and namespace genres + raw_genres = metadata.get("genres") + keywords = metadata.get("keywords") + if isinstance(keywords, str): + for token in keywords.split(","): + _append_unique(tags, seen, token) + if raw_genres: + for genre in (raw_genres if isinstance(raw_genres, (list, tuple)) else [raw_genres]): + if genre: + _append_unique(tags, seen, f"genre:{genre}") + + # Extract creators/artists + artists = metadata.get("artists") or metadata.get("artist") + if artists: + artist_list = artists if isinstance(artists, (list, tuple)) else [artists] + for artist in artist_list: + if artist: + _append_unique(tags, seen, f"creator:{artist}") + + creator = metadata.get("uploader") or metadata.get("channel") or metadata.get("artist") or metadata.get("creator") + if creator: + _append_unique(tags, seen, f"creator:{creator}") + + # Extract title + title_value = metadata.get("title") + if title_value: + _extend_namespaced(tags, seen, "title", [title_value]) + source_url = context.get("source_url") or metadata.get("original_url") or metadata.get("webpage_url") or metadata.get("url") + clean_title = value_normalize(str(title_value)) if title_value is not None else None + result = { + "tags": tags, + "title": clean_title, + "source_url": _sanitize_url(source_url), + "duration": _coerce_duration(metadata), + "metadata": metadata, + } + return result +def _load_payload(value: Optional[str]) -> Dict[str, Any]: + text = value + if text is None: + text = sys.stdin.read() + if text is None or text.strip() == "": + raise ValueError("Expected JSON payload") + data = json.loads(text) + if not isinstance(data, dict): + raise ValueError("Payload must be a JSON object") + return data + + +import typer + +app = typer.Typer(help="Fetch metadata tags for known services") + +@app.command(help="Lookup an IMDb title") +def imdb(imdb_id: str = typer.Argument(..., help="IMDb identifier (ttXXXXXXX)")): + """Lookup an IMDb title.""" + try: + result = imdb_tag(imdb_id) + log(json.dumps(result, ensure_ascii=False), flush=True) + except Exception as exc: + error_payload = {"error": str(exc)} + log(json.dumps(error_payload, ensure_ascii=False), flush=True) + raise typer.Exit(code=1) + +@app.command(help="Lookup a MusicBrainz entity") +def musicbrainz( + mbid: str = typer.Argument(..., help="MusicBrainz identifier (UUID)"), + entity: str = typer.Option("release", help="Entity type (release, recording, artist)") +): + """Lookup a MusicBrainz entity.""" + try: + result = fetch_musicbrainz_tags(mbid, entity) + log(json.dumps(result, ensure_ascii=False), flush=True) + except Exception as exc: + error_payload = {"error": str(exc)} + log(json.dumps(error_payload, ensure_ascii=False), flush=True) + raise typer.Exit(code=1) + +@app.command(name="remote-tags", help="Normalize a remote metadata payload") +def remote_tags(payload: Optional[str] = typer.Option(None, "--payload", help="JSON payload; reads stdin if omitted")): + """Normalize a remote metadata payload.""" + try: + payload_data = _load_payload(payload) + metadata = payload_data.get("metadata") or {} + existing = payload_data.get("existing_tags") or [] + 
context = payload_data.get("context") or {} + if not isinstance(existing, list): + raise ValueError("existing_tags must be a list") + if context and not isinstance(context, dict): + raise ValueError("context must be an object") + result = build_remote_bundle(metadata, existing, context) + log(json.dumps(result, ensure_ascii=False), flush=True) + except Exception as exc: + error_payload = {"error": str(exc)} + log(json.dumps(error_payload, ensure_ascii=False), flush=True) + raise typer.Exit(code=1) + +@app.command(name="remote-fetch", help="Resolve remote metadata bundle") +def remote_fetch(payload: Optional[str] = typer.Option(None, "--payload", help="JSON payload; reads stdin if omitted")): + """Resolve remote metadata bundle.""" + try: + payload_data = _load_payload(payload) + result = resolve_remote_metadata(payload_data) + log(json.dumps(result, ensure_ascii=False), flush=True) + except Exception as exc: + error_payload = {"error": str(exc)} + log(json.dumps(error_payload, ensure_ascii=False), flush=True) + raise typer.Exit(code=1) + +@app.command(name="expand-tag", help="Expand metadata references into tags") +def expand_tag(payload: Optional[str] = typer.Option(None, "--payload", help="JSON payload; reads stdin if omitted")): + """Expand metadata references into tags.""" + try: + payload_data = _load_payload(payload) + result = expand_metadata_tag(payload_data) + log(json.dumps(result, ensure_ascii=False), flush=True) + except Exception as exc: + error_payload = {"error": str(exc)} + log(json.dumps(error_payload, ensure_ascii=False), flush=True) + raise typer.Exit(code=1) + +@app.command(name="hydrus-fetch", help="Fetch Hydrus metadata for a file") +def hydrus_fetch(payload: Optional[str] = typer.Option(None, "--payload", help="JSON payload; reads stdin if omitted")): + """Fetch Hydrus metadata for a file.""" + try: + payload_data = _load_payload(payload) + result = fetch_hydrus_metadata(payload_data) + log(json.dumps(result, ensure_ascii=False), flush=True) + except Exception as exc: + error_payload = {"error": str(exc)} + log(json.dumps(error_payload, ensure_ascii=False), flush=True) + raise typer.Exit(code=1) + +@app.command(name="hydrus-fetch-url", help="Fetch Hydrus metadata using a source URL") +def hydrus_fetch_url(payload: Optional[str] = typer.Option(None, "--payload", help="JSON payload; reads stdin if omitted")): + """Fetch Hydrus metadata using a source URL.""" + try: + payload_data = _load_payload(payload) + result = fetch_hydrus_metadata_by_url(payload_data) + log(json.dumps(result, ensure_ascii=False), flush=True) + except Exception as exc: + error_payload = {"error": str(exc)} + log(json.dumps(error_payload, ensure_ascii=False), flush=True) + raise typer.Exit(code=1) + +@app.command(name="sync-sidecar", help="Synchronise .tags sidecar with supplied data") +def sync_sidecar_cmd(payload: Optional[str] = typer.Option(None, "--payload", help="JSON payload; reads stdin if omitted")): + """Synchronise .tags sidecar with supplied data.""" + try: + payload_data = _load_payload(payload) + result = sync_sidecar(payload_data) + log(json.dumps(result, ensure_ascii=False), flush=True) + except Exception as exc: + error_payload = {"error": str(exc)} + log(json.dumps(error_payload, ensure_ascii=False), flush=True) + raise typer.Exit(code=1) + +@app.command(name="update-tag", help="Update or rename a tag") +def update_tag_cmd(payload: Optional[str] = typer.Option(None, "--payload", help="JSON payload; reads stdin if omitted")): + """Update or rename a tag.""" + try: + payload_data = 
_load_payload(payload) + result = apply_tag_mutation(payload_data, 'update') + log(json.dumps(result, ensure_ascii=False), flush=True) + except Exception as exc: + error_payload = {"error": str(exc)} + log(json.dumps(error_payload, ensure_ascii=False), flush=True) + raise typer.Exit(code=1) + +def main(argv: Optional[List[str]] = None) -> int: + """Main entry point using Typer.""" + try: + app(argv, standalone_mode=False) + return 0 + except SystemExit as e: + return e.code if isinstance(e.code, int) else 1 + + +# ============================================================================ +# TAG OPERATIONS - Consolidated from tag_operations.py and tag_helpers.py +# ============================================================================ + +def sort_tags(tags: List[str]) -> List[str]: + """ + Sort tags into namespace tags and freeform tags, then alphabetically. + + Args: + tags: List of tag strings + + Returns: + Sorted list with namespace tags first, then freeform tags + """ + if not tags: + return [] + + namespace_tags = [] + freeform_tags = [] + + for tag in tags: + if isinstance(tag, str): + if ':' in tag: + namespace_tags.append(tag) + else: + freeform_tags.append(tag) + + namespace_tags.sort() + freeform_tags.sort() + + return namespace_tags + freeform_tags + + +def format_tags_display(tags: List[str], namespace_filter: Optional[str] = None) -> List[str]: + """ + Format tags for display, optionally filtered by namespace. + + Args: + tags: List of tags + namespace_filter: Optional namespace to filter by (e.g., "creator:") + + Returns: + Formatted list of tags + """ + if not tags: + return [] + + if namespace_filter: + filtered = [t for t in tags if t.startswith(namespace_filter)] + return sort_tags(filtered) + + return sort_tags(tags) + + +def split_tag(tag: str) -> tuple[str, str]: + """ + Split a tag into namespace and value. + + Args: + tag: Tag string (e.g., "creator:Author Name" or "freeform tag") + + Returns: + Tuple of (namespace, value). For freeform tags, namespace is empty string. + """ + if ':' in tag: + parts = tag.split(':', 1) + return parts[0], parts[1] + return '', tag + + +def filter_tags_by_namespace(tags: List[str], namespace: str) -> List[str]: + """ + Get all tags in a specific namespace. + + Args: + tags: List of tags + namespace: Namespace to filter by + + Returns: + List of values in that namespace + """ + prefix = namespace + ':' + return [split_tag(t)[1] for t in tags if t.startswith(prefix)] + + +def ensure_title_tag(tags: List[str], title: str) -> List[str]: + """ + Ensure there's a title: tag with the given title. + + Args: + tags: List of existing tags + title: Title to ensure exists + + Returns: + Updated tag list + """ + if not title: + return tags + + # Remove any existing title tags + filtered = [t for t in tags if not t.startswith('title:')] + + # Add new title tag + new_tags = filtered + [f'title:{title}'] + + return sort_tags(new_tags) + + +def remove_title_tags(tags: List[str]) -> List[str]: + """Remove all title: tags.""" + return [t for t in tags if not t.startswith('title:')] + + +def is_namespace_tag(tag: str) -> bool: + """Check if a tag is a namespace tag (contains :).""" + return ':' in tag if isinstance(tag, str) else False + + +def validate_tag(tag: str) -> bool: + """ + Validate that a tag is properly formatted. 
+ + Args: + tag: Tag to validate + + Returns: + True if tag is valid + """ + if not isinstance(tag, str) or not tag.strip(): + return False + + # Tag shouldn't have leading/trailing whitespace + if tag != tag.strip(): + return False + + # Tag shouldn't be empty + if not tag: + return False + + return True + + +def normalize_tags(tags: List[Any]) -> List[str]: + """ + Normalize a tag list by filtering and cleaning. + + Args: + tags: List of tags (may contain invalid entries) + + Returns: + Cleaned list of valid tags + """ + if not tags: + return [] + + normalized = [] + for tag in tags: + if isinstance(tag, str): + trimmed = tag.strip() + if trimmed and validate_tag(trimmed): + normalized.append(trimmed) + + return sort_tags(normalized) + + +def merge_tag_lists(*tag_lists: List[str]) -> List[str]: + """ + Merge multiple tag lists, removing duplicates. + + Args: + *tag_lists: Variable number of tag lists + + Returns: + Merged, deduplicated, sorted list + """ + merged = set() + for tag_list in tag_lists: + if isinstance(tag_list, list): + merged.update(tag_list) + + return sort_tags(list(merged)) + + +def tag_diff(old_tags: List[str], new_tags: List[str]) -> Dict[str, List[str]]: + """ + Calculate the difference between two tag lists. + + Args: + old_tags: Original tags + new_tags: New tags + + Returns: + Dict with 'added' and 'removed' keys + """ + old_set = set(old_tags) if old_tags else set() + new_set = set(new_tags) if new_tags else set() + + return { + 'added': sorted(list(new_set - old_set)), + 'removed': sorted(list(old_set - new_set)) + } + + +def expand_tag_lists(tags_set: Set[str]) -> Set[str]: + """Expand tag list references like {psychology} to actual tags from adjective.json. + + Removes the reference after expansion (e.g., {psychology} is deleted, psychology tags added). + + Args: + tags_set: Set of tag strings that may include {list_name} references + + Returns: + Set of expanded tags with all {list_name} references replaced with actual tags + """ + # Load adjective.json from workspace root + adjective_path = Path(__file__).parent / "adjective.json" + if not adjective_path.exists(): + log.debug(f"adjective.json not found at {adjective_path}") + return tags_set + + try: + with open(adjective_path, 'r') as f: + adjective_lists = json.load(f) + except Exception as e: + log.error(f"Error loading adjective.json: {e}") + return tags_set + + expanded_tags = set() + for tag in tags_set: + # Check if tag is a list reference like {psychology} + if tag.startswith('{') and tag.endswith('}'): + list_name = tag[1:-1].lower() # Extract name, make lowercase + + # Find matching list (case-insensitive) + matched_list = None + for key in adjective_lists.keys(): + if key.lower() == list_name: + matched_list = adjective_lists[key] + break + + if matched_list: + # Add all tags from the list + expanded_tags.update(matched_list) + log.info(f"Expanded {tag} to {len(matched_list)} tags") + else: + # List not found, log warning but don't add the reference + log.warning(f"Tag list '{list_name}' not found in adjective.json") + else: + # Regular tag, keep as is + expanded_tags.add(tag) + + return expanded_tags + + +def process_tags_from_string(tags_str: str, expand_lists: bool = False) -> Set[str]: + """Process a tag string into a set of tags. 
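+
+    Illustrative behavior (the delimiter is guessed from the input; a sketch, not exhaustive):
+        "rock, jazz" and a newline-separated "rock" / "jazz" both yield {"rock", "jazz"},
+        and "{psychology}" expands to the psychology list from adjective.json (if that list
+        exists) when expand_lists=True.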
+ + Handles: + - Multiple formats: comma-separated, newline-separated, space-separated + - Tag list expansion: {psychology} -> psychology tags (if expand_lists=True) + - Whitespace trimming + + Args: + tags_str: Raw tag string + expand_lists: If True, expand {list_name} references using adjective.json + + Returns: + Set of processed tags + """ + if not tags_str: + return set() + + # Try to detect delimiter and split accordingly + # Prefer newlines, then commas, then spaces + if '\n' in tags_str: + delimiter = '\n' + elif ',' in tags_str: + delimiter = ',' + else: + delimiter = ' ' + + # Split and clean tags + tags_set = set() + for tag in tags_str.split(delimiter): + tag = tag.strip() + if tag: + tags_set.add(tag) + + # Expand list references if requested + if expand_lists: + tags_set = expand_tag_lists(tags_set) + + return tags_set + + +def fetch_openlibrary_metadata_tags(isbn: Optional[str] = None, olid: Optional[str] = None) -> List[str]: + """Fetch book metadata from OpenLibrary and return as tags. + + Args: + isbn: ISBN number (with or without isbn: prefix) + olid: OpenLibrary ID + + Returns: + List of tags extracted from OpenLibrary metadata + """ + metadata_tags = [] + + # Try OLID first (preferred), then ISBN + url = None + + if olid: + # Clean up OLID format + olid_clean = str(olid).replace('OL', '').replace('M', '').replace('W', '') + if olid_clean.isdigit(): + url = f"https://openlibrary.org/books/OL{olid_clean}M.json" + else: + url = f"https://openlibrary.org/books/{olid}.json" + elif isbn: + # Clean up ISBN + isbn_clean = str(isbn).replace('isbn:', '').strip() + url = f"https://openlibrary.org/isbn/{isbn_clean}.json" + + if not url: + return metadata_tags + + try: + response = requests.get(url, timeout=10) + if response.status_code != 200: + return metadata_tags + + data = response.json() + if not data: + return metadata_tags + + # Extract title + if 'title' in data: + metadata_tags.append(f"title:{data['title']}") + + # Extract authors + if 'authors' in data and isinstance(data['authors'], list): + for author in data['authors'][:3]: + if isinstance(author, dict) and 'name' in author: + metadata_tags.append(f"author:{author['name']}") + elif isinstance(author, str): + metadata_tags.append(f"author:{author}") + + # Extract publish date + if 'publish_date' in data: + metadata_tags.append(f"publish_date:{data['publish_date']}") + + # Extract publishers + if 'publishers' in data and isinstance(data['publishers'], list): + for pub in data['publishers'][:1]: + if isinstance(pub, dict) and 'name' in pub: + metadata_tags.append(f"publisher:{pub['name']}") + elif isinstance(pub, str): + metadata_tags.append(f"publisher:{pub}") + + # Extract number of pages + if 'number_of_pages' in data: + page_count = data['number_of_pages'] + if page_count and isinstance(page_count, int) and page_count > 0: + metadata_tags.append(f"pages:{page_count}") + + # Extract language + if 'languages' in data and isinstance(data['languages'], list) and data['languages']: + lang = data['languages'][0] + if isinstance(lang, dict) and 'key' in lang: + lang_code = lang['key'].split('/')[-1] + metadata_tags.append(f"language:{lang_code}") + elif isinstance(lang, str): + metadata_tags.append(f"language:{lang}") + + # Extract subjects as freeform tags (limit to 5) + if 'subjects' in data and isinstance(data['subjects'], list): + for subject in data['subjects'][:5]: + if subject and isinstance(subject, str): + subject_clean = str(subject).strip() + if subject_clean: + metadata_tags.append(subject_clean) + + except 
Exception as e: + log(f"⚠ Failed to fetch OpenLibrary metadata: {e}") + + return metadata_tags + + +def enrich_playlist_entries(entries: list, extractor: str) -> list: + """Enrich playlist entries with full metadata by fetching individual entry info. + + When extract_flat is used, entries contain minimal info (title, id, url). + This function fetches full metadata for each entry. + + Args: + entries: List of entry dicts from probe_url + extractor: Extractor name + + Returns: + List of enriched entry dicts + """ + # Import here to avoid circular dependency + from helper.download import is_url_supported_by_ytdlp + + if not entries: + return entries + + enriched = [] + for entry in entries: + # If entry has a direct URL, fetch its full metadata + entry_url = entry.get("url") + if entry_url and is_url_supported_by_ytdlp(entry_url): + try: + import yt_dlp + ydl_opts = { + "quiet": True, + "no_warnings": True, + "skip_download": True, + "noprogress": True, + "socket_timeout": 5, + "retries": 1, + } + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + full_info = ydl.extract_info(entry_url, download=False) + if full_info: + enriched.append(full_info) + continue + except Exception: + pass + + # Fallback to original entry if fetch failed + enriched.append(entry) + + return enriched + + +def format_playlist_entry(entry: Dict[str, Any], index: int, extractor: str) -> Dict[str, Any]: + """Format a playlist entry for display in result table. + + Args: + entry: Single playlist entry from yt-dlp (fully enriched if possible) + index: 1-based track number + extractor: Extractor name (youtube, bandcamp, spotify, etc.) + + Returns: + Dict with displayable fields for result table + """ + result = { + "index": index, + "title": entry.get("title", "Unknown"), + "duration": entry.get("duration") or entry.get("length") or 0, + "uploader": entry.get("uploader") or entry.get("creator") or "", + "artist": entry.get("artist") or entry.get("uploader") or entry.get("creator") or "", + "album": entry.get("album") or "", + "track_number": entry.get("track_number") or index, + } + + # Normalize extractor for comparison + ext_lower = extractor.lower().replace(":", "").replace(" ", "") + + # Add site-specific fields + if "youtube" in ext_lower: + result["video_id"] = entry.get("id", "") + result["channel"] = entry.get("uploader") or entry.get("channel", "") + result["views"] = entry.get("view_count", 0) + + elif "bandcamp" in ext_lower: + result["track_number"] = entry.get("track_number") or index + # For Bandcamp album entries, track info may be in different fields + result["artist"] = entry.get("artist") or entry.get("uploader", "") + result["album"] = entry.get("album") or "" + + elif "spotify" in ext_lower: + result["artists"] = entry.get("creator") or entry.get("uploader", "") + result["album"] = entry.get("album", "") + result["release_date"] = entry.get("release_date", "") + + return result diff --git a/models.py b/models.py new file mode 100644 index 0000000..48102c5 --- /dev/null +++ b/models.py @@ -0,0 +1,678 @@ +"""Data models for the pipeline.""" + +import datetime +import hashlib +import json +import math +import os +import shutil +import sys +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Protocol, TextIO, Tuple + + +@dataclass(slots=True) +class PipeObject: + """Unified pipeline object for tracking files, metadata, tags, and relationships through the pipeline. + + This is the single source of truth for all result data in the pipeline. 
It can represent: + - Tag extraction results (IMDb, MusicBrainz, OpenLibrary lookups) + - Remote metadata fetches + - File operations with metadata/tags and relationship tracking + - Search results + - Files with version relationships (king/alt/related) + + Attributes: + source: Source of the object (e.g., 'imdb', 'musicbrainz', 'libgen', 'debrid', 'file', etc.) + identifier: Unique identifier from the source (e.g., IMDb ID, MBID, magnet hash, file hash) + tags: List of extracted or assigned tags + title: Human-readable title if applicable + source_url: URL where the object came from + duration: Duration in seconds if applicable + metadata: Full metadata dictionary from source + remote_metadata: Additional remote metadata + warnings: Any warnings or issues encountered + mpv_metadata: MPV-specific metadata if applicable + file_path: Path to the file if this object represents a file + file_hash: SHA-256 hash of the file for integrity and relationship tracking + king_hash: Hash of the primary/master version of this file (for alternates) + alt_hashes: List of hashes for alternate versions of this file + related_hashes: List of hashes for related files (e.g., screenshots, editions) + is_temp: If True, this is a temporary/intermediate artifact that may be cleaned up + action: The cmdlet that created this object (format: 'cmdlet:cmdlet_name', e.g., 'cmdlet:get-file') + parent_id: Hash of the parent file in the pipeline chain (for tracking provenance/lineage) + extra: Additional fields not covered above + """ + source: str + identifier: str + tags: List[str] = field(default_factory=list) + title: Optional[str] = None + source_url: Optional[str] = None + duration: Optional[float] = None + metadata: Dict[str, Any] = field(default_factory=dict) + remote_metadata: Optional[Dict[str, Any]] = None + warnings: List[str] = field(default_factory=list) + mpv_metadata: Optional[Dict[str, Any]] = None + file_path: Optional[str] = None + file_hash: Optional[str] = None + king_hash: Optional[str] = None + alt_hashes: List[str] = field(default_factory=list) + related_hashes: List[str] = field(default_factory=list) + is_temp: bool = False + action: Optional[str] = None + parent_id: Optional[str] = None + extra: Dict[str, Any] = field(default_factory=dict) + + def register_as_king(self, file_hash: str) -> None: + """Register this object as the king (primary) version of a file.""" + self.king_hash = file_hash + + def add_alternate(self, alt_hash: str) -> None: + """Add an alternate version hash for this file.""" + if alt_hash not in self.alt_hashes: + self.alt_hashes.append(alt_hash) + + def add_related(self, related_hash: str) -> None: + """Add a related file hash (e.g., screenshot, edition).""" + if related_hash not in self.related_hashes: + self.related_hashes.append(related_hash) + + def get_relationships(self) -> Dict[str, Any]: + """Get all relationships for this object.""" + rels = {} + if self.king_hash: + rels["king"] = self.king_hash + if self.alt_hashes: + rels["alt"] = self.alt_hashes + if self.related_hashes: + rels["related"] = self.related_hashes + return rels + + def to_dict(self) -> Dict[str, Any]: + """Serialize to dictionary, excluding None and empty values.""" + data: Dict[str, Any] = { + "source": self.source, + "tags": self.tags, + } + if self.identifier: + data["id"] = self.identifier + if self.title: + data["title"] = self.title + if self.source_url: + data["source_url"] = self.source_url + if self.duration is not None: + data["duration"] = self.duration + if self.metadata: + 
data["metadata"] = self.metadata + if self.remote_metadata is not None: + data["remote_metadata"] = self.remote_metadata + if self.mpv_metadata is not None: + data["mpv_metadata"] = self.mpv_metadata + if self.warnings: + data["warnings"] = self.warnings + if self.file_path: + data["file_path"] = self.file_path + if self.file_hash: + data["file_hash"] = self.file_hash + # Include pipeline chain tracking fields + if self.is_temp: + data["is_temp"] = self.is_temp + if self.action: + data["action"] = self.action + if self.parent_id: + data["parent_id"] = self.parent_id + # Include relationship data if present + rels = self.get_relationships() + if rels: + data["relationships"] = rels + data.update({k: v for k, v in self.extra.items() if v is not None}) + return data + + @property + def hash(self) -> str: + """Compute SHA-256 hash from source and identifier.""" + base = f"{self.source}:{self.identifier}" + return hashlib.sha256(base.encode('utf-8')).hexdigest() + + # Backwards compatibility aliases + def as_dict(self) -> Dict[str, Any]: + """Alias for to_dict() for backwards compatibility.""" + return self.to_dict() + + def to_serializable(self) -> Dict[str, Any]: + """Alias for to_dict() for backwards compatibility.""" + return self.to_dict() + + +class FileRelationshipTracker: + """Track relationships between files for sidecar creation. + + Allows tagging files with their relationships to other files: + - king: The primary/master version of a file + - alt: Alternate versions of the same content + - related: Related files (e.g., screenshots of a book) + """ + + def __init__(self) -> None: + self.relationships: Dict[str, Dict[str, Any]] = {} + + def register_king(self, file_path: str, file_hash: str) -> None: + """Register a file as the king (primary) version.""" + if file_path not in self.relationships: + self.relationships[file_path] = {} + self.relationships[file_path]["king"] = file_hash + + def add_alt(self, file_path: str, alt_hash: str) -> None: + """Add an alternate version of a file.""" + if file_path not in self.relationships: + self.relationships[file_path] = {} + if "alt" not in self.relationships[file_path]: + self.relationships[file_path]["alt"] = [] + if alt_hash not in self.relationships[file_path]["alt"]: + self.relationships[file_path]["alt"].append(alt_hash) + + def add_related(self, file_path: str, related_hash: str) -> None: + """Add a related file.""" + if file_path not in self.relationships: + self.relationships[file_path] = {} + if "related" not in self.relationships[file_path]: + self.relationships[file_path]["related"] = [] + if related_hash not in self.relationships[file_path]["related"]: + self.relationships[file_path]["related"].append(related_hash) + + def get_relationships(self, file_path: str) -> Optional[Dict[str, Any]]: + """Get relationships for a file.""" + return self.relationships.get(file_path) + + def link_files(self, primary_path: str, king_hash: str, *alt_paths: str) -> None: + """Link files together with primary as king and others as alternates. 
+ + Args: + primary_path: Path to the primary file (will be marked as 'king') + king_hash: Hash of the primary file + alt_paths: Paths to alternate versions (will be marked as 'alt') + """ + self.register_king(primary_path, king_hash) + for alt_path in alt_paths: + try: + alt_hash = _get_file_hash(alt_path) + self.add_alt(primary_path, alt_hash) + except Exception as e: + import sys + print(f"Error hashing {alt_path}: {e}", file=sys.stderr) + + +def _get_file_hash(filepath: str) -> str: + """Calculate SHA256 hash of a file.""" + sha256_hash = hashlib.sha256() + with open(filepath, "rb") as f: + for byte_block in iter(lambda: f.read(4096), b""): + sha256_hash.update(byte_block) + return sha256_hash.hexdigest() + + +# ============= Download Module Classes ============= + +class DownloadError(RuntimeError): + """Raised when the download or Hydrus import fails.""" + + +@dataclass(slots=True) +class DownloadOptions: + """Configuration for downloading media. + + Use the add-file cmdlet separately for Hydrus import. + """ + url: str + mode: str # "audio" or "video" + output_dir: Path + cookies_path: Optional[Path] = None + ytdl_format: Optional[str] = None + extra_tags: Optional[List[str]] = None + debug_log: Optional[Path] = None + native_progress: bool = False + clip_sections: Optional[str] = None + playlist_items: Optional[str] = None # yt-dlp --playlist-items format (e.g., "1-3,5,8") + no_playlist: bool = False # If True, pass --no-playlist to yt-dlp + + +class SendFunc(Protocol): + """Protocol for event sender function.""" + def __call__(self, event: str, **payload: Any) -> None: + ... + + +@dataclass(slots=True) +class DownloadMediaResult: + """Result of a successful media download.""" + path: Path + info: Dict[str, Any] + tags: List[str] + source_url: Optional[str] + hash_value: Optional[str] = None + + +@dataclass(slots=True) +class DebugLogger: + """Logs events to a JSON debug file for troubleshooting downloads.""" + path: Path + file: Optional[TextIO] = None + session_started: bool = False + + def ensure_open(self) -> None: + """Open the debug log file if not already open.""" + if self.file is not None: + return + try: + parent = self.path.parent + if parent and not parent.exists(): + parent.mkdir(parents=True, exist_ok=True) + self.file = self.path.open("a", encoding="utf-8") + except OSError as exc: # pragma: no cover - surfaces to stderr + print(f"Failed to open debug log {self.path}: {exc}", file=sys.stderr) + self.file = None + return + self._write_session_header() + + def _write_session_header(self) -> None: + """Write session start marker to log.""" + if self.session_started: + return + self.session_started = True + self.write_record("session-start", {"pid": os.getpid(), "exe": sys.executable}) + + def write_raw(self, text: str) -> None: + """Write raw text to debug log.""" + self.ensure_open() + if self.file is None: + return + self.file.write(text + "\n") + self.file.flush() + + def write_record(self, event: str, payload: Optional[Dict[str, Any]] = None) -> None: + """Write a structured event record to debug log.""" + record = { + "timestamp": datetime.datetime.utcnow().isoformat(timespec="seconds") + "Z", + "event": event, + "payload": payload, + } + self.write_raw(json.dumps(_sanitise_for_json(record), ensure_ascii=False)) + + def close(self) -> None: + """Close the debug log file.""" + if self.file is None: + return + try: + self.file.close() + finally: + self.file = None + + +def _sanitise_for_json(value: Any, *, max_depth: int = 8, _seen: Optional[set[int]] = None) -> Any: 
+ """Best-effort conversion to JSON-serialisable types without raising on cycles.""" + import math + from dataclasses import asdict, is_dataclass + + if value is None or isinstance(value, (str, bool)): + return value + if isinstance(value, (int, float)): + if isinstance(value, float) and not math.isfinite(value): + return repr(value) + return value + if isinstance(value, Path): + return str(value) + if isinstance(value, bytes): + try: + return value.decode() + except Exception: + return value.hex() + + if max_depth <= 0: + return repr(value) + + if _seen is None: + _seen = set() + + obj_id = id(value) + if obj_id in _seen: + return "" + + _seen.add(obj_id) + try: + if isinstance(value, dict): + return { + str(key): _sanitise_for_json(val, max_depth=max_depth - 1, _seen=_seen) + for key, val in value.items() + } + if isinstance(value, (list, tuple, set)): + iterable = value if not isinstance(value, set) else list(value) + return [ + _sanitise_for_json(item, max_depth=max_depth - 1, _seen=_seen) + for item in iterable + ] + if is_dataclass(value) and not isinstance(value, type): + return _sanitise_for_json(asdict(value), max_depth=max_depth - 1, _seen=_seen) + finally: + _seen.discard(obj_id) + + return repr(value) + + +# ============================================================================ +# PROGRESS BAR CLASS +# ============================================================================ + +class ProgressBar: + """Formats download progress with visual bar, speed, ETA, and file size.""" + + def __init__(self, width: Optional[int] = None): + """Initialize progress bar with optional custom width. + + Args: + width: Terminal width, defaults to auto-detect. + """ + if width is None: + width = shutil.get_terminal_size((80, 20))[0] + self.width = max(40, width) # Minimum 40 chars for readability + + def format_bytes(self, bytes_val: Optional[float]) -> str: + """Format bytes to human-readable size. + + Args: + bytes_val: Number of bytes or None. + + Returns: + Formatted string (e.g., "123.4 MB", "1.2 GB"). + """ + if bytes_val is None or bytes_val <= 0: + return "?.? B" + + for unit in ("B", "KB", "MB", "GB", "TB"): + if bytes_val < 1024: + return f"{bytes_val:.1f} {unit}" + bytes_val /= 1024 + + return f"{bytes_val:.1f} PB" + + def format_speed(self, speed_str: Optional[str]) -> str: + """Format download speed. + + Args: + speed_str: Speed string from yt-dlp (e.g., "1.23MiB/s"). + + Returns: + Formatted speed string or "?.? KB/s". + """ + if not speed_str or speed_str.strip() == "": + return "?.? KB/s" + return speed_str.strip() + + def format_eta(self, eta_str: Optional[str]) -> str: + """Format estimated time remaining. + + Args: + eta_str: ETA string from yt-dlp (e.g., "00:12:34"). + + Returns: + Formatted ETA string or "?:?:?". + """ + if not eta_str or eta_str.strip() == "": + return "?:?:?" + return eta_str.strip() + + def format_percent(self, percent_str: Optional[str]) -> float: + """Extract percent as float. + + Args: + percent_str: Percent string from yt-dlp (e.g., "45.2%"). + + Returns: + Float 0-100 or 0 if invalid. + """ + if not percent_str: + return 0.0 + try: + return float(percent_str.replace("%", "").strip()) + except ValueError: + return 0.0 + + def build_bar(self, percent: float, width: int = 30) -> str: + """Build ASCII progress bar. + + Args: + percent: Completion percentage (0-100). + width: Bar width in characters. + + Returns: + Progress bar string (e.g., "[████████░░░░░░░░░░░░░░░░░░]"). 
+ """ + percent = max(0, min(100, percent)) # Clamp to 0-100 + filled = int(percent * width / 100) + empty = width - filled + + # Use box-drawing characters for nice appearance + bar = "█" * filled + "░" * empty + return f"[{bar}]" + + def format_progress( + self, + percent_str: Optional[str] = None, + downloaded: Optional[int] = None, + total: Optional[int] = None, + speed_str: Optional[str] = None, + eta_str: Optional[str] = None, + ) -> str: + """Format complete progress line. + + Args: + percent_str: Percent string (e.g., "45.2%"). + downloaded: Downloaded bytes. + total: Total bytes. + speed_str: Speed string (e.g., "1.23MiB/s"). + eta_str: ETA string (e.g., "00:12:34"). + + Returns: + Formatted progress string. + """ + percent = self.format_percent(percent_str) + bar = self.build_bar(percent) + + # Format sizes + if downloaded is not None and total is not None and total > 0: + size_str = f"{self.format_bytes(downloaded)} / {self.format_bytes(total)}" + elif total is not None and total > 0: + size_str = f"/ {self.format_bytes(total)}" + elif downloaded is not None and downloaded > 0: + size_str = f"{self.format_bytes(downloaded)} downloaded" + else: + size_str = "" + + speed = self.format_speed(speed_str) + eta = self.format_eta(eta_str) + + # Build complete line + # Format: [████░░░░] 45.2% | 125.5 MB / 278.3 MB | 1.23 MB/s | ETA 00:12:34 + parts = [ + bar, + f"{percent:5.1f}%", + ] + + if size_str: + parts.append(f"| {size_str}") + + parts.append(f"| {speed}") + parts.append(f"| ETA {eta}") + + return " ".join(parts) + + def format_summary( + self, + total: Optional[int] = None, + speed_str: Optional[str] = None, + elapsed_str: Optional[str] = None, + ) -> str: + """Format completion summary. + + Args: + total: Total bytes downloaded. + speed_str: Average speed. + elapsed_str: Total time elapsed. + + Returns: + Summary string. + """ + parts = ["✓ Download complete"] + + if total is not None and total > 0: + parts.append(f"| {self.format_bytes(total)}") + + if speed_str: + parts.append(f"| {speed_str.strip()}") + + if elapsed_str: + parts.append(f"| {elapsed_str.strip()}") + + return " ".join(parts) + + +# ============================================================================ +# PIPELINE EXECUTION CONTEXT +# Consolidated from pipeline_context.py +# ============================================================================ +# Note: Pipeline functions and state variables moved to pipeline.py + +class PipelineStageContext: + """Context information for the current pipeline stage.""" + + def __init__(self, stage_index: int, total_stages: int): + self.stage_index = stage_index + self.total_stages = total_stages + self.is_last_stage = (stage_index == total_stages - 1) + self.emits: List[Any] = [] + + def emit(self, obj: Any) -> None: + """Emit an object to the next pipeline stage.""" + self.emits.append(obj) + + def __repr__(self) -> str: + return f"PipelineStageContext(stage={self.stage_index}/{self.total_stages}, is_last={self.is_last_stage})" + + +# ============================================================================ +# RESULT TABLE CLASSES +# Consolidated from result_table.py +# ============================================================================ + +@dataclass +class InputOption: + """Represents an interactive input option (cmdlet argument) in a table. + + Allows users to select options that translate to cmdlet arguments, + enabling interactive configuration right from the result table. 
+ + Example: + # Create an option for location selection + location_opt = InputOption( + "location", + type="enum", + choices=["local", "hydrus", "0x0"], + description="Download destination" + ) + + # Use in result table + table.add_input_option(location_opt) + selected = table.select_option("location") # Returns user choice + """ + name: str + """Option name (maps to cmdlet argument)""" + type: str = "string" + """Option type: 'string', 'enum', 'flag', 'integer'""" + choices: List[str] = field(default_factory=list) + """Valid choices for enum type""" + default: Optional[str] = None + """Default value if not specified""" + description: str = "" + """Description of what this option does""" + validator: Optional[Callable[[str], bool]] = None + """Optional validator function: takes value, returns True if valid""" + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary.""" + return { + "name": self.name, + "type": self.type, + "choices": self.choices if self.choices else None, + "default": self.default, + "description": self.description, + } + + +@dataclass +class TUIResultCard: + """Represents a result as a UI card with title, metadata, and actions. + + Used in hub-ui and TUI contexts to render individual search results + as grouped components with visual structure. + """ + title: str + subtitle: Optional[str] = None + metadata: Optional[Dict[str, str]] = None + media_kind: Optional[str] = None + tags: Optional[List[str]] = None + file_hash: Optional[str] = None + file_size: Optional[str] = None + duration: Optional[str] = None + + def __post_init__(self): + """Initialize default values.""" + if self.metadata is None: + self.metadata = {} + if self.tags is None: + self.tags = [] + + +@dataclass +class ResultColumn: + """Represents a single column in a result table.""" + name: str + value: str + width: Optional[int] = None + + def __str__(self) -> str: + """String representation of the column.""" + return f"{self.name}: {self.value}" + + def to_dict(self) -> Dict[str, str]: + """Convert to dictionary.""" + return {"name": self.name, "value": self.value} + + +@dataclass +class ResultRow: + """Represents a single row in a result table.""" + columns: List[ResultColumn] = field(default_factory=list) + + def add_column(self, name: str, value: Any) -> None: + """Add a column to this row.""" + str_value = str(value) if value is not None else "" + self.columns.append(ResultColumn(name, str_value)) + + def get_column(self, name: str) -> Optional[str]: + """Get column value by name.""" + for col in self.columns: + if col.name.lower() == name.lower(): + return col.value + return None + + def to_dict(self) -> List[Dict[str, str]]: + """Convert to list of column dicts.""" + return [col.to_dict() for col in self.columns] + + def to_list(self) -> List[tuple[str, str]]: + """Convert to list of (name, value) tuples.""" + return [(col.name, col.value) for col in self.columns] + + def __str__(self) -> str: + """String representation of the row.""" + return " | ".join(str(col) for col in self.columns) \ No newline at end of file diff --git a/pipeline.py b/pipeline.py new file mode 100644 index 0000000..630e96f --- /dev/null +++ b/pipeline.py @@ -0,0 +1,679 @@ +"""Pipeline execution context and state management for cmdlets. + +This module provides functions for managing pipeline state, allowing cmdlets to +emit results and control printing behavior within a piped execution context. 
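+
+For example, a chain such as "search-file ... | add-tags" (cmdlet names are illustrative here)
+runs each stage in turn, feeding the items emitted by one stage into the input of the next.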
+ +Key Concepts: +- Pipeline stages are chained command invocations +- Each stage receives input items and emits output items +- Printing behavior is controlled based on pipeline position +- Stage context tracks whether this is the last stage (affects output verbosity) + +PowerShell-like piping model: +- Each stage processes items individually +- Stage calls emit() for each output item +- Output items become input for next stage +- Batch commands receive all items at once (special case) +""" + +from __future__ import annotations + +import sys +from typing import Any, Dict, List, Optional, Sequence + +from models import PipelineStageContext +from helper.logger import log + + +# ============================================================================ +# PIPELINE GLOBALS (maintained for backward compatibility) +# ============================================================================ + +# Current pipeline context (thread-local in real world, global here for simplicity) +_CURRENT_CONTEXT: Optional[PipelineStageContext] = None + +# Active execution state +_PIPE_EMITS: List[Any] = [] +_PIPE_ACTIVE: bool = False +_PIPE_IS_LAST: bool = False + +# Ephemeral handoff for direct pipelines (e.g., URL --screen-shot | ...) +_LAST_PIPELINE_CAPTURE: Optional[Any] = None + +# Remember last search query to support refreshing results after pipeline actions +_LAST_SEARCH_QUERY: Optional[str] = None + +# Track whether the last pipeline execution already refreshed and displayed results +_PIPELINE_REFRESHED: bool = False + +# Cache the last pipeline outputs so non-interactive callers can inspect results +_PIPELINE_LAST_ITEMS: List[Any] = [] + +# Store the last result table for @ selection syntax (e.g., @2, @2-5, @{1,3,5}) +_LAST_RESULT_TABLE: Optional[Any] = None +_LAST_RESULT_ITEMS: List[Any] = [] + +# History of result tables for @.. navigation (LIFO stack, max 20 tables) +_RESULT_TABLE_HISTORY: List[tuple[Optional[Any], List[Any]]] = [] +_MAX_RESULT_TABLE_HISTORY = 20 + +# Current stage table for @N expansion (separate from history) +# Used to track the ResultTable with source_command + row_selection_args from current pipeline stage +# This is set by cmdlets that display tabular results (e.g., download-data showing formats) +# and used by CLI to expand @N into full commands like "download-data URL -item 2" +_CURRENT_STAGE_TABLE: Optional[Any] = None + +# Items displayed by non-selectable commands (get-tag, delete-tag, etc.) 
+# These are available for @N selection but NOT saved to history +_DISPLAY_ITEMS: List[Any] = [] + +# Table for display-only commands (overlay) +# Used when a command wants to show a specific table formatting but not affect history +_DISPLAY_TABLE: Optional[Any] = None + +# Track the indices the user selected via @ syntax for the current invocation +_PIPELINE_LAST_SELECTION: List[int] = [] + +# Track the currently executing command/pipeline string for worker attribution +_PIPELINE_COMMAND_TEXT: str = "" + +# Shared scratchpad for cmdlets/funacts to stash structured data between stages +_PIPELINE_VALUES: Dict[str, Any] = {} +_PIPELINE_MISSING = object() + +# Global callback to notify UI when library content changes +_UI_LIBRARY_REFRESH_CALLBACK: Optional[Any] = None + + +# ============================================================================ +# PUBLIC API +# ============================================================================ + +def set_stage_context(context: Optional[PipelineStageContext]) -> None: + """Internal: Set the current pipeline stage context.""" + global _CURRENT_CONTEXT + _CURRENT_CONTEXT = context + + +def get_stage_context() -> Optional[PipelineStageContext]: + """Get the current pipeline stage context.""" + return _CURRENT_CONTEXT + + +def emit(obj: Any) -> None: + """Emit an object to the current pipeline stage output. + + Call this from a cmdlet to pass data to the next pipeline stage. + If not in a pipeline context, this is a no-op. + + Args: + obj: Any object to emit downstream + + Example: + ```python + def _run(item, args, config): + result = process(item) + if result: + emit(result) # Pass to next stage + return 0 + ``` + """ + # Try new context-based approach first + if _CURRENT_CONTEXT is not None: + import logging + logger = logging.getLogger(__name__) + logger.debug(f"[EMIT] Context-based: appending to _CURRENT_CONTEXT.emits. obj={obj}") + _CURRENT_CONTEXT.emit(obj) + return + + # Fallback to legacy global approach (for backward compatibility) + try: + import logging + logger = logging.getLogger(__name__) + logger.debug(f"[EMIT] Legacy: appending to _PIPE_EMITS. obj type={type(obj).__name__}, _PIPE_EMITS len before={len(_PIPE_EMITS)}") + _PIPE_EMITS.append(obj) + logger.debug(f"[EMIT] Legacy: _PIPE_EMITS len after={len(_PIPE_EMITS)}") + except Exception as e: + import logging + logger = logging.getLogger(__name__) + logger.error(f"[EMIT] Error appending to _PIPE_EMITS: {e}", exc_info=True) + pass + + +def print_if_visible(*args: Any, file=None, **kwargs: Any) -> None: + """Print only if this is not a quiet mid-pipeline stage. + + - Always allow errors printed to stderr by callers (they pass file=sys.stderr). + - For normal info messages, this suppresses printing for intermediate pipeline stages. + - Use this instead of log() in cmdlets when you want stage-aware output. 
+ + Args: + *args: Arguments to print (same as built-in print) + file: Output stream (default: stdout) + **kwargs: Keyword arguments for print + + Example: + ```python + # Always shows errors + print_if_visible("[error] Something failed", file=sys.stderr) + + # Only shows in non-piped context or as final stage + print_if_visible(f"Processed {count} items") + ``` + """ + try: + # Print if: not in a pipeline OR this is the last stage + should_print = (not _PIPE_ACTIVE) or _PIPE_IS_LAST + + # Always print to stderr regardless + if file is not None: + should_print = True + + if should_print: + log(*args, **kwargs) if file is None else log(*args, file=file, **kwargs) + except Exception: + pass + + +def store_value(key: str, value: Any) -> None: + """Store a value to pass to later pipeline stages. + + Values are stored in a shared dictionary keyed by normalized lowercase strings. + This allows one stage to prepare data for the next stage without intermediate output. + + Args: + key: Variable name (normalized to lowercase, non-empty) + value: Any Python object to store + """ + if not isinstance(key, str): + return + text = key.strip().lower() + if not text: + return + try: + _PIPELINE_VALUES[text] = value + except Exception: + pass + + +def load_value(key: str, default: Any = None) -> Any: + """Retrieve a value stored by an earlier pipeline stage. + + Supports dotted path notation for nested access (e.g., "metadata.tags" or "items.0"). + + Args: + key: Variable name or dotted path (e.g., "my_var", "metadata.title", "list.0") + default: Value to return if key not found or access fails + + Returns: + The stored value, or default if not found + """ + if not isinstance(key, str): + return default + text = key.strip() + if not text: + return default + parts = [segment.strip() for segment in text.split('.') if segment.strip()] + if not parts: + return default + root_key = parts[0].lower() + container = _PIPELINE_VALUES.get(root_key, _PIPELINE_MISSING) + if container is _PIPELINE_MISSING: + return default + if len(parts) == 1: + return container + current: Any = container + for fragment in parts[1:]: + if isinstance(current, dict): + fragment_lower = fragment.lower() + if fragment in current: + current = current[fragment] + continue + match = _PIPELINE_MISSING + for key_name, value in current.items(): + if isinstance(key_name, str) and key_name.lower() == fragment_lower: + match = value + break + if match is _PIPELINE_MISSING: + return default + current = match + continue + if isinstance(current, (list, tuple)): + if fragment.isdigit(): + try: + idx = int(fragment) + except ValueError: + return default + if 0 <= idx < len(current): + current = current[idx] + continue + return default + if hasattr(current, fragment): + try: + current = getattr(current, fragment) + continue + except Exception: + return default + return default + return current + + +def reset() -> None: + """Reset all pipeline state. 
Called between pipeline executions.""" + global _PIPE_EMITS, _PIPE_ACTIVE, _PIPE_IS_LAST, _PIPELINE_VALUES + global _LAST_PIPELINE_CAPTURE, _PIPELINE_REFRESHED, _PIPELINE_LAST_ITEMS + global _PIPELINE_COMMAND_TEXT + + _PIPE_EMITS = [] + _PIPE_ACTIVE = False + _PIPE_IS_LAST = False + _LAST_PIPELINE_CAPTURE = None + _PIPELINE_REFRESHED = False + _PIPELINE_LAST_ITEMS = [] + _PIPELINE_VALUES = {} + _PIPELINE_COMMAND_TEXT = "" + + +def get_emitted_items() -> List[Any]: + """Get a copy of all items emitted by the current pipeline stage.""" + return list(_PIPE_EMITS) + + +def clear_emits() -> None: + """Clear the emitted items list (called between stages).""" + global _PIPE_EMITS + _PIPE_EMITS = [] + + +def set_last_selection(indices: Sequence[int]) -> None: + """Record the indices selected via @ syntax for the next cmdlet. + + Args: + indices: Iterable of 0-based indices captured from the REPL parser + """ + global _PIPELINE_LAST_SELECTION + _PIPELINE_LAST_SELECTION = list(indices or []) + + +def get_last_selection() -> List[int]: + """Return the indices selected via @ syntax for the current invocation.""" + return list(_PIPELINE_LAST_SELECTION) + + +def clear_last_selection() -> None: + """Clear the cached selection indices after a cmdlet finishes.""" + global _PIPELINE_LAST_SELECTION + _PIPELINE_LAST_SELECTION = [] + + +def set_current_command_text(command_text: Optional[str]) -> None: + """Record the raw pipeline/command text for downstream consumers.""" + global _PIPELINE_COMMAND_TEXT + _PIPELINE_COMMAND_TEXT = (command_text or "").strip() + + +def get_current_command_text(default: str = "") -> str: + """Return the last recorded command/pipeline text.""" + text = _PIPELINE_COMMAND_TEXT.strip() + return text if text else default + + +def clear_current_command_text() -> None: + """Clear the cached command text after execution completes.""" + global _PIPELINE_COMMAND_TEXT + _PIPELINE_COMMAND_TEXT = "" + + +def set_active(active: bool) -> None: + """Internal: Set whether we're in a pipeline context.""" + global _PIPE_ACTIVE + _PIPE_ACTIVE = active + + +def set_last_stage(is_last: bool) -> None: + """Internal: Set whether this is the last stage of the pipeline.""" + global _PIPE_IS_LAST + _PIPE_IS_LAST = is_last + + +def set_search_query(query: Optional[str]) -> None: + """Internal: Set the last search query for refresh purposes.""" + global _LAST_SEARCH_QUERY + _LAST_SEARCH_QUERY = query + + +def get_search_query() -> Optional[str]: + """Get the last search query.""" + return _LAST_SEARCH_QUERY + + +def set_pipeline_refreshed(refreshed: bool) -> None: + """Internal: Track whether the pipeline already refreshed results.""" + global _PIPELINE_REFRESHED + _PIPELINE_REFRESHED = refreshed + + +def was_pipeline_refreshed() -> bool: + """Check if the pipeline already refreshed results.""" + return _PIPELINE_REFRESHED + + +def set_last_items(items: list) -> None: + """Internal: Cache the last pipeline outputs.""" + global _PIPELINE_LAST_ITEMS + _PIPELINE_LAST_ITEMS = list(items) if items else [] + + +def get_last_items() -> List[Any]: + """Get the last pipeline outputs.""" + return list(_PIPELINE_LAST_ITEMS) + + +def set_last_capture(obj: Any) -> None: + """Internal: Store ephemeral handoff for direct pipelines.""" + global _LAST_PIPELINE_CAPTURE + _LAST_PIPELINE_CAPTURE = obj + + +def get_last_capture() -> Optional[Any]: + """Get ephemeral pipeline handoff (e.g., URL --screen-shot | ...).""" + return _LAST_PIPELINE_CAPTURE + + +def set_ui_library_refresh_callback(callback: Any) -> None: + """Set a 
callback to be called when library content is updated. + + The callback will be called with: + callback(library_filter: str = 'local') + + Args: + callback: A callable that accepts optional library_filter parameter + + Example: + def my_refresh_callback(library_filter='local'): + print(f"Refresh library: {library_filter}") + set_ui_library_refresh_callback(my_refresh_callback) + """ + global _UI_LIBRARY_REFRESH_CALLBACK + _UI_LIBRARY_REFRESH_CALLBACK = callback + + +def get_ui_library_refresh_callback() -> Optional[Any]: + """Get the current library refresh callback.""" + return _UI_LIBRARY_REFRESH_CALLBACK + + +def trigger_ui_library_refresh(library_filter: str = 'local') -> None: + """Trigger a library refresh in the UI if callback is registered. + + This should be called from cmdlets/funacts after content is added to library. + + Args: + library_filter: Which library to refresh ('local', 'hydrus', etc) + """ + callback = get_ui_library_refresh_callback() + if callback: + try: + callback(library_filter) + except Exception as e: + print(f"[trigger_ui_library_refresh] Error calling refresh callback: {e}", file=sys.stderr) + + +def set_last_result_table(result_table: Optional[Any], items: Optional[List[Any]] = None) -> None: + """Store the last result table and items for @ selection syntax. + + This should be called after displaying a result table, so users can reference + rows with @2, @2-5, @{1,3,5} syntax in subsequent commands. + Also maintains a history stack for @.. navigation (restore previous result table). + + Only selectable commands (search-file, download-data) should call this to create history. + For action commands (delete-tag, add-tag, etc), use set_last_result_table_preserve_history() instead. + + Args: + result_table: The ResultTable object that was displayed (or None) + items: List of items that populated the table (optional) + """ + global _LAST_RESULT_TABLE, _LAST_RESULT_ITEMS, _RESULT_TABLE_HISTORY, _DISPLAY_ITEMS, _DISPLAY_TABLE + + # Push current table to history before replacing + if _LAST_RESULT_TABLE is not None: + _RESULT_TABLE_HISTORY.append((_LAST_RESULT_TABLE, _LAST_RESULT_ITEMS.copy())) + # Keep history size limited + if len(_RESULT_TABLE_HISTORY) > _MAX_RESULT_TABLE_HISTORY: + _RESULT_TABLE_HISTORY.pop(0) + + # Set new current table and clear any display items/table + _DISPLAY_ITEMS = [] + _DISPLAY_TABLE = None + _LAST_RESULT_TABLE = result_table + _LAST_RESULT_ITEMS = items or [] + + +def set_last_result_table_overlay(result_table: Optional[Any], items: Optional[List[Any]] = None) -> None: + """Set a result table as an overlay (display only, no history). + + Used for commands like get-tag that want to show a formatted table but + should be treated as a transient view (closing it returns to previous table). + + Args: + result_table: The ResultTable object to display + items: List of items for @N selection + """ + global _DISPLAY_ITEMS, _DISPLAY_TABLE + + _DISPLAY_TABLE = result_table + _DISPLAY_ITEMS = items or [] + + +def set_last_result_table_preserve_history(result_table: Optional[Any], items: Optional[List[Any]] = None) -> None: + """Update the last result table WITHOUT adding to history. + + Used for action commands (delete-tag, add-tag, etc.) that modify data but shouldn't + create history entries. This allows @.. to navigate search results, not undo stacks. 
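+
+    Rough comparison of the related helpers (illustrative summary, not additional API):
+
+        set_last_result_table(table, items)                    # selectable cmds; pushes history
+        set_last_result_table_overlay(table, items)            # transient display (e.g. get-tag)
+        set_last_result_table_preserve_history(table, items)   # action cmds; no history entry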
+ + Args: + result_table: The ResultTable object that was displayed (or None) + items: List of items that populated the table (optional) + """ + global _LAST_RESULT_TABLE, _LAST_RESULT_ITEMS + + # Update current table WITHOUT pushing to history + _LAST_RESULT_TABLE = result_table + _LAST_RESULT_ITEMS = items or [] + + +def set_last_result_items_only(items: Optional[List[Any]]) -> None: + """Store items for @N selection WITHOUT affecting history or saved search data. + + Used for display-only commands (get-tag, get-url, etc.) and action commands + (delete-tag, add-tag, etc.) that emit results but shouldn't affect history. + + These items are available for @1, @2, etc. selection in the next command, + but are NOT saved to history. This preserves search context for @.. navigation. + + Args: + items: List of items to select from + """ + global _DISPLAY_ITEMS, _DISPLAY_TABLE + + # Store items for immediate @N selection, but DON'T modify _LAST_RESULT_ITEMS + # This ensures history contains original search data, not display transformations + _DISPLAY_ITEMS = items or [] + # Clear display table since we're setting items only (CLI will generate table if needed) + _DISPLAY_TABLE = None + + +def restore_previous_result_table() -> bool: + """Restore the previous result table from history (for @.. navigation). + + Returns: + True if a previous table was restored, False if history is empty + """ + global _LAST_RESULT_TABLE, _LAST_RESULT_ITEMS, _RESULT_TABLE_HISTORY, _DISPLAY_ITEMS, _DISPLAY_TABLE + + # If we have an active overlay (display items/table), clear it to "go back" to the underlying table + if _DISPLAY_ITEMS or _DISPLAY_TABLE: + _DISPLAY_ITEMS = [] + _DISPLAY_TABLE = None + return True + + if not _RESULT_TABLE_HISTORY: + return False + + # Pop from history and restore + _LAST_RESULT_TABLE, _LAST_RESULT_ITEMS = _RESULT_TABLE_HISTORY.pop() + # Clear display items so get_last_result_items() falls back to restored items + _DISPLAY_ITEMS = [] + _DISPLAY_TABLE = None + return True + + +def get_display_table() -> Optional[Any]: + """Get the current display overlay table. + + Returns: + The ResultTable object, or None if no overlay table is set + """ + return _DISPLAY_TABLE + + +def get_last_result_table() -> Optional[Any]: + """Get the current last result table. + + Returns: + The ResultTable object, or None if no table is set + """ + return _LAST_RESULT_TABLE + + +def get_last_result_items() -> List[Any]: + """Get the items available for @N selection. + + Returns items from display/action commands (get-tag, delete-tag, etc.) if available, + otherwise returns items from the last search command. This ensures @N selection + works for both display operations and search results. + + Returns: + List of items, or empty list if no prior results + """ + # Prioritize items from display commands (get-tag, delete-tag, etc.) + # These are available for immediate @N selection + if _DISPLAY_ITEMS: + return _DISPLAY_ITEMS + # Fall back to items from last search/selectable command + return _LAST_RESULT_ITEMS + + +def get_last_result_table_source_command() -> Optional[str]: + """Get the source command from the last displayed result table. + + Returns: + Command name (e.g., 'download-data') or None if not set + """ + if _LAST_RESULT_TABLE and hasattr(_LAST_RESULT_TABLE, 'source_command'): + return _LAST_RESULT_TABLE.source_command + return None + + +def get_last_result_table_source_args() -> List[str]: + """Get the base source arguments from the last displayed result table. 
+ + Returns: + List of arguments (e.g., ['https://example.com']) or empty list + """ + if _LAST_RESULT_TABLE and hasattr(_LAST_RESULT_TABLE, 'source_args'): + return _LAST_RESULT_TABLE.source_args or [] + return [] + + +def get_last_result_table_row_selection_args(row_index: int) -> Optional[List[str]]: + """Get the selection arguments for a specific row in the last result table. + + Args: + row_index: Index of the row (0-based) + + Returns: + Selection arguments (e.g., ['-item', '3']) or None + """ + if _LAST_RESULT_TABLE and hasattr(_LAST_RESULT_TABLE, 'rows'): + if 0 <= row_index < len(_LAST_RESULT_TABLE.rows): + row = _LAST_RESULT_TABLE.rows[row_index] + if hasattr(row, 'selection_args'): + return row.selection_args + return None + + +def set_current_stage_table(result_table: Optional[Any]) -> None: + """Store the current pipeline stage table for @N expansion. + + Used by cmdlets that display tabular results (e.g., download-data with formats) + to make their result table available for @N expansion logic. + + Does NOT push to history - purely for command expansion in the current pipeline. + + Args: + result_table: The ResultTable object (or None to clear) + """ + global _CURRENT_STAGE_TABLE + _CURRENT_STAGE_TABLE = result_table + + +def get_current_stage_table_source_command() -> Optional[str]: + """Get the source command from the current pipeline stage table. + + Returns: + Command name (e.g., 'download-data') or None + """ + if _CURRENT_STAGE_TABLE and hasattr(_CURRENT_STAGE_TABLE, 'source_command'): + return _CURRENT_STAGE_TABLE.source_command + return None + + +def get_current_stage_table_source_args() -> List[str]: + """Get the source arguments from the current pipeline stage table. + + Returns: + List of arguments or empty list + """ + if _CURRENT_STAGE_TABLE and hasattr(_CURRENT_STAGE_TABLE, 'source_args'): + return _CURRENT_STAGE_TABLE.source_args or [] + return [] + + +def get_current_stage_table_row_selection_args(row_index: int) -> Optional[List[str]]: + """Get the selection arguments for a row in the current pipeline stage table. + + Args: + row_index: Index of the row (0-based) + + Returns: + Selection arguments or None + """ + if _CURRENT_STAGE_TABLE and hasattr(_CURRENT_STAGE_TABLE, 'rows'): + if 0 <= row_index < len(_CURRENT_STAGE_TABLE.rows): + row = _CURRENT_STAGE_TABLE.rows[row_index] + if hasattr(row, 'selection_args'): + return row.selection_args + return None + + +def clear_last_result() -> None: + """Clear the stored last result table and items.""" + global _LAST_RESULT_TABLE, _LAST_RESULT_ITEMS + _LAST_RESULT_TABLE = None + _LAST_RESULT_ITEMS = [] + + +def emit_list(objects: List[Any]) -> None: + """Emit a list of PipeObjects to the next pipeline stage. + + This allows cmdlets to emit multiple results that are tracked as a list, + enabling downstream cmdlets to process all of them or filter by metadata. 
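+
+    Example (sketch; the dict payloads stand in for real PipeObject instances):
+        ```python
+        emit_list([
+            {"title": "track01.flac", "origin": "local"},
+            {"title": "track02.flac", "origin": "local"},
+        ])
+        ```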
+ + Args: + objects: List of PipeObject instances or dicts to emit + """ + if _CURRENT_CONTEXT is not None: + _CURRENT_CONTEXT.emit(objects) + else: + _PIPE_EMITS.append(objects) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1e3bb53 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,183 @@ +[build-system] +requires = ["setuptools>=65.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "medeia-macina" +version = "0.1.0" +description = "Comprehensive media management and search platform with support for local files, Hydrus database, torrents, books, and P2P networks" +readme = "README.md" +requires-python = ">=3.9,<3.12" +license = {text = "MIT"} +authors = [ + {name = "Your Name", email = "your.email@example.com"} +] +keywords = ["media", "search", "management", "hydrus", "download", "cli", "tui"] +classifiers = [ + "Development Status :: 3 - Alpha", + "Environment :: Console", + "Intended Audience :: End Users/Desktop", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Multimedia", + "Topic :: Internet", +] + +dependencies = [ + # Core CLI and TUI frameworks + "typer>=0.9.0", + "prompt-toolkit>=3.0.0", + "textual>=0.30.0", + + # Media processing and downloading + "yt-dlp>=2023.11.0", + "yt-dlp-ejs", # EJS challenge solver scripts for YouTube JavaScript challenges + "requests>=2.31.0", + "httpx>=0.25.0", + "ffmpeg-python>=0.2.0", + + # Document and data handling + "PyPDF2>=3.0.0", + "img2pdf>=0.6.0", + "mutagen>=1.46.0", + "cbor2>=4.0", + + # Image and media support + "Pillow>=10.0.0", + "python-bidi>=0.4.2", + + # Metadata extraction and processing + "musicbrainzngs>=0.7.0", + "beautifulsoup4>=4.12.0", + "lxml>=4.9.0", + + # Advanced searching and libraries + "libgen-api>=1.0.0", + "aioslsk>=1.6.0", + "imdbinfo>=0.1.10", + + # Encryption and security + "pycryptodome>=3.18.0", + + # Data processing + "bencode3", + "tqdm>=4.66.0", + + # Browser automation + "playwright>=1.40.0", + + # Development and utilities + "python-dateutil>=2.8.0", +] + +[project.optional-dependencies] +dev = [ + # Testing + "pytest>=7.4.0", + "pytest-cov>=4.1.0", + "pytest-asyncio>=0.21.0", + + # Code quality + "black>=23.11.0", + "flake8>=6.1.0", + "isort>=5.12.0", + "mypy>=1.7.0", + "pylint>=3.0.0", + + # Documentation + "sphinx>=7.2.0", + "sphinx-rtd-theme>=1.3.0", + + # Debugging and profiling + "ipython>=8.17.0", + "ipdb>=0.13.0", + "memory-profiler>=0.61.0", + + # Version control and CI/CD helpers + "pre-commit>=3.5.0", +] + +[project.scripts] +mm = "medeia_macina.cli_entry:main" +medeia = "medeia_macina.cli_entry:main" + +[project.urls] +Homepage = "https://github.com/yourusername/medeia-macina" +Documentation = "https://medeia-macina.readthedocs.io" +Repository = "https://github.com/yourusername/medeia-macina.git" +Issues = "https://github.com/yourusername/medeia-macina/issues" + +[tool.setuptools] +packages = ["cmdlets", "helper", "TUI", "medeia_macina"] + +[tool.black] +line-length = 100 +target-version = ['py39', 'py310', 'py311', 'py312'] +include = '\.pyi?$' +extend-exclude = ''' +/( + # directories + \.eggs + | \.git + | \.hg + | \.mypy_cache + | \.tox + | \.venv + | _build + | buck-out + | build + | dist + | __pycache__ +)/ +''' + +[tool.isort] +profile = "black" +line_length = 100 
+target_version = ["py39", "py310", "py311", "py312"] + +[tool.mypy] +python_version = "3.9" +warn_return_any = true +warn_unused_configs = true +disallow_untyped_defs = false +disallow_incomplete_defs = false +check_untyped_defs = false +no_implicit_optional = true +warn_redundant_casts = true +warn_unused_ignores = true +warn_no_return = true + +[tool.pylint.messages_control] +disable = [ + "C0330", "C0326", # Bad whitespace + "R0913", # Too many arguments + "R0914", # Too many local variables +] + +[tool.pytest.ini_options] +testpaths = ["tests"] +python_files = ["test_*.py", "*_test.py"] +addopts = "-v --cov=. --cov-report=html --cov-report=term-missing" + +[tool.coverage.run] +branch = true +omit = [ + "*/tests/*", + "*/__main__.py", +] + +[tool.coverage.report] +exclude_lines = [ + "pragma: no cover", + "def __repr__", + "raise AssertionError", + "raise NotImplementedError", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", +] diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..6923b89 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,29 @@ +# Development dependencies for Medeia-Macina +# Install with: pip install -r requirements-dev.txt + +# Main requirements +-r requirements.txt + +# Testing +pytest>=7.4.0 +pytest-cov>=4.1.0 +pytest-asyncio>=0.21.0 + +# Code quality +black>=23.11.0 +flake8>=6.1.0 +isort>=5.12.0 +mypy>=1.7.0 +pylint>=3.0.0 + +# Documentation +sphinx>=7.2.0 +sphinx-rtd-theme>=1.3.0 + +# Debugging and profiling +ipython>=8.17.0 +ipdb>=0.13.0 +memory-profiler>=0.61.0 + +# Version control and CI/CD helpers +pre-commit>=3.5.0 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..02e8244 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,43 @@ +# Core CLI and TUI frameworks +typer>=0.9.0 +prompt-toolkit>=3.0.0 +textual>=0.30.0 + +# Media processing and downloading +yt-dlp>=2023.11.0 +requests>=2.31.0 +httpx>=0.25.0 +ffmpeg-python>=0.2.0 + +# Document and data handling +PyPDF2>=3.0.0 +img2pdf>=0.6.0 +mutagen>=1.46.0 +cbor2>=4.0 + +# Image and media support +Pillow>=10.0.0 +python-bidi>=0.4.2 + +# Metadata extraction and processing +musicbrainzngs>=0.7.0 +beautifulsoup4>=4.12.0 +lxml>=4.9.0 + +# Advanced searching and libraries +libgen-api>=1.0.0 +aioslsk>=1.6.0 +imdbinfo>=0.1.10 + +# Encryption and security (if needed by Crypto usage) +pycryptodome>=3.18.0 + +# Data processing +bencode3 +tqdm>=4.66.0 + +# Browser automation (for web scraping if needed) +playwright>=1.40.0 + +# Development and utilities +python-dateutil>=2.8.0 diff --git a/result_table.py b/result_table.py new file mode 100644 index 0000000..90d1467 --- /dev/null +++ b/result_table.py @@ -0,0 +1,1228 @@ +"""Unified result table formatter for CLI display. + +Provides a structured way to convert search results, metadata, and pipeline objects +into formatted tables suitable for display in the REPL and CLI output. 
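+
+Quick usage sketch (illustrative; mirrors the ResultTable example further below):
+
+    table = ResultTable("Search Results")
+    row = table.add_row()
+    row.add_column("Title", "document.pdf")
+    row.add_column("Size", "2.5 MB")
+    print(table)   # renders an aligned table with a numbered '#' column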
+ +Features: +- Format results as aligned tables with row numbers +- Support multiple selection formats (single, ranges, lists, combined) +- Interactive selection with user input +- Input options for cmdlet arguments (location, source selection, etc) +""" + +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional, Union, Callable, Tuple +from pathlib import Path +import json + +# Optional Textual imports - graceful fallback if not available +try: + from textual.widgets import Tree, DataTable + from textual.containers import Horizontal, Vertical + from textual.widgets import Static, Button + TEXTUAL_AVAILABLE = True +except ImportError: + TEXTUAL_AVAILABLE = False + + +@dataclass +class InputOption: + """Represents an interactive input option (cmdlet argument) in a table. + + Allows users to select options that translate to cmdlet arguments, + enabling interactive configuration right from the result table. + + Example: + # Create an option for location selection + location_opt = InputOption( + "location", + type="enum", + choices=["local", "hydrus", "0x0"], + description="Download destination" + ) + + # Use in result table + table.add_input_option(location_opt) + selected = table.select_option("location") # Returns user choice + """ + name: str + """Option name (maps to cmdlet argument)""" + type: str = "string" + """Option type: 'string', 'enum', 'flag', 'integer'""" + choices: List[str] = field(default_factory=list) + """Valid choices for enum type""" + default: Optional[str] = None + """Default value if not specified""" + description: str = "" + """Description of what this option does""" + validator: Optional[Callable[[str], bool]] = None + """Optional validator function: takes value, returns True if valid""" + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary.""" + return { + "name": self.name, + "type": self.type, + "choices": self.choices if self.choices else None, + "default": self.default, + "description": self.description, + } + + +@dataclass +class TUIResultCard: + """Represents a result as a UI card with title, metadata, and actions. + + Used in hub-ui and TUI contexts to render individual search results + as grouped components with visual structure. 
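+
+    Example (sketch; field values are placeholders):
+        card = TUIResultCard(
+            title="Some Album",
+            subtitle="Some Artist",
+            metadata={"Size": "120 MB"},
+            tags=["music", "flac"],
+        )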
+ """ + title: str + subtitle: Optional[str] = None + metadata: Optional[Dict[str, str]] = None + media_kind: Optional[str] = None + tags: Optional[List[str]] = None + file_hash: Optional[str] = None + file_size: Optional[str] = None + duration: Optional[str] = None + + def __post_init__(self): + """Initialize default values.""" + if self.metadata is None: + self.metadata = {} + if self.tags is None: + self.tags = [] + + +@dataclass +class ResultColumn: + """Represents a single column in a result table.""" + name: str + value: str + width: Optional[int] = None + + def __str__(self) -> str: + """String representation of the column.""" + return f"{self.name}: {self.value}" + + def to_dict(self) -> Dict[str, str]: + """Convert to dictionary.""" + return {"name": self.name, "value": self.value} + + +@dataclass +class ResultRow: + """Represents a single row in a result table.""" + columns: List[ResultColumn] = field(default_factory=list) + selection_args: Optional[List[str]] = None + """Arguments to use for this row when selected via @N syntax (e.g., ['-item', '3'])""" + + def add_column(self, name: str, value: Any) -> None: + """Add a column to this row.""" + str_value = str(value) if value is not None else "" + self.columns.append(ResultColumn(name, str_value)) + + def get_column(self, name: str) -> Optional[str]: + """Get column value by name.""" + for col in self.columns: + if col.name.lower() == name.lower(): + return col.value + return None + + def to_dict(self) -> List[Dict[str, str]]: + """Convert to list of column dicts.""" + return [col.to_dict() for col in self.columns] + + def to_list(self) -> List[tuple[str, str]]: + """Convert to list of (name, value) tuples.""" + return [(col.name, col.value) for col in self.columns] + + def __str__(self) -> str: + """String representation of the row.""" + return " | ".join(str(col) for col in self.columns) + + +class ResultTable: + """Unified table formatter for search results, metadata, and pipeline objects. + + Provides a structured way to display results in the CLI with consistent formatting. + Handles conversion from various result types (SearchResult, PipeObject, dicts) into + a formatted table with rows and columns. + + Example: + >>> result_table = ResultTable("Search Results") + >>> row = result_table.add_row() + >>> row.add_column("File", "document.pdf") + >>> row.add_column("Size", "2.5 MB") + >>> row.add_column("Tags", "pdf, document") + >>> print(result_table) + """ + + def __init__(self, title: str = "", title_width: int = 80, max_columns: int = None): + """Initialize a result table. 
+
+        Args:
+            title: Optional title for the table
+            title_width: Width for formatting the title line
+            max_columns: Maximum number of columns to display (defaults to 5 when None, keeping search results compact)
+        """
+        self.title = title
+        self.title_width = title_width
+        self.max_columns = max_columns if max_columns is not None else 5  # Default 5 for cleaner display
+        self.rows: List[ResultRow] = []
+        self.column_widths: Dict[str, int] = {}
+        self.input_options: Dict[str, InputOption] = {}
+        """Options available for user input (cmdlet arguments)"""
+        self.source_command: Optional[str] = None
+        """Command that generated this table (e.g., 'download-data URL')"""
+        self.source_args: List[str] = []
+        """Base arguments for the source command"""
+
+    def add_row(self) -> ResultRow:
+        """Add a new row to the table and return it for configuration."""
+        row = ResultRow()
+        self.rows.append(row)
+        return row
+
+    def set_source_command(self, command: str, args: Optional[List[str]] = None) -> "ResultTable":
+        """Set the source command that generated this table.
+
+        This is used for @N expansion: when user runs @2 | next-cmd, it will expand to:
+            source_command + source_args + row_selection_args | next-cmd
+
+        Args:
+            command: Command name (e.g., 'download-data')
+            args: Base arguments for the command (e.g., ['URL'])
+
+        Returns:
+            Self for chaining
+        """
+        self.source_command = command
+        self.source_args = args or []
+        return self
+
+    def set_row_selection_args(self, row_index: int, selection_args: List[str]) -> None:
+        """Set the selection arguments for a specific row.
+
+        When user selects this row via @N, these arguments will be appended to the
+        source command to re-execute with that item selected.
+
+        Args:
+            row_index: Index of the row (0-based)
+            selection_args: Arguments to use (e.g., ['-item', '3'])
+        """
+        if 0 <= row_index < len(self.rows):
+            self.rows[row_index].selection_args = selection_args
+
+    def add_result(self, result: Any) -> "ResultTable":
+        """Add a result object (SearchResult, PipeObject, ResultItem, TagItem, or dict) as a row.
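+
+        A plain dict works without importing any model classes (sketch):
+
+            table.add_result({"title": "song.flac", "origin": "local", "size_bytes": 123456})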
+ + Args: + result: Result object to add + + Returns: + Self for chaining + """ + row = self.add_row() + + # Handle TagItem from get_tag.py (tag display with index) + if hasattr(result, '__class__') and result.__class__.__name__ == 'TagItem': + self._add_tag_item(row, result) + # Handle ResultItem from search_file.py (compact display) + elif hasattr(result, '__class__') and result.__class__.__name__ == 'ResultItem': + self._add_result_item(row, result) + # Handle SearchResult from search_file.py + elif hasattr(result, '__class__') and result.__class__.__name__ == 'SearchResult': + self._add_search_result(row, result) + # Handle PipeObject from models.py + elif hasattr(result, '__class__') and result.__class__.__name__ == 'PipeObject': + self._add_pipe_object(row, result) + # Handle dict + elif isinstance(result, dict): + self._add_dict(row, result) + # Handle generic objects with __dict__ + elif hasattr(result, '__dict__'): + self._add_generic_object(row, result) + # Handle strings (simple text result) + elif isinstance(result, str): + row.add_column("Result", result) + + return self + + def _add_search_result(self, row: ResultRow, result: Any) -> None: + """Extract and add SearchResult fields to row.""" + # Core fields + if hasattr(result, 'title') and result.title: + row.add_column("Title", result.title) + + if hasattr(result, 'origin') and result.origin: + row.add_column("Source", result.origin) + + if hasattr(result, 'detail') and result.detail: + row.add_column("Detail", result.detail) + + if hasattr(result, 'media_kind') and result.media_kind: + row.add_column("Type", result.media_kind) + + # Target (file path or URL) + if hasattr(result, 'target') and result.target: + # Truncate long paths for display + target_str = str(result.target) + if len(target_str) > 60: + target_str = "..." + target_str[-57:] + row.add_column("Target", target_str) + + # Hash + if hasattr(result, 'hash_hex') and result.hash_hex: + row.add_column("Hash", result.hash_hex[:16] + "...") # First 16 chars + + # Tags summary + if hasattr(result, 'tag_summary') and result.tag_summary: + tags_str = str(result.tag_summary) + if len(tags_str) > 60: + tags_str = tags_str[:57] + "..." + row.add_column("Tags", tags_str) + + # Duration (for media) + if hasattr(result, 'duration_seconds') and result.duration_seconds: + minutes = int(result.duration_seconds // 60) + seconds = int(result.duration_seconds % 60) + row.add_column("Duration", f"{minutes}m {seconds}s") + + # Size (for files) + if hasattr(result, 'size_bytes') and result.size_bytes: + size_mb = result.size_bytes / (1024 * 1024) + row.add_column("Size", f"{size_mb:.1f} MB") + + # Annotations + if hasattr(result, 'annotations') and result.annotations: + ann_str = ", ".join(str(a) for a in result.annotations) + if len(ann_str) > 50: + ann_str = ann_str[:47] + "..." + row.add_column("Annotations", ann_str) + + def _add_result_item(self, row: ResultRow, item: Any) -> None: + """Extract and add ResultItem fields to row (compact display for search results). + + Shows only essential columns: + - Title (required) + - Origin (source backend) + - Size (formatted MB, integer only) + + All other fields are stored in item but not displayed to keep table compact. + Use @row# syntax to pipe full item data to next command. + """ + # Title (required - use origin as fallback) + title = getattr(item, 'title', None) or getattr(item, 'origin', 'Unknown') + if title: + row.add_column("Title", title[:90] + ("..." 
if len(title) > 90 else "")) + + # Storage (source backend - hydrus, local, debrid, etc) + if hasattr(item, 'origin') and item.origin: + row.add_column("Storage", item.origin) + + # Size (for files) - integer MB only + if hasattr(item, 'size_bytes') and item.size_bytes: + size_mb = int(item.size_bytes / (1024 * 1024)) + row.add_column("Size", f"{size_mb} MB") + + def _add_tag_item(self, row: ResultRow, item: Any) -> None: + """Extract and add TagItem fields to row (compact tag display). + + Shows the Tag column with the tag name and Source column to identify + which storage backend the tags come from (Hydrus, local, etc.). + All data preserved in TagItem for piping and operations. + Use @1 to select a tag, @{1,3,5} to select multiple. + """ + # Tag name (truncate if too long) + if hasattr(item, 'tag_name') and item.tag_name: + tag_name = item.tag_name + if len(tag_name) > 60: + tag_name = tag_name[:57] + "..." + row.add_column("Tag", tag_name) + + # Source/Store (where the tags come from) + if hasattr(item, 'source') and item.source: + row.add_column("Store", item.source) + elif hasattr(item, 'origin') and item.origin: + row.add_column("Store", item.origin) + + + def _add_pipe_object(self, row: ResultRow, obj: Any) -> None: + """Extract and add PipeObject fields to row.""" + # Source and identifier + if hasattr(obj, 'source') and obj.source: + row.add_column("Source", obj.source) + + # Title + if hasattr(obj, 'title') and obj.title: + row.add_column("Title", obj.title[:50] + ("..." if len(obj.title) > 50 else "")) + + # File info + if hasattr(obj, 'file_path') and obj.file_path: + file_str = str(obj.file_path) + if len(file_str) > 60: + file_str = "..." + file_str[-57:] + row.add_column("Path", file_str) + + if hasattr(obj, 'file_hash') and obj.file_hash: + row.add_column("Hash", obj.file_hash[:16] + "...") + + # Tags + if hasattr(obj, 'tags') and obj.tags: + tags_str = ", ".join(obj.tags[:3]) # First 3 tags + if len(obj.tags) > 3: + tags_str += f", +{len(obj.tags) - 3} more" + row.add_column("Tags", tags_str) + + # Duration + if hasattr(obj, 'duration') and obj.duration: + row.add_column("Duration", f"{obj.duration:.1f}s") + + # Warnings + if hasattr(obj, 'warnings') and obj.warnings: + warnings_str = "; ".join(obj.warnings[:2]) + if len(obj.warnings) > 2: + warnings_str += f" (+{len(obj.warnings) - 2} more)" + row.add_column("Warnings", warnings_str) + + def _add_dict(self, row: ResultRow, data: Dict[str, Any]) -> None: + """Extract and add dict fields to row using first-match priority groups. + + Respects max_columns limit to keep table compact and readable. + + Special handling for 'columns' field: if present, uses it to populate row columns + instead of treating it as a regular field. This allows dynamic column definitions + from search providers. 
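+
+        For example (illustrative), a provider may supply:
+
+            {"title": "...", "columns": [("Title", "song.flac"), ("Store", "hydrus")]}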
+ + Priority field groups (uses first match within each group): + - title | name | filename + - origin | source + - type | media_kind | kind + - target | path | url + - hash | hash_hex | file_hash + - tags | tag_summary + - detail | description + """ + # Helper to determine if a field should be hidden from display + def is_hidden_field(field_name: Any) -> bool: + # Hide internal/metadata fields + hidden_fields = {'__', 'id', 'action', 'parent_id', 'is_temp', 'file_path', 'extra'} + if isinstance(field_name, str): + if field_name.startswith('__'): + return True + if field_name in hidden_fields: + return True + return False + + # Strip out hidden metadata fields (prefixed with __) + visible_data = {k: v for k, v in data.items() if not is_hidden_field(k)} + + # Track which fields we've already added to avoid duplicates + added_fields = set() + column_count = 0 # Track total columns added + + # Helper function to format values + def format_value(value: Any) -> str: + if isinstance(value, list): + formatted = ", ".join(str(v) for v in value[:3]) + if len(value) > 3: + formatted += f", +{len(value) - 3} more" + return formatted + return str(value) + + # Special handling for 'columns' field from search providers + # If present, use it to populate row columns dynamically + if 'columns' in visible_data and isinstance(visible_data['columns'], list) and visible_data['columns']: + try: + for col_name, col_value in visible_data['columns']: + # Skip the "#" column as ResultTable already adds row numbers + if col_name == '#': + continue + if column_count >= self.max_columns: + break + col_value_str = format_value(col_value) + if len(col_value_str) > 60: + col_value_str = col_value_str[:57] + "..." + row.add_column(col_name, col_value_str) + added_fields.add(col_name.lower()) + column_count += 1 + # Mark 'columns' as handled so we don't add it as a field + added_fields.add('columns') + # Also mark common fields that shouldn't be re-displayed if they're in columns + # This prevents showing both "Store" (from columns) and "Origin" (from data fields) + added_fields.add('origin') + added_fields.add('source') + added_fields.add('target') + added_fields.add('path') + added_fields.add('media_kind') + added_fields.add('detail') + added_fields.add('annotations') + added_fields.add('full_metadata') # Don't display full metadata as column + except Exception: + # Fall back to regular field handling if columns format is unexpected + pass + + # Only add priority groups if we haven't already filled columns from 'columns' field + if column_count == 0: + # Priority field groups - uses first matching field in each group + priority_groups = [ + ('title | name | filename', ['title', 'name', 'filename']), + ('origin | source', ['origin', 'source']), + ('type | media_kind | kind', ['type', 'media_kind', 'kind']), + ('target | path | url', ['target', 'path', 'url']), + ('hash | hash_hex | file_hash', ['hash', 'hash_hex', 'file_hash']), + ('tags | tag_summary', ['tags', 'tag_summary']), + ('detail | description', ['detail', 'description']), + ] + + # Add priority field groups first - use first match in each group + for _group_label, field_options in priority_groups: + if column_count >= self.max_columns: + break + for field in field_options: + if field in visible_data and field not in added_fields: + value_str = format_value(visible_data[field]) + if len(value_str) > 60: + value_str = value_str[:57] + "..." 
+
+                        row.add_column(field.replace('_', ' ').title(), value_str)
+                        added_fields.add(field)
+                        column_count += 1
+                        break  # Use first match in this group, skip rest
+
+        # Add remaining fields only if we haven't hit max_columns
+        # (fields already rendered via 'columns' or the priority groups are skipped)
+        if column_count < self.max_columns:
+            for key, value in visible_data.items():
+                if column_count >= self.max_columns:
+                    break
+                if key not in added_fields:  # Only add if not already added
+                    value_str = format_value(value)
+                    if len(value_str) > 40:
+                        value_str = value_str[:37] + "..."
+                    row.add_column(key.replace('_', ' ').title(), value_str)
+                    added_fields.add(key)  # Track in added_fields to prevent re-adding
+                    column_count += 1
+
+        # Check for selection args
+        if '_selection_args' in data:
+            row.selection_args = data['_selection_args']
+            # Don't display it
+            added_fields.add('_selection_args')
+
+    def _add_generic_object(self, row: ResultRow, obj: Any) -> None:
+        """Extract and add fields from generic objects."""
+        if hasattr(obj, '__dict__'):
+            for key, value in obj.__dict__.items():
+                if key.startswith('_'):  # Skip private attributes
+                    continue
+
+                value_str = str(value)
+                if len(value_str) > 60:
+                    value_str = value_str[:57] + "..."
+
+                row.add_column(key.replace('_', ' ').title(), value_str)
+
+    def format_plain(self) -> str:
+        """Format table as plain text with aligned columns and row numbers.
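+
+        The layout is roughly (illustrative):
+
+            #  | Title     | Storage | Size
+            ---+-----------+---------+------
+            1  | song.flac | local   | 12 MB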
+ + Returns: + Formatted table string + """ + if not self.rows: + return "No results" + + # Calculate column widths + col_widths: Dict[str, int] = {} + for row in self.rows: + for col in row.columns: + col_name = col.name + col_widths[col_name] = max( + col_widths.get(col_name, 0), + len(col.name), + len(col.value) + ) + + # Calculate row number column width + num_width = len(str(len(self.rows))) + 1 # +1 for padding + + lines = [] + + # Add title if present + if self.title: + lines.append("=" * self.title_width) + lines.append(self.title.center(self.title_width)) + lines.append("=" * self.title_width) + + # Add header with # column + header_parts = ["#".ljust(num_width)] + separator_parts = ["-" * num_width] + for col_name in col_widths: + width = min(col_widths[col_name], 90) # Cap column width (increased for expanded titles) + header_parts.append(col_name.ljust(width)) + separator_parts.append("-" * width) + + lines.append(" | ".join(header_parts)) + lines.append("-+-".join(separator_parts)) + + # Add rows with row numbers + for row_num, row in enumerate(self.rows, 1): + row_parts = [str(row_num).ljust(num_width)] + for col_name in col_widths: + width = min(col_widths[col_name], 90) # Increased cap for expanded titles + col_value = row.get_column(col_name) or "" + if len(col_value) > width: + col_value = col_value[:width - 3] + "..." + row_parts.append(col_value.ljust(width)) + lines.append(" | ".join(row_parts)) + + return "\n".join(lines) + + def format_compact(self) -> str: + """Format table in compact form (one line per row). + + Returns: + Formatted table string + """ + lines = [] + + if self.title: + lines.append(f"\n{self.title}") + lines.append("-" * len(self.title)) + + for i, row in enumerate(self.rows, 1): + row_str = " | ".join(str(col) for col in row.columns) + lines.append(f"{i}. {row_str}") + + return "\n".join(lines) + + def format_json(self) -> str: + """Format table as JSON. + + Returns: + JSON string + """ + data = { + "title": self.title, + "row_count": len(self.rows), + "rows": [row.to_list() for row in self.rows] + } + return json.dumps(data, indent=2) + + def to_dict(self) -> Dict[str, Any]: + """Convert table to dictionary. + + Returns: + Dictionary representation + """ + return { + "title": self.title, + "rows": [row.to_list() for row in self.rows] + } + + def __str__(self) -> str: + """String representation (plain text format).""" + return self.format_plain() + + def __repr__(self) -> str: + """Developer representation.""" + return f"ResultTable(title={self.title!r}, rows={len(self.rows)})" + + def __len__(self) -> int: + """Number of rows in the table.""" + return len(self.rows) + + def __iter__(self): + """Iterate over rows.""" + return iter(self.rows) + + def __getitem__(self, index: int) -> ResultRow: + """Get row by index.""" + return self.rows[index] + + def select_interactive(self, prompt: str = "Select an item", accept_args: bool = False) -> Optional[List[int]] | dict: + """Display table and get interactive user selection (single or multiple). 
+ + Supports multiple input formats: + - Single: "5" or "q" to quit + - Range: "3-5" (selects items 3, 4, 5) + - Multiple: "3,5,13" (selects items 3, 5, and 13) + - Combined: "1-3,7,9-11" (selects 1,2,3,7,9,10,11) + + If accept_args=True, also supports cmdlet arguments: + - "5 -storage hydrus" → returns indices [4] + args {"-storage": "hydrus"} + - "2-4 -storage hydrus -tag important" → returns indices [1,2,3] + multiple args + + Args: + prompt: Custom prompt text + accept_args: If True, parse and return cmdlet arguments from input + + Returns: + If accept_args=False: List of 0-based indices, or None if cancelled + If accept_args=True: Dict with "indices" and "args" keys, or None if cancelled + """ + # Display the table + print(f"\n{self}") + + # Get user input + while True: + try: + if accept_args: + choice = input(f"\n{prompt} (e.g., '5' or '2 -storage hydrus' or 'q' to quit): ").strip() + else: + choice = input(f"\n{prompt} (e.g., '5' or '3-5' or '1,3,5' or 'q' to quit): ").strip() + + if choice.lower() == 'q': + return None + + if accept_args: + # Parse selection and arguments + result = self._parse_selection_with_args(choice) + if result is not None: + return result + print(f"Invalid format. Use: selection (5 or 3-5 or 1,3,5) optionally followed by flags (e.g., '5 -storage hydrus').") + else: + # Parse just the selection + selected_indices = self._parse_selection(choice) + if selected_indices is not None: + return selected_indices + print(f"Invalid format. Use: single (5), range (3-5), list (1,3,5), combined (1-3,7,9-11), or 'q' to quit.") + except (ValueError, EOFError): + if accept_args: + print(f"Invalid format. Use: selection (5 or 3-5 or 1,3,5) optionally followed by flags (e.g., '5 -storage hydrus').") + else: + print(f"Invalid format. Use: single (5), range (3-5), list (1,3,5), combined (1-3,7,9-11), or 'q' to quit.") + + def _parse_selection(self, selection_str: str) -> Optional[List[int]]: + """Parse user selection string into list of 0-based indices. + + Supports: + - Single: "5" → [4] + - Range: "3-5" → [2, 3, 4] + - Multiple: "3,5,13" → [2, 4, 12] + - Combined: "1-3,7,9-11" → [0, 1, 2, 6, 8, 9, 10] + + Args: + selection_str: User input string + + Returns: + List of 0-based indices, or None if invalid + """ + indices = set() + + # Split by comma for multiple selections + parts = selection_str.split(',') + + for part in parts: + part = part.strip() + if not part: + continue + + # Check if it's a range (contains dash) + if '-' in part: + # Handle ranges like "3-5" + try: + range_parts = part.split('-') + if len(range_parts) != 2: + return None + + start = int(range_parts[0].strip()) + end = int(range_parts[1].strip()) + + # Validate range + if start < 1 or end < 1 or start > len(self.rows) or end > len(self.rows): + return None + + if start > end: + start, end = end, start + + # Add all indices in range (convert to 0-based) + for i in range(start, end + 1): + indices.add(i - 1) + + except (ValueError, IndexError): + return None + else: + # Single number + try: + num = int(part) + if num < 1 or num > len(self.rows): + return None + indices.add(num - 1) # Convert to 0-based + except ValueError: + return None + + if not indices: + return None + + # Return sorted list + return sorted(list(indices)) + + def _parse_selection_with_args(self, input_str: str) -> Optional[dict]: + """Parse user input into selection indices and cmdlet arguments. 
+ + Supports formats like: + - "5" → {"indices": [4], "args": {}} + - "2 -storage hydrus" → {"indices": [1], "args": {"-storage": "hydrus"}} + - "3-5 -storage hydrus -tag important" → {"indices": [2,3,4], "args": {"-storage": "hydrus", "-tag": "important"}} + + Args: + input_str: User input string with selection and optional flags + + Returns: + Dict with "indices" and "args" keys, or None if invalid + """ + parts = input_str.split() + if not parts: + return None + + # First part should be the selection + selection_str = parts[0] + selected_indices = self._parse_selection(selection_str) + + if selected_indices is None: + return None + + # Remaining parts are cmdlet arguments + cmdlet_args = {} + i = 1 + while i < len(parts): + part = parts[i] + + # Check if it's a flag (starts with -) + if part.startswith("-"): + flag = part + value = None + + # Get the value if it exists and doesn't start with - + if i + 1 < len(parts) and not parts[i + 1].startswith("-"): + value = parts[i + 1] + i += 2 + else: + i += 1 + + # Store the flag + if value is not None: + cmdlet_args[flag] = value + else: + cmdlet_args[flag] = True # Flag without value + else: + i += 1 + + return { + "indices": selected_indices, + "args": cmdlet_args + } + + def add_input_option(self, option: InputOption) -> "ResultTable": + """Add an interactive input option to the table. + + Input options allow users to specify cmdlet arguments interactively, + like choosing a download location or source. + + Args: + option: InputOption definition + + Returns: + Self for chaining + """ + self.input_options[option.name] = option + return self + + def select_option(self, option_name: str, prompt: str = "") -> Optional[str]: + """Interactively get user input for a specific option. + + Displays the option choices (if enum) and prompts user for input. + + Args: + option_name: Name of the option to get input for + prompt: Custom prompt text (uses option description if not provided) + + Returns: + User's selected/entered value, or None if cancelled + """ + if option_name not in self.input_options: + print(f"Unknown option: {option_name}") + return None + + option = self.input_options[option_name] + prompt_text = prompt or option.description or option_name + + while True: + try: + # For enum options, show choices + if option.type == "enum" and option.choices: + print(f"\n{prompt_text}") + for i, choice in enumerate(option.choices, 1): + print(f" {i}. {choice}") + + choice_input = input(f"Select {option_name} (1-{len(option.choices)}, or 'q' to cancel): ").strip() + + if choice_input.lower() == 'q': + return None + + try: + idx = int(choice_input) - 1 + if 0 <= idx < len(option.choices): + return option.choices[idx] + print(f"Invalid choice. Enter 1-{len(option.choices)}") + except ValueError: + print(f"Invalid choice. 
Enter 1-{len(option.choices)}") + + # For string/integer options, get direct input + elif option.type in ("string", "integer"): + value = input(f"{prompt_text} (or 'q' to cancel): ").strip() + + if value.lower() == 'q': + return None + + # Validate if validator provided + if option.validator and not option.validator(value): + print(f"Invalid value for {option_name}") + continue + + # Type conversion + if option.type == "integer": + try: + int(value) + except ValueError: + print(f"Must be an integer") + continue + + return value + + # For flag options + elif option.type == "flag": + response = input(f"{prompt_text} (y/n): ").strip().lower() + if response == 'q': + return None + return "true" if response in ('y', 'yes', 'true') else "false" + + except (ValueError, EOFError): + return None + + def get_all_options(self) -> Dict[str, str]: + """Get all input options at once with user prompts. + + Interactively prompts user for all registered options. + + Returns: + Dictionary mapping option names to selected values + """ + result = {} + for name, option in self.input_options.items(): + value = self.select_option(name) + if value is not None: + result[name] = value + return result + + def select_by_index(self, index: int) -> Optional[ResultRow]: + """Get a row by 1-based index (user-friendly). + + Args: + index: 1-based index + + Returns: + ResultRow if valid, None otherwise + """ + idx = index - 1 + if 0 <= idx < len(self.rows): + return self.rows[idx] + return None + + # TUI-specific formatting methods + + def to_datatable_rows(self, source: str = "unknown") -> List[List[str]]: + """Convert results to rows suitable for Textual DataTable widget. + + Args: + source: Source type for formatting context (openlibrary, soulseek, etc.) + + Returns: + List of row value lists + """ + rows = [] + for result in self.rows: + row_values = self._format_datatable_row(result, source) + rows.append(row_values) + return rows + + def _format_datatable_row(self, row: ResultRow, source: str = "unknown") -> List[str]: + """Format a ResultRow for DataTable display. + + Args: + row: ResultRow to format + source: Source type + + Returns: + List of column values as strings + """ + # Extract values from row columns + values = [col.value for col in row.columns] + + # Truncate to reasonable lengths for table display + return [v[:60] if len(v) > 60 else v for v in values] + + def to_result_cards(self) -> List[TUIResultCard]: + """Convert all rows to TUIResultCard objects for card-based UI display. + + Returns: + List of TUIResultCard objects + """ + cards = [] + for row in self.rows: + card = self._row_to_card(row) + cards.append(card) + return cards + + def _row_to_card(self, row: ResultRow) -> TUIResultCard: + """Convert a ResultRow to a TUIResultCard. 
+
+    def _row_to_card(self, row: ResultRow) -> TUIResultCard:
+        """Convert a ResultRow to a TUIResultCard.
+
+        Args:
+            row: ResultRow to convert
+
+        Returns:
+            TUIResultCard with extracted metadata
+        """
+        # Build metadata dict from row columns
+        metadata = {}
+        title = ""
+
+        for col in row.columns:
+            if col.name.lower() == "title":
+                title = col.value
+            metadata[col.name] = col.value
+
+        # Extract tags if present
+        tags = []
+        if "tags" in metadata:
+            tags_val = metadata["tags"]
+            if tags_val:
+                tags = [t.strip() for t in tags_val.split(",")][:5]
+
+        # Try to find useful metadata fields
+        subtitle = metadata.get("Artist", metadata.get("Author", ""))
+        media_kind = metadata.get("Type", metadata.get("Media Kind", ""))
+        file_size = metadata.get("Size", "")
+        duration = metadata.get("Duration", "")
+        file_hash = metadata.get("Hash", "")
+
+        return TUIResultCard(
+            title=title or "Unknown",
+            subtitle=subtitle,
+            metadata=metadata,
+            media_kind=media_kind,
+            tags=tags,
+            file_hash=file_hash or None,
+            file_size=file_size or None,
+            duration=duration or None
+        )
+
+    def build_metadata_tree(self, tree_widget: "Tree") -> None:
+        """Populate a Textual Tree widget with result metadata hierarchy.
+
+        Args:
+            tree_widget: Textual Tree widget to populate
+
+        Raises:
+            ImportError: If Textual not available
+        """
+        if not TEXTUAL_AVAILABLE:
+            raise ImportError("Textual not available for tree building")
+
+        tree_widget.reset()
+        root = tree_widget.root
+
+        # Add each row as a top-level node
+        for i, row in enumerate(self.rows, 1):
+            row_node = root.add(f"[bold]Result {i}[/bold]")
+
+            # Add columns as children
+            for col in row.columns:
+                value_str = col.value
+                if len(value_str) > 100:
+                    value_str = value_str[:97] + "..."
+                row_node.add_leaf(f"[cyan]{col.name}[/cyan]: {value_str}")
+
+
+def _format_duration(duration: Any) -> str:
+    """Format duration value as human-readable string.
+
+    Args:
+        duration: Duration in seconds, milliseconds, or already formatted string
+
+    Returns:
+        Formatted duration string (e.g., "2h 18m 5s", "5m 30s")
+    """
+    if isinstance(duration, str):
+        return duration if duration else ""
+
+    try:
+        # Convert to seconds if needed
+        if isinstance(duration, (int, float)):
+            seconds = int(duration)
+            if seconds < 1000:  # Likely already in seconds
+                pass
+            else:  # Likely in milliseconds
+                seconds = seconds // 1000
+        else:
+            return ""
+
+        hours = seconds // 3600
+        minutes = (seconds % 3600) // 60
+        secs = seconds % 60
+
+        if hours > 0:
+            return f"{hours}h {minutes}m {secs}s"
+        elif minutes > 0:
+            return f"{minutes}m {secs}s"
+        else:
+            return f"{secs}s"
+    except (ValueError, TypeError):
+        return ""
+
+
+def _format_size(size: Any) -> str:
+    """Format file size as human-readable string.
+
+    Args:
+        size: Size in bytes or already formatted string
+
+    Returns:
+        Formatted size string (e.g., "1.5 MB", "250 KB")
+    """
+    if isinstance(size, str):
+        return size if size else ""
+
+    try:
+        bytes_val = int(size)
+        if bytes_val < 0:
+            return ""
+
+        for unit, divisor in [("GB", 1024**3), ("MB", 1024**2), ("KB", 1024)]:
+            if bytes_val >= divisor:
+                return f"{bytes_val / divisor:.1f} {unit}"
+
+        return f"{bytes_val} B"
+    except (ValueError, TypeError):
+        return ""
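+
+
+# Worked examples for the two helpers above (values chosen to exercise each
+# branch; note the heuristic that numeric durations >= 1000 are treated as
+# milliseconds):
+#
+#   _format_duration(330)      -> "5m 30s"
+#   _format_duration(8285000)  -> "2h 18m 5s"   (8285 s = 2*3600 + 18*60 + 5)
+#   _format_size(1572864)      -> "1.5 MB"      (1.5 * 1024**2 bytes)
+#   _format_size(512)          -> "512 B"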
+
+
+def format_result(result: Any, title: str = "") -> str:
+    """Quick function to format a single result or list of results.
+
+    Args:
+        result: Result object, list of results, or dict
+        title: Optional title for the table
+
+    Returns:
+        Formatted string
+    """
+    table = ResultTable(title)
+
+    if isinstance(result, list):
+        for item in result:
+            table.add_result(item)
+    else:
+        table.add_result(result)
+
+    return str(table)
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..558717a
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,39 @@
+"""
+Setup configuration for Medeia-Macina.
+
+Medeia-Macina is a comprehensive media and data management system with support for:
+- Video downloading from multiple sources (YouTube, etc.)
+- Local and cloud-based file storage
+- Advanced metadata and tag management
+- Full-featured TUI and CLI interfaces
+"""
+
+from setuptools import setup, find_packages
+
+with open("requirements.txt") as f:
+    requirements = [line.strip() for line in f if line.strip() and not line.startswith("#")]
+
+setup(
+    name="medeia-macina",
+    version="1.0.0",
+    description="Comprehensive media and data management system",
+    author="Anonymous",
+    python_requires=">=3.9",
+    packages=find_packages(exclude=["tests", "*.tests"]),
+    install_requires=requirements,
+    entry_points={
+        "console_scripts": [
+            "mm=medeia_macina.cli_entry:main",
+            "medeia=medeia_macina.cli_entry:main",
+        ],
+    },
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
+        "Programming Language :: Python :: 3.12",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+    ],
+)
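
Once the patch is applied, an editable install exposes the two console scripts
declared in setup.py; the commands below sketch the standard setuptools
workflow rather than output captured from this repository:

    pip install -e .
    mm          # or: medeia (both map to medeia_macina.cli_entry:main)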