"""Search-file cmdlet: Search for files by query, tags, size, type, duration, etc.""" from __future__ import annotations from typing import Any, Dict, Sequence, List, Optional, Tuple from pathlib import Path from dataclasses import dataclass, field from collections import OrderedDict import re import json import sys from SYS.logger import log, debug from Provider.registry import get_search_provider from cmdlets._shared import Cmdlet, CmdletArg, get_field, should_show_help import pipeline as ctx def get_origin(obj: Any, default: Any = None) -> Any: """Return the canonical origin/table identifier from a payload-like object.""" value = get_field(obj, "origin", None) if value is not None: return value value = get_field(obj, "table", None) if value is not None: return value value = get_field(obj, "store", None) if value is not None: return value return default # Optional dependencies try: import mutagen # type: ignore except ImportError: # pragma: no cover mutagen = None # type: ignore try: from config import get_hydrus_url, resolve_output_dir except Exception: # pragma: no cover get_hydrus_url = None # type: ignore resolve_output_dir = None # type: ignore try: from API.HydrusNetwork import HydrusClient, HydrusRequestError except ImportError: # pragma: no cover HydrusClient = None # type: ignore HydrusRequestError = RuntimeError # type: ignore try: from SYS.utils import sha256_file except ImportError: # pragma: no cover sha256_file = None # type: ignore try: from SYS.utils_constant import mime_maps except ImportError: # pragma: no cover mime_maps = {} # type: ignore @dataclass(slots=True) class SearchRecord: path: str size_bytes: int | None = None duration_seconds: str | None = None tags: str | None = None hash: str | None = None def as_dict(self) -> dict[str, str]: payload: dict[str, str] = {"path": self.path} if self.size_bytes is not None: payload["size"] = str(self.size_bytes) if self.duration_seconds: payload["duration"] = self.duration_seconds if self.tags: payload["tags"] = self.tags if self.hash: payload["hash"] = self.hash return payload @dataclass class ResultItem: table: str # Renamed from origin title: str detail: str annotations: List[str] target: str media_kind: str = "other" hash: Optional[str] = None columns: List[tuple[str, str]] = field(default_factory=list) tag_summary: Optional[str] = None duration_seconds: Optional[float] = None size_bytes: Optional[int] = None full_metadata: Optional[Dict[str, Any]] = None tags: Optional[set[str]] = field(default_factory=set) relationships: Optional[List[str]] = field(default_factory=list) known_urls: Optional[List[str]] = field(default_factory=list) @property def origin(self) -> str: return self.table def to_dict(self) -> Dict[str, Any]: payload: Dict[str, Any] = { "title": self.title, } # Always include these core fields for downstream cmdlets (get-file, download-data, etc) payload["table"] = self.table payload["target"] = self.target payload["media_kind"] = self.media_kind # Always include full_metadata if present (needed by download-data, etc) # This is NOT for display, but for downstream processing if self.full_metadata: payload["full_metadata"] = self.full_metadata # Include columns if defined (result renderer will use these for display) if self.columns: payload["columns"] = list(self.columns) else: # If no columns, include the detail for backwards compatibility payload["detail"] = self.detail payload["annotations"] = list(self.annotations) # Include optional fields if self.hash: payload["hash"] = self.hash if self.tag_summary: payload["tags"] = self.tag_summary if self.tags: payload["tags_set"] = list(self.tags) if self.relationships: payload["relationships"] = self.relationships if self.known_urls: payload["known_urls"] = self.known_urls return payload STORAGE_ORIGINS = {"local", "hydrus", "debrid"} class Search_File(Cmdlet): """Class-based search-file cmdlet with self-registration.""" def __init__(self) -> None: super().__init__( name="search-file", summary="Unified search cmdlet for storage (Hydrus, Local) and providers (Debrid, LibGen, OpenLibrary, Soulseek).", usage="search-file [query] [-tag TAG] [-size >100MB|<50MB] [-type audio|video|image] [-duration >10:00] [-store BACKEND] [-provider PROVIDER]", arg=[ CmdletArg("query", description="Search query string"), CmdletArg("tag", description="Filter by tag (can be used multiple times)"), CmdletArg("size", description="Filter by size: >100MB, <50MB, =10MB"), CmdletArg("type", description="Filter by type: audio, video, image, document"), CmdletArg("duration", description="Filter by duration: >10:00, <1:30:00"), CmdletArg("limit", type="integer", description="Limit results (default: 45)"), CmdletArg("store", description="Search storage backend: hydrus, local (default: all searchable storages)"), CmdletArg("provider", description="Search provider: libgen, openlibrary, soulseek, debrid, local (overrides -storage)"), ], detail=[ "Search across storage (Hydrus, Local) and providers (Debrid, LibGen, OpenLibrary, Soulseek)", "Use -provider to search a specific source, or -store to search file backends", "Filter results by: tag, size, type, duration", "Results can be piped to other commands", "Examples:", "search-file foo # Search all file backends", "search-file -provider libgen 'python programming' # Search LibGen books", "search-file -provider debrid 'movie' # Search AllDebrid magnets", "search-file 'music' -provider soulseek # Search Soulseek P2P", "search-file -provider openlibrary 'tolkien' # Search OpenLibrary", "search-file song -store hydrus -type audio # Search only Hydrus audio", "search-file movie -tag action -provider debrid # Debrid with filters", ], exec=self.run, ) self.register() # --- Helper methods ------------------------------------------------- @staticmethod def _normalize_extension(ext_value: Any) -> str: """Sanitize extension strings to alphanumerics and cap at 5 chars.""" ext = str(ext_value or "").strip().lstrip(".") for sep in (" ", "|", "(", "[", "{", ",", ";"): if sep in ext: ext = ext.split(sep, 1)[0] break if "." in ext: ext = ext.split(".")[-1] ext = "".join(ch for ch in ext if ch.isalnum()) return ext[:5] def _ensure_storage_columns(self, payload: Dict[str, Any]) -> Dict[str, Any]: """Ensure storage results have the necessary fields for result_table display.""" store_value = str(get_origin(payload, "") or "").lower() if store_value not in STORAGE_ORIGINS: return payload # Ensure we have title field if "title" not in payload: payload["title"] = payload.get("name") or payload.get("target") or payload.get("path") or "Result" # Ensure we have ext field if "ext" not in payload: title = str(payload.get("title", "")) path_obj = Path(title) if path_obj.suffix: payload["ext"] = self._normalize_extension(path_obj.suffix.lstrip('.')) else: payload["ext"] = payload.get("ext", "") # Ensure size_bytes is present for display (already set by search_file()) # result_table will handle formatting it # Don't create manual columns - let result_table handle display # This allows the table to respect max_columns and apply consistent formatting return payload # --- Execution ------------------------------------------------------ def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: """Search across multiple providers: Hydrus, Local, Debrid, LibGen, etc.""" if should_show_help(args): log(f"Cmdlet: {self.name}\nSummary: {self.summary}\nUsage: {self.usage}") return 0 args_list = [str(arg) for arg in (args or [])] # Parse arguments query = "" tag_filters: List[str] = [] size_filter: Optional[Tuple[str, int]] = None duration_filter: Optional[Tuple[str, float]] = None type_filter: Optional[str] = None storage_backend: Optional[str] = None provider_name: Optional[str] = None limit = 45 searched_backends: List[str] = [] i = 0 while i < len(args_list): arg = args_list[i] low = arg.lower() if low in {"-provider", "--provider"} and i + 1 < len(args_list): provider_name = args_list[i + 1].lower() i += 2 elif low in {"-store", "--store", "-storage", "--storage"} and i + 1 < len(args_list): storage_backend = args_list[i + 1].lower() i += 2 elif low in {"-tag", "--tag"} and i + 1 < len(args_list): tag_filters.append(args_list[i + 1]) i += 2 elif low in {"-limit", "--limit"} and i + 1 < len(args_list): try: limit = int(args_list[i + 1]) except ValueError: limit = 100 i += 2 elif low in {"-type", "--type"} and i + 1 < len(args_list): type_filter = args_list[i + 1].lower() i += 2 elif not arg.startswith("-"): query = f"{query} {arg}".strip() if query else arg i += 1 else: i += 1 store_filter: Optional[str] = None if query: match = re.search(r"\bstore:([^\s,]+)", query, flags=re.IGNORECASE) if match: store_filter = match.group(1).strip().lower() or None query = re.sub(r"\s*[,]?\s*store:[^\s,]+", " ", query, flags=re.IGNORECASE) query = re.sub(r"\s{2,}", " ", query) query = query.strip().strip(',') if storage_backend and storage_backend.lower() == "debrid": log("Use -provider debrid instead of -store debrid (debrid is provider-only)", file=sys.stderr) return 1 if store_filter and not provider_name and not storage_backend: if store_filter in {"hydrus", "local", "debrid"}: storage_backend = store_filter # --- Feature: Filter provider result table by Name column --- filter_after_search: Optional[str] = None if result: actual_result = result[0] if isinstance(result, list) and result else result origin = get_origin(actual_result) target = get_field(actual_result, 'target') # If the incoming result is from a provider (not storage) AND this invocation looks like a filter (no flags) positional_args = [a for a in args_list if not a.startswith('-')] no_flags = len(positional_args) == len(args_list) looks_like_filter = no_flags and len(positional_args) == 1 and not provider_name and not storage_backend and not tag_filters and not size_filter and not duration_filter and not type_filter if origin and origin.lower() not in STORAGE_ORIGINS and looks_like_filter and query: # Save the filter string to apply AFTER loading the provider data filter_after_search = query.strip() query = "" # Clear query so we load the target URL instead # If result is from a provider, extract the target as query and set provider if not query: if origin == 'bandcamp' and target: query = target if not provider_name: provider_name = 'bandcamp' elif origin == 'youtube' and target: query = target if not provider_name: provider_name = 'youtube' elif target and str(target).startswith(('http://', 'https://')): query = target if not provider_name: if 'bandcamp.com' in target: provider_name = 'bandcamp' elif 'youtube.com' in target or 'youtu.be' in target: provider_name = 'youtube' if not query: log("Provide a search query", file=sys.stderr) return 1 from API.folder import API_folder_store from config import get_local_storage_path import uuid worker_id = str(uuid.uuid4()) library_root = get_local_storage_path(config or {}) if not library_root: log("No library root configured", file=sys.stderr) return 1 db = None try: db = API_folder_store(library_root) db.insert_worker( worker_id, "search", title=f"Search: {query}", description=f"Query: {query}", pipe=ctx.get_current_command_text() ) results_list = [] import result_table import importlib importlib.reload(result_table) from result_table import ResultTable table_title = f"Search: {query}" if provider_name: table_title += f" [{provider_name}]" elif storage_backend: table_title += f" [{storage_backend}]" preserve_order = provider_name and provider_name.lower() in ('youtube', 'openlibrary') # Avoid setting source_command so @N does not re-run search-file; preserve row order when needed table = ResultTable(table_title).set_preserve_order(preserve_order) if provider_name: debug(f"[search_file] Attempting provider search with: {provider_name}") provider = get_search_provider(provider_name, config) if not provider: log(f"Provider '{provider_name}' not available", file=sys.stderr) db.update_worker_status(worker_id, 'error') return 1 debug(f"[search_file] Provider loaded, calling search with query: {query}") search_result = provider.search(query, limit=limit) debug(f"[search_file] Provider search returned {len(search_result)} results") # Apply post-search filter if one was set if filter_after_search: debug(f"[search_file] Applying filter: {filter_after_search}") filtered_result = [] for item in search_result: item_dict = item.to_dict() if hasattr(item, 'to_dict') else dict(item) title_val = get_field(item_dict, 'title') or get_field(item_dict, 'name') or "" if filter_after_search.lower() in str(title_val).lower(): filtered_result.append(item) search_result = filtered_result if not search_result: log(f"No results match filter: '{filter_after_search}'", file=sys.stderr) db.update_worker_status(worker_id, 'completed') return 0 debug(f"[search_file] Filter matched {len(search_result)} results") table.title = f"Filter: {filter_after_search}" for item in search_result: table.add_result(item) item_dict = item.to_dict() results_list.append(item_dict) ctx.emit(item_dict) ctx.set_last_result_table(table, results_list) debug(f"[search_file] Emitted {len(results_list)} results") db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2)) db.update_worker_status(worker_id, 'completed') return 0 from Store import Store storage = Store(config=config or {}, suppress_debug=True) backend_to_search = storage_backend or None if backend_to_search: if backend_to_search == "hydrus": from API.HydrusNetwork import is_hydrus_available if not is_hydrus_available(config or {}): log(f"Backend 'hydrus' is not available (Hydrus service not running)", file=sys.stderr) db.update_worker_status(worker_id, 'error') return 1 searched_backends.append(backend_to_search) target_backend = storage[backend_to_search] results = target_backend.search_store(query, limit=limit) else: from API.HydrusNetwork import is_hydrus_available hydrus_available = is_hydrus_available(config or {}) all_results = [] for backend_name in storage.list_searchable_backends(): if backend_name == "hydrus" and not hydrus_available: continue searched_backends.append(backend_name) try: backend_results = storage[backend_name].search_store(query, limit=limit - len(all_results)) if backend_results: all_results.extend(backend_results) if len(all_results) >= limit: break except Exception as exc: log(f"Backend {backend_name} search failed: {exc}", file=sys.stderr) results = all_results[:limit] if not provider_name and not storage_backend: try: debrid_provider = get_search_provider("debrid", config) if debrid_provider and debrid_provider.validate(): remaining = max(0, limit - len(results)) if isinstance(results, list) else limit if remaining > 0: debrid_results = debrid_provider.search(query, limit=remaining) if debrid_results: if "debrid" not in searched_backends: searched_backends.append("debrid") if results is None: results = [] results.extend(debrid_results) except Exception as exc: log(f"Debrid provider search failed: {exc}", file=sys.stderr) def _format_storage_label(name: str) -> str: clean = str(name or "").strip() if not clean: return "Unknown" return clean.replace("_", " ").title() storage_counts: OrderedDict[str, int] = OrderedDict((name, 0) for name in searched_backends) for item in results or []: origin = get_origin(item) if not origin: continue key = str(origin).lower() if key not in storage_counts: storage_counts[key] = 0 storage_counts[key] += 1 if storage_counts or query: display_counts = OrderedDict((_format_storage_label(name), count) for name, count in storage_counts.items()) summary_line = table.set_storage_summary(display_counts, query, inline=True) if summary_line: table.title = summary_line if results: for item in results: def _as_dict(obj: Any) -> Dict[str, Any]: if isinstance(obj, dict): return dict(obj) if hasattr(obj, "to_dict") and callable(getattr(obj, "to_dict")): return obj.to_dict() # type: ignore[arg-type] return {"title": str(obj)} item_dict = _as_dict(item) if store_filter: origin_val = str(get_origin(item_dict) or "").lower() if store_filter != origin_val: continue normalized = self._ensure_storage_columns(item_dict) # Make hash/store available for downstream cmdlets without rerunning search-file hash_val = normalized.get("hash") store_val = normalized.get("store") or get_origin(item_dict) if hash_val and not normalized.get("hash"): normalized["hash"] = hash_val if store_val and not normalized.get("store"): normalized["store"] = store_val table.add_result(normalized) results_list.append(normalized) ctx.emit(normalized) ctx.set_last_result_table(table, results_list) db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2)) else: log("No results found", file=sys.stderr) db.append_worker_stdout(worker_id, json.dumps([], indent=2)) db.update_worker_status(worker_id, 'completed') return 0 except Exception as exc: log(f"Search failed: {exc}", file=sys.stderr) import traceback traceback.print_exc(file=sys.stderr) if db: try: db.update_worker_status(worker_id, 'error') except Exception: pass return 1 finally: if db: try: db.close() except Exception: pass CMDLET = Search_File()