"""Search-file cmdlet: Search for files by query, tag, size, type, duration, etc."""
from __future__ import annotations
from typing import Any, Dict, Sequence, List, Optional, Tuple
from pathlib import Path
from dataclasses import dataclass, field
from collections import OrderedDict
import re
import json
import sys
from SYS.logger import log, debug
from Provider.registry import get_search_provider
from cmdlets._shared import Cmdlet, CmdletArg, get_field, should_show_help
import pipeline as ctx


def get_origin(obj: Any, default: Any = None) -> Any:
    """Return the canonical origin/table identifier from a payload-like object."""
    value = get_field(obj, "origin", None)
    if value is not None:
        return value
    value = get_field(obj, "table", None)
    if value is not None:
        return value
    value = get_field(obj, "store", None)
    if value is not None:
        return value
    return default
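
# Illustrative examples (assuming get_field reads plain dict keys):
#   get_origin({"table": "hydrus"})   -> "hydrus"
#   get_origin({"store": "local"})    -> "local"
#   get_origin({}, default="unknown") -> "unknown"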

# Optional dependencies
try:
    import mutagen  # type: ignore
except ImportError:  # pragma: no cover
    mutagen = None  # type: ignore

try:
    from config import get_hydrus_url, resolve_output_dir
except Exception:  # pragma: no cover
    get_hydrus_url = None  # type: ignore
    resolve_output_dir = None  # type: ignore

try:
    from API.HydrusNetwork import HydrusNetwork, HydrusRequestError
except ImportError:  # pragma: no cover
    HydrusNetwork = None  # type: ignore
    HydrusRequestError = RuntimeError  # type: ignore

try:
    from SYS.utils import sha256_file
except ImportError:  # pragma: no cover
    sha256_file = None  # type: ignore

try:
    from SYS.utils_constant import mime_maps
except ImportError:  # pragma: no cover
    mime_maps = {}  # type: ignore


@dataclass(slots=True)
class SearchRecord:
    path: str
    size_bytes: int | None = None
    duration_seconds: str | None = None
    tag: str | None = None
    hash: str | None = None

    def as_dict(self) -> dict[str, str]:
        payload: dict[str, str] = {"path": self.path}
        if self.size_bytes is not None:
            payload["size"] = str(self.size_bytes)
        if self.duration_seconds:
            payload["duration"] = self.duration_seconds
        if self.tag:
            payload["tag"] = self.tag
        if self.hash:
            payload["hash"] = self.hash
        return payload
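
# Illustrative example (derived from as_dict above):
#   SearchRecord(path="/tmp/a.mp3", size_bytes=123, duration_seconds="3:05").as_dict()
#   -> {"path": "/tmp/a.mp3", "size": "123", "duration": "3:05"}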


@dataclass
class ResultItem:
    table: str  # Renamed from origin
    title: str
    detail: str
    annotations: List[str]
    target: str
    media_kind: str = "other"
    hash: Optional[str] = None
    columns: List[tuple[str, str]] = field(default_factory=list)
    tag_summary: Optional[str] = None
    duration_seconds: Optional[float] = None
    size_bytes: Optional[int] = None
    full_metadata: Optional[Dict[str, Any]] = None
    tag: Optional[set[str]] = field(default_factory=set)
    relationships: Optional[List[str]] = field(default_factory=list)
    known_urls: Optional[List[str]] = field(default_factory=list)

    @property
    def origin(self) -> str:
        return self.table

    def to_dict(self) -> Dict[str, Any]:
        payload: Dict[str, Any] = {"title": self.title}
        # Always include these core fields for downstream cmdlets
        # (get-file, download-data, etc.).
        payload["table"] = self.table
        payload["target"] = self.target
        payload["media_kind"] = self.media_kind
        # Always include full_metadata if present (needed by download-data, etc.).
        # This is NOT for display, but for downstream processing.
        if self.full_metadata:
            payload["full_metadata"] = self.full_metadata
        # Include columns if defined (the result renderer uses these for display).
        if self.columns:
            payload["columns"] = list(self.columns)
        else:
            # If no columns, include the detail for backwards compatibility.
            payload["detail"] = self.detail
        payload["annotations"] = list(self.annotations)
        # Optional fields.
        if self.hash:
            payload["hash"] = self.hash
        if self.tag_summary:
            payload["tag_summary"] = self.tag_summary
        if self.tag:
            payload["tag"] = list(self.tag)
        if self.relationships:
            payload["relationships"] = self.relationships
        if self.known_urls:
            payload["known_urls"] = self.known_urls
        return payload
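
# Illustrative example (derived from to_dict above):
#   ResultItem(table="hydrus", title="Song", detail="mp3",
#              annotations=[], target="sha256:abc").to_dict()
#   -> {"title": "Song", "table": "hydrus", "target": "sha256:abc",
#       "media_kind": "other", "detail": "mp3", "annotations": []}
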
STORAGE_ORIGINS = {"local", "hydrus", "debrid"}


class Search_File(Cmdlet):
    """Class-based search-file cmdlet with self-registration."""

    def __init__(self) -> None:
        super().__init__(
            name="search-file",
            summary="Unified search cmdlet for storage (Hydrus, Local) and providers (Debrid, LibGen, OpenLibrary, Soulseek).",
            usage="search-file [query] [-tag TAG] [-size >100MB|<50MB] [-type audio|video|image] [-duration >10:00] [-store BACKEND] [-provider PROVIDER]",
            arg=[
                CmdletArg("query", description="Search query string"),
                CmdletArg("tag", description="Filter by tag (can be used multiple times)"),
                CmdletArg("size", description="Filter by size: >100MB, <50MB, =10MB"),
                CmdletArg("type", description="Filter by type: audio, video, image, document"),
                CmdletArg("duration", description="Filter by duration: >10:00, <1:30:00"),
                CmdletArg("limit", type="integer", description="Limit results (default: 45)"),
                CmdletArg("store", description="Search storage backend: hydrus, local (default: all searchable storages)"),
                CmdletArg("provider", description="Search provider: libgen, openlibrary, soulseek, debrid, local (overrides -store)"),
            ],
            detail=[
                "Search across storage (Hydrus, Local) and providers (Debrid, LibGen, OpenLibrary, Soulseek)",
                "Use -provider to search a specific source, or -store to search file backends",
                "Filter results by: tag, size, type, duration",
                "Results can be piped to other commands",
                "Examples:",
                "search-file foo                                    # Search all file backends",
                "search-file -provider libgen 'python programming'  # Search LibGen books",
                "search-file -provider debrid 'movie'               # Search AllDebrid magnets",
                "search-file 'music' -provider soulseek             # Search Soulseek P2P",
                "search-file -provider openlibrary 'tolkien'        # Search OpenLibrary",
                "search-file song -store hydrus -type audio         # Search only Hydrus audio",
                "search-file movie -tag action -provider debrid     # Debrid with filters",
            ],
            exec=self.run,
        )
        self.register()

    # --- Helper methods -------------------------------------------------

    @staticmethod
    def _normalize_extension(ext_value: Any) -> str:
        """Sanitize extension strings to alphanumerics and cap at 5 chars."""
        ext = str(ext_value or "").strip().lstrip(".")
        for sep in (" ", "|", "(", "[", "{", ",", ";"):
            if sep in ext:
                ext = ext.split(sep, 1)[0]
                break
        if "." in ext:
            ext = ext.split(".")[-1]
        ext = "".join(ch for ch in ext if ch.isalnum())
        return ext[:5]
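
    # Illustrative examples (traced from the logic above):
    #   _normalize_extension(".mp3")          -> "mp3"
    #   _normalize_extension("tar.gz")        -> "gz"
    #   _normalize_extension("MP3 (320kbps)") -> "MP3"  (case is preserved)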

    def _ensure_storage_columns(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        """Ensure storage results have the necessary fields for result_table display."""
        store_value = str(get_origin(payload, "") or "").lower()
        if store_value not in STORAGE_ORIGINS:
            return payload
        # Ensure we have a title field.
        if "title" not in payload:
            payload["title"] = payload.get("name") or payload.get("target") or payload.get("path") or "Result"
        # Ensure we have an ext field.
        if "ext" not in payload:
            title = str(payload.get("title", ""))
            path_obj = Path(title)
            if path_obj.suffix:
                payload["ext"] = self._normalize_extension(path_obj.suffix.lstrip('.'))
            else:
                payload["ext"] = ""
        # size_bytes is already set by search_file(); result_table formats it.
        # Don't create manual columns -- let result_table handle display so it
        # can respect max_columns and apply consistent formatting.
        return payload

    # --- Execution ------------------------------------------------------

    def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
        """Search across multiple providers: Hydrus, Local, Debrid, LibGen, etc."""
        if should_show_help(args):
            log(f"Cmdlet: {self.name}\nSummary: {self.summary}\nUsage: {self.usage}")
            return 0
        args_list = [str(arg) for arg in (args or [])]

        # Parse arguments. Note: -size and -duration are documented in usage but
        # have no parse branch below yet; the two filter variables only feed the
        # looks_like_filter heuristic further down.
        query = ""
        tag_filters: List[str] = []
        size_filter: Optional[Tuple[str, int]] = None
        duration_filter: Optional[Tuple[str, float]] = None
        type_filter: Optional[str] = None
        storage_backend: Optional[str] = None
        provider_name: Optional[str] = None
        limit = 45
        searched_backends: List[str] = []

        i = 0
        while i < len(args_list):
            arg = args_list[i]
            low = arg.lower()
            if low in {"-provider", "--provider"} and i + 1 < len(args_list):
                provider_name = args_list[i + 1].lower()
                i += 2
            elif low in {"-store", "--store", "-storage", "--storage"} and i + 1 < len(args_list):
                storage_backend = args_list[i + 1].lower()
                i += 2
            elif low in {"-tag", "--tag"} and i + 1 < len(args_list):
                tag_filters.append(args_list[i + 1])
                i += 2
            elif low in {"-limit", "--limit"} and i + 1 < len(args_list):
                try:
                    limit = int(args_list[i + 1])
                except ValueError:
                    limit = 100
                i += 2
            elif low in {"-type", "--type"} and i + 1 < len(args_list):
                type_filter = args_list[i + 1].lower()
                i += 2
            elif not arg.startswith("-"):
                query = f"{query} {arg}".strip() if query else arg
                i += 1
            else:
                i += 1

        store_filter: Optional[str] = None
        if query:
            match = re.search(r"\bstore:([^\s,]+)", query, flags=re.IGNORECASE)
            if match:
                store_filter = match.group(1).strip().lower() or None
                query = re.sub(r"\s*[,]?\s*store:[^\s,]+", " ", query, flags=re.IGNORECASE)
                query = re.sub(r"\s{2,}", " ", query)
                query = query.strip().strip(',')

        if storage_backend and storage_backend.lower() == "debrid":
            log("Use -provider debrid instead of -store debrid (debrid is provider-only)", file=sys.stderr)
            return 1
        if store_filter and not provider_name and not storage_backend:
            if store_filter in {"hydrus", "local", "debrid"}:
                storage_backend = store_filter
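
        # Illustrative: `search-file beatles store:hydrus` leaves query="beatles"
        # and routes the search to the hydrus backend via the store: token above.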

        # --- Feature: Filter provider result table by Name column ---
        filter_after_search: Optional[str] = None
        if result:
            actual_result = result[0] if isinstance(result, list) and result else result
            origin = get_origin(actual_result)
            target = get_field(actual_result, 'target')
            # If the incoming result is from a provider (not storage) AND this
            # invocation looks like a filter (no flags):
            positional_args = [a for a in args_list if not a.startswith('-')]
            no_flags = len(positional_args) == len(args_list)
            looks_like_filter = (
                no_flags
                and len(positional_args) == 1
                and not provider_name
                and not storage_backend
                and not tag_filters
                and not size_filter
                and not duration_filter
                and not type_filter
            )
            if origin and origin.lower() not in STORAGE_ORIGINS and looks_like_filter and query:
                # Save the filter string to apply AFTER loading the provider data.
                filter_after_search = query.strip()
                query = ""  # Clear query so we load the target URL instead.
            # If result is from a provider, extract the target as query and set provider.
            if not query:
                if origin == 'bandcamp' and target:
                    query = target
                    if not provider_name:
                        provider_name = 'bandcamp'
                elif origin == 'youtube' and target:
                    query = target
                    if not provider_name:
                        provider_name = 'youtube'
                elif target and str(target).startswith(('http://', 'https://')):
                    query = target
                    if not provider_name:
                        if 'bandcamp.com' in target:
                            provider_name = 'bandcamp'
                        elif 'youtube.com' in target or 'youtu.be' in target:
                            provider_name = 'youtube'

        if not query:
            log("Provide a search query", file=sys.stderr)
            return 1

        from API.folder import API_folder_store
        from config import get_local_storage_path
        import uuid

        worker_id = str(uuid.uuid4())
        library_root = get_local_storage_path(config or {})
        if not library_root:
            log("No library root configured", file=sys.stderr)
            return 1
        db = None
        try:
            db = API_folder_store(library_root)
            db.insert_worker(
                worker_id,
                "search",
                title=f"Search: {query}",
                description=f"Query: {query}",
                pipe=ctx.get_current_command_text(),
            )
            results_list = []

            import result_table
            import importlib
            importlib.reload(result_table)
            from result_table import ResultTable

            table_title = f"Search: {query}"
            if provider_name:
                table_title += f" [{provider_name}]"
            elif storage_backend:
                table_title += f" [{storage_backend}]"
            preserve_order = provider_name and provider_name.lower() in ('youtube', 'openlibrary')
            # Avoid setting source_command so @N does not re-run search-file;
            # preserve row order when needed.
            table = ResultTable(table_title).set_preserve_order(preserve_order)

            if provider_name:
                debug(f"[search_file] Attempting provider search with: {provider_name}")
                provider = get_search_provider(provider_name, config)
                if not provider:
                    log(f"Provider '{provider_name}' not available", file=sys.stderr)
                    db.update_worker_status(worker_id, 'error')
                    return 1
                debug(f"[search_file] Provider loaded, calling search with query: {query}")
                search_result = provider.search(query, limit=limit)
                debug(f"[search_file] Provider search returned {len(search_result)} results")

                # Apply post-search filter if one was set.
                if filter_after_search:
                    debug(f"[search_file] Applying filter: {filter_after_search}")
                    filtered_result = []
                    for item in search_result:
                        item_dict = item.to_dict() if hasattr(item, 'to_dict') else dict(item)
                        title_val = get_field(item_dict, 'title') or get_field(item_dict, 'name') or ""
                        if filter_after_search.lower() in str(title_val).lower():
                            filtered_result.append(item)
                    search_result = filtered_result
                    if not search_result:
                        log(f"No results match filter: '{filter_after_search}'", file=sys.stderr)
                        db.update_worker_status(worker_id, 'completed')
                        return 0
                    debug(f"[search_file] Filter matched {len(search_result)} results")
                    table.title = f"Filter: {filter_after_search}"

                for item in search_result:
                    table.add_result(item)
                    # Guard against providers that return plain dicts, matching
                    # the filter loop above.
                    item_dict = item.to_dict() if hasattr(item, 'to_dict') else dict(item)
                    results_list.append(item_dict)
                    ctx.emit(item_dict)
                ctx.set_last_result_table(table, results_list)
                debug(f"[search_file] Emitted {len(results_list)} results")
                db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2))
                db.update_worker_status(worker_id, 'completed')
                return 0

            from Store import Store
            storage = Store(config=config or {}, suppress_debug=True)
            backend_to_search = storage_backend or None
            if backend_to_search:
                if backend_to_search == "hydrus":
                    from API.HydrusNetwork import is_hydrus_available
                    if not is_hydrus_available(config or {}):
                        log("Backend 'hydrus' is not available (Hydrus service not running)", file=sys.stderr)
                        db.update_worker_status(worker_id, 'error')
                        return 1
                searched_backends.append(backend_to_search)
                target_backend = storage[backend_to_search]
                results = target_backend.search(query, limit=limit)
            else:
                from API.HydrusNetwork import is_hydrus_available
                hydrus_available = is_hydrus_available(config or {})
                all_results = []
                for backend_name in storage.list_searchable_backends():
                    if backend_name == "hydrus" and not hydrus_available:
                        continue
                    searched_backends.append(backend_name)
                    try:
                        backend_results = storage[backend_name].search(query, limit=limit - len(all_results))
                        if backend_results:
                            all_results.extend(backend_results)
                        if len(all_results) >= limit:
                            break
                    except Exception as exc:
                        log(f"Backend {backend_name} search failed: {exc}", file=sys.stderr)
                results = all_results[:limit]

            if not provider_name and not storage_backend:
                try:
                    debrid_provider = get_search_provider("debrid", config)
                    if debrid_provider and debrid_provider.validate():
                        remaining = max(0, limit - len(results)) if isinstance(results, list) else limit
                        if remaining > 0:
                            debrid_results = debrid_provider.search(query, limit=remaining)
                            if debrid_results:
                                if "debrid" not in searched_backends:
                                    searched_backends.append("debrid")
                                if results is None:
                                    results = []
                                results.extend(debrid_results)
                except Exception as exc:
                    log(f"Debrid provider search failed: {exc}", file=sys.stderr)

            def _format_storage_label(name: str) -> str:
                clean = str(name or "").strip()
                if not clean:
                    return "Unknown"
                return clean.replace("_", " ").title()

            storage_counts: OrderedDict[str, int] = OrderedDict((name, 0) for name in searched_backends)
            for item in results or []:
                origin = get_origin(item)
                if not origin:
                    continue
                key = str(origin).lower()
                if key not in storage_counts:
                    storage_counts[key] = 0
                storage_counts[key] += 1
            if storage_counts or query:
                display_counts = OrderedDict(
                    (_format_storage_label(name), count) for name, count in storage_counts.items()
                )
                summary_line = table.set_storage_summary(display_counts, query, inline=True)
                if summary_line:
                    table.title = summary_line

            if results:
                def _as_dict(obj: Any) -> Dict[str, Any]:
                    if isinstance(obj, dict):
                        return dict(obj)
                    if hasattr(obj, "to_dict") and callable(getattr(obj, "to_dict")):
                        return obj.to_dict()  # type: ignore[arg-type]
                    return {"title": str(obj)}

                for item in results:
                    item_dict = _as_dict(item)
                    if store_filter:
                        origin_val = str(get_origin(item_dict) or "").lower()
                        if store_filter != origin_val:
                            continue
                    normalized = self._ensure_storage_columns(item_dict)
                    # Make the store available for downstream cmdlets without
                    # rerunning search-file (hash, when present, is already a
                    # key on the normalized payload).
                    store_val = normalized.get("store") or get_origin(item_dict)
                    if store_val and not normalized.get("store"):
                        normalized["store"] = store_val
                    table.add_result(normalized)
                    results_list.append(normalized)
                    ctx.emit(normalized)
                ctx.set_last_result_table(table, results_list)
                db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2))
            else:
                log("No results found", file=sys.stderr)
                db.append_worker_stdout(worker_id, json.dumps([], indent=2))
            db.update_worker_status(worker_id, 'completed')
            return 0

        except Exception as exc:
            log(f"Search failed: {exc}", file=sys.stderr)
            import traceback
            traceback.print_exc(file=sys.stderr)
            if db:
                try:
                    db.update_worker_status(worker_id, 'error')
                except Exception:
                    pass
            return 1
        finally:
            if db:
                try:
                    db.close()
                except Exception:
                    pass

CMDLET = Search_File()
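

if __name__ == "__main__":  # pragma: no cover
    # Minimal manual-run sketch, not part of the cmdlet API: forwards CLI args
    # to run() with an empty config dict. The empty config is an assumption for
    # the example; get_local_storage_path({}) will normally find no library
    # root, so this prints an error and exits 1 unless real config is supplied.
    sys.exit(CMDLET.run(None, sys.argv[1:], {}))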