Medios-Macina/cmdlets/search_file.py

"""Search-file cmdlet: Search for files by query, tags, size, type, duration, etc."""
from __future__ import annotations

from typing import Any, Dict, Sequence, List, Optional, Tuple, Callable
from fnmatch import fnmatchcase
from pathlib import Path
from dataclasses import dataclass, field
from collections import OrderedDict
import re
import json
import os
import sys

from helper.logger import log, debug
import shutil
import subprocess

from helper.file_storage import FileStorage
from helper.search_provider import get_provider, list_providers, SearchResult
from metadata import import_pending_sidecars

from . import register
from ._shared import Cmdlet, CmdletArg
import models
import pipeline as ctx

# Optional dependencies
try:
    import mutagen  # type: ignore
except ImportError:  # pragma: no cover
    mutagen = None  # type: ignore

try:
    from config import get_hydrus_url, resolve_output_dir
except Exception:  # pragma: no cover
    get_hydrus_url = None  # type: ignore
    resolve_output_dir = None  # type: ignore

try:
    from helper.hydrus import HydrusClient, HydrusRequestError
except ImportError:  # pragma: no cover
    HydrusClient = None  # type: ignore
    HydrusRequestError = RuntimeError  # type: ignore

try:
    from helper.utils import sha256_file
except ImportError:  # pragma: no cover
    sha256_file = None  # type: ignore

try:
    from helper.utils_constant import mime_maps
except ImportError:  # pragma: no cover
    mime_maps = {}  # type: ignore


# ============================================================================
# Data Classes (from helper/search.py)
# ============================================================================

@dataclass(slots=True)
class SearchRecord:
    path: str
    size_bytes: int | None = None
    duration_seconds: str | None = None
    tags: str | None = None
    hash_hex: str | None = None

    def as_dict(self) -> dict[str, str]:
        payload: dict[str, str] = {"path": self.path}
        if self.size_bytes is not None:
            payload["size"] = str(self.size_bytes)
        if self.duration_seconds:
            payload["duration"] = self.duration_seconds
        if self.tags:
            payload["tags"] = self.tags
        if self.hash_hex:
            payload["hash"] = self.hash_hex
        return payload


@dataclass
class ResultItem:
    origin: str
    title: str
    detail: str
    annotations: List[str]
    target: str
    media_kind: str = "other"
    hash_hex: Optional[str] = None
    columns: List[tuple[str, str]] = field(default_factory=list)
    tag_summary: Optional[str] = None
    duration_seconds: Optional[float] = None
    size_bytes: Optional[int] = None
    full_metadata: Optional[Dict[str, Any]] = None
    tags: Optional[set[str]] = field(default_factory=set)
    relationships: Optional[List[str]] = field(default_factory=list)
    known_urls: Optional[List[str]] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        payload: Dict[str, Any] = {
            "title": self.title,
        }

        # Always include these core fields for downstream cmdlets (get-file, download-data, etc)
        payload["origin"] = self.origin
        payload["target"] = self.target
        payload["media_kind"] = self.media_kind

        # Always include full_metadata if present (needed by download-data, etc)
        # This is NOT for display, but for downstream processing
        if self.full_metadata:
            payload["full_metadata"] = self.full_metadata

        # Include columns if defined (result renderer will use these for display)
        if self.columns:
            payload["columns"] = list(self.columns)
        else:
            # If no columns, include the detail for backwards compatibility
            payload["detail"] = self.detail
            payload["annotations"] = list(self.annotations)

        # Include optional fields
        if self.hash_hex:
            payload["hash"] = self.hash_hex
        if self.tag_summary:
            payload["tags"] = self.tag_summary
        if self.tags:
            payload["tags_set"] = list(self.tags)
        if self.relationships:
            payload["relationships"] = self.relationships
        if self.known_urls:
            payload["known_urls"] = self.known_urls
        return payload


STORAGE_ORIGINS = {"local", "hydrus", "debrid"}


def _normalize_extension(ext_value: Any) -> str:
    """Sanitize extension strings to alphanumerics and cap at 5 chars."""
    ext = str(ext_value or "").strip().lstrip(".")

    # Stop at common separators to avoid dragging status text into the extension
    for sep in (" ", "|", "(", "[", "{", ",", ";"):
        if sep in ext:
            ext = ext.split(sep, 1)[0]
            break

    # If there are multiple dots, take the last token as the extension
    if "." in ext:
        ext = ext.split(".")[-1]

    # Keep only alphanumeric characters and enforce max length
    ext = "".join(ch for ch in ext if ch.isalnum())
    return ext[:5]


def _ensure_storage_columns(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Attach Title/Store columns for storage-origin results to keep CLI display compact."""
    origin_value = str(payload.get("origin") or payload.get("source") or "").lower()
    if origin_value not in STORAGE_ORIGINS:
        return payload

    title = payload.get("title") or payload.get("name") or payload.get("target") or payload.get("path") or "Result"
    store_label = payload.get("origin") or payload.get("source") or origin_value

    # Handle extension
    extension = _normalize_extension(payload.get("ext", ""))
    if not extension and title:
        path_obj = Path(str(title))
        if path_obj.suffix:
            extension = _normalize_extension(path_obj.suffix.lstrip('.'))
            title = path_obj.stem

    # Handle size as integer MB (header will include units)
    size_val = payload.get("size") or payload.get("size_bytes")
    size_str = ""
    if size_val is not None:
        try:
            size_bytes = int(size_val)
            size_mb = int(size_bytes / (1024 * 1024))
            size_str = str(size_mb)
        except (ValueError, TypeError):
            size_str = str(size_val)

    normalized = dict(payload)
    normalized["columns"] = [
        ("Title", str(title)),
        ("Ext", str(extension)),
        ("Store", str(store_label)),
        ("Size(Mb)", str(size_str)),
    ]
    return normalized


CMDLET = Cmdlet(
    name="search-file",
    summary="Unified search cmdlet for storage (Hydrus, Local) and providers (Debrid, LibGen, OpenLibrary, Soulseek).",
    usage="search-file [query] [-tag TAG] [-size >100MB|<50MB] [-type audio|video|image] [-duration >10:00] [-storage BACKEND] [-provider PROVIDER]",
    args=[
        CmdletArg("query", description="Search query string"),
        CmdletArg("tag", description="Filter by tag (can be used multiple times)"),
        CmdletArg("size", description="Filter by size: >100MB, <50MB, =10MB"),
        CmdletArg("type", description="Filter by type: audio, video, image, document"),
        CmdletArg("duration", description="Filter by duration: >10:00, <1:30:00"),
        CmdletArg("limit", type="integer", description="Limit results (default: 45)"),
        CmdletArg("storage", description="Search storage backend: hydrus, local (default: all searchable storages)"),
        CmdletArg("provider", description="Search provider: libgen, openlibrary, soulseek, debrid, local (overrides -storage)"),
    ],
    details=[
        "Search across storage (Hydrus, Local) and providers (Debrid, LibGen, OpenLibrary, Soulseek)",
        "Use -provider to search a specific source, or -storage to search file backends",
        "Filter results by: tag, size, type, duration",
        "Results can be piped to other commands",
        "Examples:",
        "search-file foo                                      # Search all file backends",
        "search-file -provider libgen 'python programming'   # Search LibGen books",
        "search-file -provider debrid 'movie'                # Search AllDebrid magnets",
        "search-file 'music' -provider soulseek               # Search Soulseek P2P",
        "search-file -provider openlibrary 'tolkien'         # Search OpenLibrary",
        "search-file song -storage hydrus -type audio        # Search only Hydrus audio",
        "search-file movie -tag action -provider debrid      # Debrid with filters",
    ],
)


@register(["search-file", "search"])
def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
    """Search across multiple providers: Hydrus, Local, Debrid, LibGen, etc."""
    args_list = [str(arg) for arg in (args or [])]

    # Parse arguments
    query = ""
    tag_filters: List[str] = []
    size_filter: Optional[Tuple[str, int]] = None
    duration_filter: Optional[Tuple[str, float]] = None
    type_filter: Optional[str] = None
    storage_backend: Optional[str] = None
    provider_name: Optional[str] = None
    limit = 45
    searched_backends: List[str] = []

    # Simple argument parsing
    i = 0
    while i < len(args_list):
        arg = args_list[i]
        low = arg.lower()

        if low in {"-provider", "--provider"} and i + 1 < len(args_list):
            provider_name = args_list[i + 1].lower()
            i += 2
        elif low in {"-storage", "--storage"} and i + 1 < len(args_list):
            storage_backend = args_list[i + 1].lower()
            i += 2
        elif low in {"-tag", "--tag"} and i + 1 < len(args_list):
            tag_filters.append(args_list[i + 1])
            i += 2
        elif low in {"-limit", "--limit"} and i + 1 < len(args_list):
            try:
                limit = int(args_list[i + 1])
            except ValueError:
                limit = 100
            i += 2
        elif low in {"-type", "--type"} and i + 1 < len(args_list):
            type_filter = args_list[i + 1].lower()
            i += 2
        elif not arg.startswith("-"):
            if query:
                query += " " + arg
            else:
                query = arg
            i += 1
        else:
            i += 1

    # Extract store: filter tokens (works with commas or whitespace) and clean query for backends
    store_filter: Optional[str] = None
    if query:
        match = re.search(r"\bstore:([^\s,]+)", query, flags=re.IGNORECASE)
        if match:
            store_filter = match.group(1).strip().lower() or None
        # Remove any store: tokens so downstream backends see only the actual query
        query = re.sub(r"\s*[,]?\s*store:[^\s,]+", " ", query, flags=re.IGNORECASE)
        query = re.sub(r"\s{2,}", " ", query)
        query = query.strip().strip(',')

    # Debrid is provider-only now
    if storage_backend and storage_backend.lower() == "debrid":
        log("Use -provider debrid instead of -storage debrid (debrid is provider-only)", file=sys.stderr)
        return 1

    # If store: was provided without explicit -storage/-provider, prefer that backend
    if store_filter and not provider_name and not storage_backend:
        if store_filter in {"hydrus", "local", "debrid"}:
            storage_backend = store_filter

    # Handle piped input (e.g. from @N selection) if query is empty
    if not query and result:
        # If result is a list, take the first item
        actual_result = result[0] if isinstance(result, list) and result else result

        # Helper to get field
        def get_field(obj: Any, field: str) -> Any:
            return getattr(obj, field, None) or (obj.get(field) if isinstance(obj, dict) else None)

        origin = get_field(actual_result, 'origin')
        target = get_field(actual_result, 'target')

        # Special handling for Bandcamp artist/album drill-down
        if origin == 'bandcamp' and target:
            query = target
            if not provider_name:
                provider_name = 'bandcamp'

        # Generic URL handling
        elif target and str(target).startswith(('http://', 'https://')):
            query = target
            # Try to infer provider from URL if not set
            if not provider_name:
                if 'bandcamp.com' in target:
                    provider_name = 'bandcamp'
                elif 'youtube.com' in target or 'youtu.be' in target:
                    provider_name = 'youtube'

    if not query:
        log("Provide a search query", file=sys.stderr)
        return 1

    # Initialize worker for this search command
    from helper.local_library import LocalLibraryDB
    from config import get_local_storage_path
    import uuid
    worker_id = str(uuid.uuid4())
    library_root = get_local_storage_path(config or {})
    if not library_root:
        log("No library root configured", file=sys.stderr)
        return 1

    db = None
    try:
        db = LocalLibraryDB(library_root)
        db.insert_worker(
            worker_id,
            "search",
            title=f"Search: {query}",
            description=f"Query: {query}",
            pipe=ctx.get_current_command_text()
        )

        results_list = []
        import result_table
        import importlib
        importlib.reload(result_table)
        from result_table import ResultTable

        # Create ResultTable for display
        table_title = f"Search: {query}"
        if provider_name:
            table_title += f" [{provider_name}]"
        elif storage_backend:
            table_title += f" [{storage_backend}]"

        table = ResultTable(table_title)
        table.set_source_command("search-file", args_list)

        # Try to search using provider (libgen, soulseek, debrid, openlibrary)
        if provider_name:
            debug(f"[search_file] Attempting provider search with: {provider_name}")
            provider = get_provider(provider_name, config)
            if not provider:
                log(f"Provider '{provider_name}' not available", file=sys.stderr)
                db.update_worker_status(worker_id, 'error')
                return 1

            debug(f"[search_file] Provider loaded, calling search with query: {query}")
            search_result = provider.search(query, limit=limit)
            debug(f"[search_file] Provider search returned {len(search_result)} results")

            for item in search_result:
                # Add to table
                table.add_result(item)

                # Emit to pipeline
                item_dict = item.to_dict()
                results_list.append(item_dict)
                ctx.emit(item_dict)

            # Set the result table in context for TUI/CLI display
            ctx.set_last_result_table(table, results_list)

            debug(f"[search_file] Emitted {len(results_list)} results")

            # Write results to worker stdout
            db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2))
            db.update_worker_status(worker_id, 'completed')
            return 0

        # Otherwise search using storage backends (Hydrus, Local)
        from helper.file_storage import FileStorage
        storage = FileStorage(config=config or {})

        backend_to_search = storage_backend or None
        if backend_to_search:
            # Check if requested backend is available
            if backend_to_search == "hydrus":
                from helper.hydrus import is_hydrus_available
                if not is_hydrus_available(config or {}):
                    log(f"Backend 'hydrus' is not available (Hydrus service not running)", file=sys.stderr)
                    db.update_worker_status(worker_id, 'error')
                    return 1
            searched_backends.append(backend_to_search)
            if not storage.supports_search(backend_to_search):
                log(f"Backend '{backend_to_search}' does not support searching", file=sys.stderr)
                db.update_worker_status(worker_id, 'error')
                return 1
            results = storage[backend_to_search].search(query, limit=limit)
        else:
            # Search all searchable backends, but skip hydrus if unavailable
            from helper.hydrus import is_hydrus_available
            hydrus_available = is_hydrus_available(config or {})

            all_results = []
            for backend_name in storage.list_searchable_backends():
                # Skip hydrus if not available
                if backend_name == "hydrus" and not hydrus_available:
                    continue
                searched_backends.append(backend_name)
                try:
                    backend_results = storage[backend_name].search(query, limit=limit - len(all_results))
                    if backend_results:
                        all_results.extend(backend_results)
                    if len(all_results) >= limit:
                        break
                except Exception as exc:
                    log(f"Backend {backend_name} search failed: {exc}", file=sys.stderr)
            results = all_results[:limit]

        # Also query Debrid provider by default (provider-only, but keep legacy coverage when no explicit provider given)
        if not provider_name and not storage_backend:
            try:
                debrid_provider = get_provider("debrid", config)
                if debrid_provider and debrid_provider.validate():
                    remaining = max(0, limit - len(results)) if isinstance(results, list) else limit
                    if remaining > 0:
                        debrid_results = debrid_provider.search(query, limit=remaining)
                        if debrid_results:
                            if "debrid" not in searched_backends:
                                searched_backends.append("debrid")
                            if results is None:
                                results = []
                            results.extend(debrid_results)
            except Exception as exc:
                log(f"Debrid provider search failed: {exc}", file=sys.stderr)

        def _format_storage_label(name: str) -> str:
            clean = str(name or "").strip()
            if not clean:
                return "Unknown"
            return clean.replace("_", " ").title()

        storage_counts: OrderedDict[str, int] = OrderedDict((name, 0) for name in searched_backends)
        for item in results or []:
            origin = getattr(item, 'origin', None)
            if origin is None and isinstance(item, dict):
                origin = item.get('origin') or item.get('source')
            if not origin:
                continue
            key = str(origin).lower()
            if key not in storage_counts:
                storage_counts[key] = 0
            storage_counts[key] += 1

        if storage_counts or query:
            display_counts = OrderedDict((_format_storage_label(name), count) for name, count in storage_counts.items())
            summary_line = table.set_storage_summary(display_counts, query, inline=True)
            if summary_line:
                table.title = summary_line

        # Emit results and collect for workers table
        if results:
            for item in results:
                def _as_dict(obj: Any) -> Dict[str, Any]:
                    if isinstance(obj, dict):
                        return dict(obj)
                    if hasattr(obj, "to_dict") and callable(getattr(obj, "to_dict")):
                        return obj.to_dict()  # type: ignore[arg-type]
                    return {"title": str(obj)}

                item_dict = _as_dict(item)
                if store_filter:
                    origin_val = str(item_dict.get("origin") or item_dict.get("source") or "").lower()
                    if store_filter != origin_val:
                        continue
                normalized = _ensure_storage_columns(item_dict)
                # Add to table using normalized columns to avoid extra fields (e.g., Tags/Name)
                table.add_result(normalized)

                results_list.append(normalized)
                ctx.emit(normalized)

            # Set the result table in context for TUI/CLI display
            ctx.set_last_result_table(table, results_list)

            # Write results to worker stdout
            db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2))
        else:
            log("No results found", file=sys.stderr)
            db.append_worker_stdout(worker_id, json.dumps([], indent=2))

        db.update_worker_status(worker_id, 'completed')
        return 0

    except Exception as exc:
        log(f"Search failed: {exc}", file=sys.stderr)
        import traceback
        traceback.print_exc(file=sys.stderr)
        if db:
            try:
                db.update_worker_status(worker_id, 'error')
            except Exception:
                pass
        return 1

    finally:
        # Always close the database connection
        if db:
            try:
                db.close()
            except Exception:
                pass