2026-01-11 18:56:26 -08:00
parent e70db8d8a6
commit 6076ea307b
7 changed files with 773 additions and 1476 deletions


@@ -11,11 +11,16 @@ import sys
import tempfile
from collections.abc import Iterable as IterableABC
from SYS.logger import log
from SYS.logger import log, debug
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set
from dataclasses import dataclass, field
from SYS import models
from SYS import pipeline as pipeline_context
from SYS.result_table import ResultTable
from SYS.rich_display import stderr_console as get_stderr_console
from rich.prompt import Confirm
from contextlib import AbstractContextManager, nullcontext
@dataclass
@@ -2405,6 +2410,117 @@ def coerce_to_pipe_object(
    return pipe_obj


def propagate_metadata(
    previous_items: Sequence[Any],
    new_items: Sequence[Any]
) -> List[Any]:
    """Merge metadata/tags from the previous pipeline stage into new items.

    Implements "sticky metadata": items generated by a transformation (download,
    convert) should inherit rich info (lyrics, art, tags) from their source items.

    Matching strategies:
        A. Hash match: if an input and an output share a hash, they are the same item.
        B. Index match: if the lists have the same length, assume a 1:1 mapping (heuristic).
        C. Explicit parent: if an output carries `parent_hash`, link it to the input with that hash.
    """
    if not previous_items or not new_items:
        return list(new_items)

    try:
        prev_normalized = [coerce_to_pipe_object(p) for p in previous_items]
    except Exception:
        return list(new_items)

    prev_by_hash: Dict[str, models.PipeObject] = {}
    for p_obj in prev_normalized:
        if p_obj.hash and p_obj.hash != "unknown":
            prev_by_hash[p_obj.hash] = p_obj

    normalized: List[models.PipeObject] = []
    # Pre-compute the length check used by the index-match heuristic.
    is_same_length = len(new_items) == len(prev_normalized)

    for i, item in enumerate(new_items):
        try:
            obj = coerce_to_pipe_object(item)
        except Exception:
            normalized.append(item)  # Should not happen given the coerce guards above.
            continue

        parent: Optional[models.PipeObject] = None

        # Strategy A: precise hash match
        if obj.hash in prev_by_hash:
            parent = prev_by_hash[obj.hash]

        # Strategy B: index match (heuristic)
        if not parent and is_same_length:
            parent = prev_normalized[i]

        # Strategy C: explicit parent hash
        if not parent and obj.parent_hash and obj.parent_hash in prev_by_hash:
            parent = prev_by_hash[obj.parent_hash]

        if parent:
            # 1. Tags: merge unique tags (case-insensitive comparison).
            if parent.tag:
                if not obj.tag:
                    obj.tag = list(parent.tag)
                else:
                    curr_tags = {str(t).lower() for t in obj.tag}
                    for pt in parent.tag:
                        if str(pt).lower() not in curr_tags:
                            obj.tag.append(pt)

            # 2. Metadata: copy keys the child is missing.
            if parent.metadata:
                if not obj.metadata:
                    obj.metadata = parent.metadata.copy()
                else:
                    for mk, mv in parent.metadata.items():
                        if mk not in obj.metadata:
                            obj.metadata[mk] = mv

            # 3. Source URL: propagate if missing.
            if parent.source_url and not obj.source_url:
                obj.source_url = parent.source_url
            elif parent.url and not obj.source_url and not obj.url:
                # If the parent had a URL and the child has none, treat it as the source.
                obj.source_url = parent.url

            # 4. Relationships: copy keys the child is missing.
            if parent.relationships:
                if not obj.relationships:
                    obj.relationships = parent.relationships.copy()
                else:
                    for rk, rv in parent.relationships.items():
                        if rk not in obj.relationships:
                            obj.relationships[rk] = rv

            # 5. Extra (notes, etc.): copy keys the child is missing.
            # Important for carrying the 'notes' payload (lyrics, captions) forward.
            if parent.extra:
                if not obj.extra:
                    obj.extra = parent.extra.copy()
                else:
                    # Shallow merge of keys; the 'notes' dict gets one extra level of merging.
                    for ek, ev in parent.extra.items():
                        if ek not in obj.extra:
                            obj.extra[ek] = ev
                        elif ek == "notes" and isinstance(ev, dict) and isinstance(obj.extra[ek], dict):
                            # Merge missing note keys without overwriting existing ones.
                            for nk, nv in ev.items():
                                if nk not in obj.extra[ek]:
                                    obj.extra[ek][nk] = nv

        normalized.append(obj)

    return normalized
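
# Illustrative usage sketch for propagate_metadata (reviewer note, not part of this
# commit). It assumes coerce_to_pipe_object() accepts dict-shaped items, which is a
# guess; in practice the inputs are whatever the upstream pipeline stage emitted.
previous_stage = [
    {"hash": "abc123", "tag": ["artist:someone"], "metadata": {"lyrics": "..."}},
]
downloaded = [
    {"hash": "def456"},  # same list length, so Strategy B (index match) links it to abc123
]
merged = propagate_metadata(previous_stage, downloaded)
# merged[0] keeps its own hash but inherits the parent's tags, metadata, and notes.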
def register_url_with_local_library(
    pipe_obj: models.PipeObject,
    config: Dict[str,
@@ -2518,12 +2634,12 @@ def resolve_tidal_manifest_path(item: Any) -> Optional[str]:
    if candidate_path:
        m = re.search(
            r"tidal:(?://)?track[\\/](\d+)",
            r"(tidal|hifi):(?://)?track[\\/](\d+)",
            str(candidate_path),
            flags=re.IGNORECASE,
        )
        if m:
            track_id = m.group(1)
            track_id = m.group(2)
    if (not already) and track_id is not None:
        try:
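
# Reviewer note on the pattern change above (illustrative, not part of this commit):
# group(1) now captures the scheme ("tidal" or "hifi"), so the track id moves to
# group(2), which is why the assignment above switches from m.group(1) to m.group(2).
import re
m_example = re.search(r"(tidal|hifi):(?://)?track[\\/](\d+)", "hifi://track/12345", flags=re.IGNORECASE)
assert m_example is not None and m_example.group(1) == "hifi" and m_example.group(2) == "12345"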
@@ -2706,3 +2822,327 @@ def resolve_tidal_manifest_path(item: Any) -> Optional[str]:
        return None
    return str(target_path)
def check_url_exists_in_storage(
    urls: Sequence[str],
    storage: Any,
    hydrus_available: bool,
    final_output_dir: Optional[Path] = None,
) -> bool:
    """Pre-flight check to see if URLs already exist in storage.

    Args:
        urls: List of URLs to check.
        storage: The storage interface.
        hydrus_available: Whether Hydrus is available.
        final_output_dir: Final output directory (a backend at this location is skipped).

    Returns:
        True if the check passed (no duplicates found, or the user chose to continue);
        False if the user declined, meaning the caller should stop.
    """
    if storage is None:
        debug("Bulk URL preflight skipped: storage unavailable")
        return True
    try:
        current_cmd_text = pipeline_context.get_current_command_text("")
    except Exception:
        current_cmd_text = ""
    try:
        stage_ctx = pipeline_context.get_stage_context()
    except Exception:
        stage_ctx = None

    in_pipeline = bool(stage_ctx is not None or ("|" in str(current_cmd_text or "")))
    if in_pipeline:
        try:
            cached_cmd = pipeline_context.load_value("preflight.url_duplicates.command", default="")
            cached_decision = pipeline_context.load_value("preflight.url_duplicates.continue", default=None)
        except Exception:
            cached_cmd = ""
            cached_decision = None
        if cached_decision is not None and str(cached_cmd or "") == str(current_cmd_text or ""):
            if bool(cached_decision):
                return True
            try:
                pipeline_context.request_pipeline_stop(reason="duplicate-url declined", exit_code=0)
            except Exception:
                pass
            return False
    unique_urls: List[str] = []
    for u in urls or []:
        s = str(u or "").strip()
        if s and s not in unique_urls:
            unique_urls.append(s)
    if len(unique_urls) == 0:
        return True

    try:
        from SYS.metadata import normalize_urls
    except Exception:
        normalize_urls = None  # type: ignore[assignment]

    def _httpish(value: str) -> bool:
        try:
            return bool(value) and (value.startswith("http://") or value.startswith("https://"))
        except Exception:
            return False

    url_needles: Dict[str, List[str]] = {}
    for u in unique_urls:
        needles: List[str] = []
        if normalize_urls is not None:
            try:
                needles.extend([n for n in (normalize_urls(u) or []) if isinstance(n, str)])
            except Exception:
                needles = []
        if not needles:
            needles = [u]
        filtered: List[str] = []
        for n in needles:
            n2 = str(n or "").strip()
            if not n2:
                continue
            if not _httpish(n2):
                continue
            if n2 not in filtered:
                filtered.append(n2)
        url_needles[u] = filtered if filtered else [u]
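    # Each original URL now maps to the http(s) needles worth querying against the backends below.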
    backend_names: List[str] = []
    try:
        backend_names_all = storage.list_searchable_backends()
    except Exception:
        backend_names_all = []
    for backend_name in backend_names_all:
        try:
            backend = storage[backend_name]
        except Exception:
            continue
        try:
            if str(backend_name).strip().lower() == "temp":
                continue
        except Exception:
            pass
        try:
            backend_location = getattr(backend, "_location", None)
            if backend_location and final_output_dir:
                backend_path = Path(str(backend_location)).expanduser().resolve()
                temp_path = Path(str(final_output_dir)).expanduser().resolve()
                if backend_path == temp_path:
                    continue
        except Exception:
            pass
        backend_names.append(backend_name)
    if not backend_names:
        debug("Bulk URL preflight skipped: no searchable backends")
        return True
    seen_pairs: set[tuple[str, str]] = set()
    matched_urls: set[str] = set()
    match_rows: List[Dict[str, Any]] = []
    max_rows = 200

    try:
        from Store.HydrusNetwork import HydrusNetwork
    except Exception:
        HydrusNetwork = None  # type: ignore
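    # Two lookup paths below: Hydrus backends get an exact URL-to-file lookup through the
    # client API, while every other backend falls back to a "url:<needle>" search.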
    for backend_name in backend_names:
        if len(match_rows) >= max_rows:
            break
        try:
            backend = storage[backend_name]
        except Exception:
            continue

        if HydrusNetwork is not None and isinstance(backend, HydrusNetwork):
            if not hydrus_available:
                continue
            client = getattr(backend, "_client", None)
            if client is None:
                continue
            for original_url, needles in url_needles.items():
                if len(match_rows) >= max_rows:
                    break
                if (original_url, str(backend_name)) in seen_pairs:
                    continue
                found_hash: Optional[str] = None
                found = False
                for needle in (needles or [])[:3]:
                    if not _httpish(needle):
                        continue
                    try:
                        from API.HydrusNetwork import HydrusRequestSpec
                        spec = HydrusRequestSpec(
                            method="GET",
                            endpoint="/add_urls/get_url_files",
                            query={"url": needle},
                        )
                        # Access the internal client safely if possible, else skip the check.
                        if hasattr(client, "_perform_request"):
                            response = client._perform_request(spec)
                            raw_hashes = None
                            if isinstance(response, dict):
                                raw_hashes = response.get("hashes") or response.get("file_hashes")
                                raw_ids = response.get("file_ids")
                                has_ids = isinstance(raw_ids, list) and len(raw_ids) > 0
                                has_hashes = isinstance(raw_hashes, list) and len(raw_hashes) > 0
                                if has_hashes:
                                    try:
                                        found_hash = str(raw_hashes[0]).strip()
                                    except Exception:
                                        found_hash = None
                                if has_ids or has_hashes:
                                    found = True
                                    break
                    except Exception:
                        continue
                if not found:
                    continue
                seen_pairs.add((original_url, str(backend_name)))
                matched_urls.add(original_url)
                display_row = {
                    "title": "(exists)",
                    "store": str(backend_name),
                    "hash": found_hash or "",
                    "url": original_url,
                    "columns": [
                        ("Title", "(exists)"),
                        ("Store", str(backend_name)),
                        ("Hash", found_hash or ""),
                        ("URL", original_url),
                    ],
                }
                match_rows.append(display_row)
            continue
        for original_url, needles in url_needles.items():
            if len(match_rows) >= max_rows:
                break
            if (original_url, str(backend_name)) in seen_pairs:
                continue
            backend_hits: List[Dict[str, Any]] = []
            for needle in (needles or [])[:3]:
                try:
                    backend_hits = backend.search(f"url:{needle}", limit=1) or []
                    if backend_hits:
                        break
                except Exception:
                    continue
            if not backend_hits:
                continue
            seen_pairs.add((original_url, str(backend_name)))
            matched_urls.add(original_url)
            hit = backend_hits[0]
            title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)"
            file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or ""
            try:
                from SYS.result_table import build_display_row
                extracted = build_display_row(hit, keys=["title", "store", "hash", "ext", "size"])
            except Exception:
                extracted = {}
            extracted["title"] = str(title)
            extracted["store"] = str(hit.get("store") or backend_name)
            extracted["hash"] = str(file_hash or "")
            ext = extracted.get("ext")
            size_val = extracted.get("size")
            display_row = {
                "title": str(title),
                "store": str(hit.get("store") or backend_name),
                "hash": str(file_hash or ""),
                "ext": str(ext or ""),
                "size": size_val,
                "url": original_url,
                "columns": [
                    ("Title", str(title)),
                    ("Store", str(hit.get("store") or backend_name)),
                    ("Hash", str(file_hash or "")),
                    ("Ext", str(ext or "")),
                    ("Size", size_val),
                    ("URL", original_url),
                ],
            }
            match_rows.append(display_row)
    if not match_rows:
        debug("Bulk URL preflight: no matches")
        return True

    table = ResultTable(f"URL already exists ({len(matched_urls)} url(s))", max_columns=10)
    table.set_no_choice(True)
    try:
        table.set_preserve_order(True)
    except Exception:
        pass
    for row in match_rows:
        table.add_result(row)
    try:
        pipeline_context.set_last_result_table_overlay(table, match_rows)
    except Exception:
        pass

    suspend = getattr(pipeline_context, "suspend_live_progress", None)
    cm: AbstractContextManager[Any] = nullcontext()
    if callable(suspend):
        try:
            maybe_cm = suspend()
            if maybe_cm is not None:
                cm = maybe_cm  # type: ignore[assignment]
        except Exception:
            cm = nullcontext()
    with cm:
        get_stderr_console().print(table)
        setattr(table, "_rendered_by_cmdlet", True)
        answered_yes = bool(Confirm.ask("Continue anyway?", default=False, console=get_stderr_console()))

    if in_pipeline:
        try:
            existing = pipeline_context.load_value("preflight", default=None)
        except Exception:
            existing = None
        preflight_cache: Dict[str, Any] = existing if isinstance(existing, dict) else {}
        url_dup_cache = preflight_cache.get("url_duplicates")
        if not isinstance(url_dup_cache, dict):
            url_dup_cache = {}
        url_dup_cache["command"] = str(current_cmd_text or "")
        url_dup_cache["continue"] = bool(answered_yes)
        preflight_cache["url_duplicates"] = url_dup_cache
        try:
            pipeline_context.store_value("preflight", preflight_cache)
        except Exception:
            pass

    if not answered_yes:
        if in_pipeline:
            try:
                pipeline_context.request_pipeline_stop(reason="duplicate-url declined", exit_code=0)
            except Exception:
                pass
        return False
    return True
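
# Illustrative call-site sketch (reviewer note, not part of this commit). The storage
# object, the Hydrus flag, and the output directory are hypothetical stand-ins for
# whatever the calling cmdlet already has in scope; only the argument shape comes from
# the signature above.
candidate_urls = ["https://example.com/item/1", "https://example.com/item/2"]
ok_to_proceed = check_url_exists_in_storage(
    candidate_urls,
    storage=storage_backend,                        # hypothetical storage interface
    hydrus_available=True,
    final_output_dir=Path("~/Downloads").expanduser(),
)
if not ok_to_proceed:
    ...  # caller stops here; the helper already requests a pipeline stop when run inside a pipeline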