2026-01-11 18:56:26 -08:00
parent e70db8d8a6
commit 6076ea307b
7 changed files with 773 additions and 1476 deletions


@@ -11,11 +11,16 @@ import sys
import tempfile
from collections.abc import Iterable as IterableABC
from SYS.logger import log
from SYS.logger import log, debug
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set
from dataclasses import dataclass, field
from SYS import models
from SYS import pipeline as pipeline_context
from SYS.result_table import ResultTable
from SYS.rich_display import stderr_console as get_stderr_console
from rich.prompt import Confirm
from contextlib import AbstractContextManager, nullcontext
@dataclass
@@ -2405,6 +2410,117 @@ def coerce_to_pipe_object(
    return pipe_obj


def propagate_metadata(
    previous_items: Sequence[Any],
    new_items: Sequence[Any]
) -> List[Any]:
    """Merge metadata/tags from the previous pipeline stage into new items.

    Implements "sticky metadata": items generated by a transformation (download,
    convert) should inherit rich info (lyrics, art, tags) from their source items.

    Matching strategies:
        A. Hash match: if an input and an output share a hash, they are the same item.
        B. Index match: if the lists have the same length, assume a 1:1 mapping (heuristic).
        C. Explicit parent: if an output carries `parent_hash`, link it to the input with that hash.
    """
    if not previous_items or not new_items:
        return list(new_items)

    try:
        prev_normalized = [coerce_to_pipe_object(p) for p in previous_items]
    except Exception:
        return list(new_items)

    prev_by_hash: Dict[str, models.PipeObject] = {}
    for p_obj in prev_normalized:
        if p_obj.hash and p_obj.hash != "unknown":
            prev_by_hash[p_obj.hash] = p_obj

    normalized: List[models.PipeObject] = []
    # Pre-compute the length check used by the index-match heuristic.
    is_same_length = len(new_items) == len(prev_normalized)

    for i, item in enumerate(new_items):
        try:
            obj = coerce_to_pipe_object(item)
        except Exception:
            normalized.append(item)  # Should not happen given the coerce guards above.
            continue

        parent: Optional[models.PipeObject] = None

        # Strategy A: precise hash match
        if obj.hash in prev_by_hash:
            parent = prev_by_hash[obj.hash]

        # Strategy B: index match (heuristic)
        if not parent and is_same_length:
            parent = prev_normalized[i]

        # Strategy C: explicit parent hash
        if not parent and obj.parent_hash and obj.parent_hash in prev_by_hash:
            parent = prev_by_hash[obj.parent_hash]

        if parent:
            # 1. Tags: merge unique tags (case-insensitive comparison).
            if parent.tag:
                if not obj.tag:
                    obj.tag = list(parent.tag)
                else:
                    curr_tags = {str(t).lower() for t in obj.tag}
                    for pt in parent.tag:
                        if str(pt).lower() not in curr_tags:
                            obj.tag.append(pt)

            # 2. Metadata: copy keys the child is missing.
            if parent.metadata:
                if not obj.metadata:
                    obj.metadata = parent.metadata.copy()
                else:
                    for mk, mv in parent.metadata.items():
                        if mk not in obj.metadata:
                            obj.metadata[mk] = mv

            # 3. Source URL: propagate if missing.
            if parent.source_url and not obj.source_url:
                obj.source_url = parent.source_url
            elif parent.url and not obj.source_url and not obj.url:
                # If the parent had a URL and the child has none, treat it as the source.
                obj.source_url = parent.url

            # 4. Relationships: copy keys the child is missing.
            if parent.relationships:
                if not obj.relationships:
                    obj.relationships = parent.relationships.copy()
                else:
                    for rk, rv in parent.relationships.items():
                        if rk not in obj.relationships:
                            obj.relationships[rk] = rv

            # 5. Extra (notes, etc.): copy keys the child is missing.
            # Important for carrying the 'notes' payload (lyrics, captions) forward.
            if parent.extra:
                if not obj.extra:
                    obj.extra = parent.extra.copy()
                else:
                    # Shallow merge of keys; the 'notes' dict gets one extra level of merging.
                    for ek, ev in parent.extra.items():
                        if ek not in obj.extra:
                            obj.extra[ek] = ev
                        elif ek == "notes" and isinstance(ev, dict) and isinstance(obj.extra[ek], dict):
                            # Merge missing note keys without overwriting existing ones.
                            for nk, nv in ev.items():
                                if nk not in obj.extra[ek]:
                                    obj.extra[ek][nk] = nv

        normalized.append(obj)

    return normalized
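
# Illustrative usage sketch for propagate_metadata (reviewer note, not part of this
# commit). It assumes coerce_to_pipe_object() accepts dict-shaped items, which is a
# guess; in practice the inputs are whatever the upstream pipeline stage emitted.
previous_stage = [
    {"hash": "abc123", "tag": ["artist:someone"], "metadata": {"lyrics": "..."}},
]
downloaded = [
    {"hash": "def456"},  # same list length, so Strategy B (index match) links it to abc123
]
merged = propagate_metadata(previous_stage, downloaded)
# merged[0] keeps its own hash but inherits the parent's tags, metadata, and notes.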
def register_url_with_local_library(
    pipe_obj: models.PipeObject,
    config: Dict[str,
@@ -2518,12 +2634,12 @@ def resolve_tidal_manifest_path(item: Any) -> Optional[str]:
    if candidate_path:
        m = re.search(
            r"tidal:(?://)?track[\\/](\d+)",
            r"(tidal|hifi):(?://)?track[\\/](\d+)",
            str(candidate_path),
            flags=re.IGNORECASE,
        )
        if m:
            track_id = m.group(1)
            track_id = m.group(2)
    if (not already) and track_id is not None:
        try:
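
# Reviewer note on the pattern change above (illustrative, not part of this commit):
# group(1) now captures the scheme ("tidal" or "hifi"), so the track id moves to
# group(2), which is why the assignment above switches from m.group(1) to m.group(2).
import re
m_example = re.search(r"(tidal|hifi):(?://)?track[\\/](\d+)", "hifi://track/12345", flags=re.IGNORECASE)
assert m_example is not None and m_example.group(1) == "hifi" and m_example.group(2) == "12345"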
@@ -2706,3 +2822,327 @@ def resolve_tidal_manifest_path(item: Any) -> Optional[str]:
        return None
    return str(target_path)
def check_url_exists_in_storage(
    urls: Sequence[str],
    storage: Any,
    hydrus_available: bool,
    final_output_dir: Optional[Path] = None,
) -> bool:
    """Pre-flight check to see if URLs already exist in storage.

    Args:
        urls: List of URLs to check.
        storage: The storage interface.
        hydrus_available: Whether Hydrus is available.
        final_output_dir: Final output directory (a backend at this location is skipped).

    Returns:
        True if the check passed (no duplicates found, or the user chose to continue);
        False if the user declined, meaning the caller should stop.
    """
    if storage is None:
        debug("Bulk URL preflight skipped: storage unavailable")
        return True
    try:
        current_cmd_text = pipeline_context.get_current_command_text("")
    except Exception:
        current_cmd_text = ""
    try:
        stage_ctx = pipeline_context.get_stage_context()
    except Exception:
        stage_ctx = None

    in_pipeline = bool(stage_ctx is not None or ("|" in str(current_cmd_text or "")))
    if in_pipeline:
        try:
            cached_cmd = pipeline_context.load_value("preflight.url_duplicates.command", default="")
            cached_decision = pipeline_context.load_value("preflight.url_duplicates.continue", default=None)
        except Exception:
            cached_cmd = ""
            cached_decision = None
        if cached_decision is not None and str(cached_cmd or "") == str(current_cmd_text or ""):
            if bool(cached_decision):
                return True
            try:
                pipeline_context.request_pipeline_stop(reason="duplicate-url declined", exit_code=0)
            except Exception:
                pass
            return False
    unique_urls: List[str] = []
    for u in urls or []:
        s = str(u or "").strip()
        if s and s not in unique_urls:
            unique_urls.append(s)
    if len(unique_urls) == 0:
        return True

    try:
        from SYS.metadata import normalize_urls
    except Exception:
        normalize_urls = None  # type: ignore[assignment]

    def _httpish(value: str) -> bool:
        try:
            return bool(value) and (value.startswith("http://") or value.startswith("https://"))
        except Exception:
            return False

    url_needles: Dict[str, List[str]] = {}
    for u in unique_urls:
        needles: List[str] = []
        if normalize_urls is not None:
            try:
                needles.extend([n for n in (normalize_urls(u) or []) if isinstance(n, str)])
            except Exception:
                needles = []
        if not needles:
            needles = [u]
        filtered: List[str] = []
        for n in needles:
            n2 = str(n or "").strip()
            if not n2:
                continue
            if not _httpish(n2):
                continue
            if n2 not in filtered:
                filtered.append(n2)
        url_needles[u] = filtered if filtered else [u]
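    # Each original URL now maps to the http(s) needles worth querying against the backends below.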
    backend_names: List[str] = []
    try:
        backend_names_all = storage.list_searchable_backends()
    except Exception:
        backend_names_all = []
    for backend_name in backend_names_all:
        try:
            backend = storage[backend_name]
        except Exception:
            continue
        try:
            if str(backend_name).strip().lower() == "temp":
                continue
        except Exception:
            pass
        try:
            backend_location = getattr(backend, "_location", None)
            if backend_location and final_output_dir:
                backend_path = Path(str(backend_location)).expanduser().resolve()
                temp_path = Path(str(final_output_dir)).expanduser().resolve()
                if backend_path == temp_path:
                    continue
        except Exception:
            pass
        backend_names.append(backend_name)
    if not backend_names:
        debug("Bulk URL preflight skipped: no searchable backends")
        return True
    seen_pairs: set[tuple[str, str]] = set()
    matched_urls: set[str] = set()
    match_rows: List[Dict[str, Any]] = []
    max_rows = 200

    try:
        from Store.HydrusNetwork import HydrusNetwork
    except Exception:
        HydrusNetwork = None  # type: ignore
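    # Two lookup paths below: Hydrus backends get an exact URL-to-file lookup through the
    # client API, while every other backend falls back to a "url:<needle>" search.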
    for backend_name in backend_names:
        if len(match_rows) >= max_rows:
            break
        try:
            backend = storage[backend_name]
        except Exception:
            continue

        if HydrusNetwork is not None and isinstance(backend, HydrusNetwork):
            if not hydrus_available:
                continue
            client = getattr(backend, "_client", None)
            if client is None:
                continue
            for original_url, needles in url_needles.items():
                if len(match_rows) >= max_rows:
                    break
                if (original_url, str(backend_name)) in seen_pairs:
                    continue
                found_hash: Optional[str] = None
                found = False
                for needle in (needles or [])[:3]:
                    if not _httpish(needle):
                        continue
                    try:
                        from API.HydrusNetwork import HydrusRequestSpec
                        spec = HydrusRequestSpec(
                            method="GET",
                            endpoint="/add_urls/get_url_files",
                            query={"url": needle},
                        )
                        # Access the internal client safely if possible, else skip the check.
                        if hasattr(client, "_perform_request"):
                            response = client._perform_request(spec)
                            raw_hashes = None
                            if isinstance(response, dict):
                                raw_hashes = response.get("hashes") or response.get("file_hashes")
                                raw_ids = response.get("file_ids")
                                has_ids = isinstance(raw_ids, list) and len(raw_ids) > 0
                                has_hashes = isinstance(raw_hashes, list) and len(raw_hashes) > 0
                                if has_hashes:
                                    try:
                                        found_hash = str(raw_hashes[0]).strip()
                                    except Exception:
                                        found_hash = None
                                if has_ids or has_hashes:
                                    found = True
                                    break
                    except Exception:
                        continue
                if not found:
                    continue
                seen_pairs.add((original_url, str(backend_name)))
                matched_urls.add(original_url)
                display_row = {
                    "title": "(exists)",
                    "store": str(backend_name),
                    "hash": found_hash or "",
                    "url": original_url,
                    "columns": [
                        ("Title", "(exists)"),
                        ("Store", str(backend_name)),
                        ("Hash", found_hash or ""),
                        ("URL", original_url),
                    ],
                }
                match_rows.append(display_row)
            continue
        for original_url, needles in url_needles.items():
            if len(match_rows) >= max_rows:
                break
            if (original_url, str(backend_name)) in seen_pairs:
                continue
            backend_hits: List[Dict[str, Any]] = []
            for needle in (needles or [])[:3]:
                try:
                    backend_hits = backend.search(f"url:{needle}", limit=1) or []
                    if backend_hits:
                        break
                except Exception:
                    continue
            if not backend_hits:
                continue
            seen_pairs.add((original_url, str(backend_name)))
            matched_urls.add(original_url)
            hit = backend_hits[0]
            title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)"
            file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or ""
            try:
                from SYS.result_table import build_display_row
                extracted = build_display_row(hit, keys=["title", "store", "hash", "ext", "size"])
            except Exception:
                extracted = {}
            extracted["title"] = str(title)
            extracted["store"] = str(hit.get("store") or backend_name)
            extracted["hash"] = str(file_hash or "")
            ext = extracted.get("ext")
            size_val = extracted.get("size")
            display_row = {
                "title": str(title),
                "store": str(hit.get("store") or backend_name),
                "hash": str(file_hash or ""),
                "ext": str(ext or ""),
                "size": size_val,
                "url": original_url,
                "columns": [
                    ("Title", str(title)),
                    ("Store", str(hit.get("store") or backend_name)),
                    ("Hash", str(file_hash or "")),
                    ("Ext", str(ext or "")),
                    ("Size", size_val),
                    ("URL", original_url),
                ],
            }
            match_rows.append(display_row)
    if not match_rows:
        debug("Bulk URL preflight: no matches")
        return True

    table = ResultTable(f"URL already exists ({len(matched_urls)} url(s))", max_columns=10)
    table.set_no_choice(True)
    try:
        table.set_preserve_order(True)
    except Exception:
        pass
    for row in match_rows:
        table.add_result(row)
    try:
        pipeline_context.set_last_result_table_overlay(table, match_rows)
    except Exception:
        pass

    suspend = getattr(pipeline_context, "suspend_live_progress", None)
    cm: AbstractContextManager[Any] = nullcontext()
    if callable(suspend):
        try:
            maybe_cm = suspend()
            if maybe_cm is not None:
                cm = maybe_cm  # type: ignore[assignment]
        except Exception:
            cm = nullcontext()
    with cm:
        get_stderr_console().print(table)
        setattr(table, "_rendered_by_cmdlet", True)
        answered_yes = bool(Confirm.ask("Continue anyway?", default=False, console=get_stderr_console()))

    if in_pipeline:
        try:
            existing = pipeline_context.load_value("preflight", default=None)
        except Exception:
            existing = None
        preflight_cache: Dict[str, Any] = existing if isinstance(existing, dict) else {}
        url_dup_cache = preflight_cache.get("url_duplicates")
        if not isinstance(url_dup_cache, dict):
            url_dup_cache = {}
        url_dup_cache["command"] = str(current_cmd_text or "")
        url_dup_cache["continue"] = bool(answered_yes)
        preflight_cache["url_duplicates"] = url_dup_cache
        try:
            pipeline_context.store_value("preflight", preflight_cache)
        except Exception:
            pass

    if not answered_yes:
        if in_pipeline:
            try:
                pipeline_context.request_pipeline_stop(reason="duplicate-url declined", exit_code=0)
            except Exception:
                pass
        return False
    return True
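
# Illustrative call-site sketch (reviewer note, not part of this commit). The storage
# object, the Hydrus flag, and the output directory are hypothetical stand-ins for
# whatever the calling cmdlet already has in scope; only the argument shape comes from
# the signature above.
candidate_urls = ["https://example.com/item/1", "https://example.com/item/2"]
ok_to_proceed = check_url_exists_in_storage(
    candidate_urls,
    storage=storage_backend,                        # hypothetical storage interface
    hydrus_available=True,
    final_output_dir=Path("~/Downloads").expanduser(),
)
if not ok_to_proceed:
    ...  # caller stops here; the helper already requests a pipeline stop when run inside a pipeline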