kl
@@ -11,11 +11,16 @@ import sys
import tempfile
from collections.abc import Iterable as IterableABC

-from SYS.logger import log
+from SYS.logger import log, debug
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set
from dataclasses import dataclass, field
from SYS import models
from SYS import pipeline as pipeline_context
from SYS.result_table import ResultTable
from SYS.rich_display import stderr_console as get_stderr_console
from rich.prompt import Confirm
from contextlib import AbstractContextManager, nullcontext


@dataclass
@@ -2405,6 +2410,117 @@ def coerce_to_pipe_object(
    return pipe_obj


def propagate_metadata(
    previous_items: Sequence[Any],
    new_items: Sequence[Any]
) -> List[Any]:
"""Merge metadata/tags from previous pipeline stage into new items.
|
||||
|
||||
Implements "sticky metadata": items generated by a transformation (download, convert)
|
||||
should inherit rich info (lyrics, art, tags) from their source.
|
||||
|
||||
Strategies:
|
||||
A. Hash Match: If inputs/outputs share a hash, they are the same item.
|
||||
B. Index Match: If lists are same length, assume 1:1 mapping (heuristic).
|
||||
C. Explicit Parent: If output has `parent_hash`, link to input with that hash.
|
||||
"""
|
||||
    if not previous_items or not new_items:
        return list(new_items)

    try:
        prev_normalized = [coerce_to_pipe_object(p) for p in previous_items]
    except Exception:
        return list(new_items)

    prev_by_hash: Dict[str, models.PipeObject] = {}
    for p_obj in prev_normalized:
        if p_obj.hash and p_obj.hash != "unknown":
            prev_by_hash[p_obj.hash] = p_obj

    normalized: List[models.PipeObject] = []

    # Pre-calculate length matching for heuristic
    is_same_length = len(new_items) == len(prev_normalized)

    for i, item in enumerate(new_items):
        try:
            obj = coerce_to_pipe_object(item)
        except Exception:
            normalized.append(item)  # Should not happen given coerce guards
            continue

        parent: Optional[models.PipeObject] = None

        # Strategy A: Precise Hash Match
        if obj.hash in prev_by_hash:
            parent = prev_by_hash[obj.hash]

        # Strategy B: Index Match (Heuristic)
        if not parent and is_same_length:
            parent = prev_normalized[i]

        # Strategy C: Explicit Parent Hash
        if not parent and obj.parent_hash and obj.parent_hash in prev_by_hash:
            parent = prev_by_hash[obj.parent_hash]

        if parent:
            # 1. Tags: Merge unique tags
            if parent.tag:
                if not obj.tag:
                    obj.tag = list(parent.tag)
                else:
                    curr_tags = {str(t).lower() for t in obj.tag}
                    for pt in parent.tag:
                        if str(pt).lower() not in curr_tags:
                            obj.tag.append(pt)

            # 2. Metadata: Merge missing keys
            if parent.metadata:
                if not obj.metadata:
                    obj.metadata = parent.metadata.copy()
                else:
                    for mk, mv in parent.metadata.items():
                        if mk not in obj.metadata:
                            obj.metadata[mk] = mv

            # 3. Source URL: Propagate if missing
            if parent.source_url and not obj.source_url:
                obj.source_url = parent.source_url
            elif parent.url and not obj.source_url and not obj.url:
                # If parent had a URL and child has none, it's likely the source
                obj.source_url = parent.url

            # 4. Relationships: Merge missing keys
            if parent.relationships:
                if not obj.relationships:
                    obj.relationships = parent.relationships.copy()
                else:
                    for rk, rv in parent.relationships.items():
                        if rk not in obj.relationships:
                            obj.relationships[rk] = rv

            # 5. Extra (Notes/etc): Merge missing keys
            # Important for passing 'notes' payload (lyrics, captions)
            if parent.extra:
                if not obj.extra:
                    obj.extra = parent.extra.copy()
                else:
                    # Shallow-merge keys, but deep-merge the 'notes' dict
                    # when both sides hold dicts (lyrics, captions, etc.).
                    for ek, ev in parent.extra.items():
                        if ek not in obj.extra:
                            obj.extra[ek] = ev
                        elif ek == "notes" and isinstance(ev, dict) and isinstance(obj.extra[ek], dict):
                            # Merge notes dict
                            for nk, nv in ev.items():
                                if nk not in obj.extra[ek]:
                                    obj.extra[ek][nk] = nv

        normalized.append(obj)

    return normalized


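A minimal usage sketch (illustrative only: it assumes models.PipeObject can be constructed with keyword arguments for the attributes the merge reads, which this excerpt does not show):

    # Hypothetical objects; field names mirror the attributes used above.
    prev = [models.PipeObject(hash="abc", tag=["artist:x"], metadata={"title": "Song"})]
    new = [models.PipeObject(hash="def", parent_hash="abc")]

    merged = propagate_metadata(prev, new)
    # The lists have equal length, so the index heuristic (Strategy B)
    # links new[0] to prev[0]; merged[0] now carries the artist tag and
    # the title metadata from its source item.
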
def register_url_with_local_library(
    pipe_obj: models.PipeObject,
    config: Dict[str,
@@ -2518,12 +2634,12 @@ def resolve_tidal_manifest_path(item: Any) -> Optional[str]:

    if candidate_path:
        m = re.search(
-            r"tidal:(?://)?track[\\/](\d+)",
+            r"(tidal|hifi):(?://)?track[\\/](\d+)",
            str(candidate_path),
            flags=re.IGNORECASE,
        )
        if m:
-            track_id = m.group(1)
+            track_id = m.group(2)

    if (not already) and track_id is not None:
        try:
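The alternation adds a capturing group in front of the track ID, which is why the group index moves from 1 to 2. A quick check in plain Python:

    import re

    pattern = r"(tidal|hifi):(?://)?track[\\/](\d+)"
    m = re.search(pattern, "hifi://track/12345", flags=re.IGNORECASE)
    assert m is not None
    assert m.group(1) == "hifi"   # the new scheme group
    assert m.group(2) == "12345"  # the track ID, previously group(1)

A non-capturing group, r"(?:tidal|hifi):...", would have kept the ID in group(1) and made the second change unnecessary.
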
@@ -2706,3 +2822,327 @@ def resolve_tidal_manifest_path(item: Any) -> Optional[str]:
        return None

    return str(target_path)

def check_url_exists_in_storage(
    urls: Sequence[str],
    storage: Any,
    hydrus_available: bool,
    final_output_dir: Optional[Path] = None,
) -> bool:
    """Pre-flight check to see if URLs already exist in storage.

    Args:
        urls: List of URLs to check.
        storage: The storage interface.
        hydrus_available: Whether Hydrus is available.
        final_output_dir: Final output directory (skipped if it doubles as a storage backend).

    Returns:
        True if the check passed (no duplicates found, or the user chose to
        continue anyway); False if the user declined (the pipeline should stop).
    """
    if storage is None:
        debug("Bulk URL preflight skipped: storage unavailable")
        return True

    try:
        current_cmd_text = pipeline_context.get_current_command_text("")
    except Exception:
        current_cmd_text = ""

    try:
        stage_ctx = pipeline_context.get_stage_context()
    except Exception:
        stage_ctx = None

    in_pipeline = bool(stage_ctx is not None or ("|" in str(current_cmd_text or "")))
    if in_pipeline:
        try:
            cached_cmd = pipeline_context.load_value("preflight.url_duplicates.command", default="")
            cached_decision = pipeline_context.load_value("preflight.url_duplicates.continue", default=None)
        except Exception:
            cached_cmd = ""
            cached_decision = None

        if cached_decision is not None and str(cached_cmd or "") == str(current_cmd_text or ""):
            if bool(cached_decision):
                return True
            try:
                pipeline_context.request_pipeline_stop(reason="duplicate-url declined", exit_code=0)
            except Exception:
                pass
            return False

    unique_urls: List[str] = []
    for u in urls or []:
        s = str(u or "").strip()
        if s and s not in unique_urls:
            unique_urls.append(s)
    if len(unique_urls) == 0:
        return True

    try:
        from SYS.metadata import normalize_urls
    except Exception:
        normalize_urls = None  # type: ignore[assignment]

    def _httpish(value: str) -> bool:
        try:
            return bool(value) and (value.startswith("http://") or value.startswith("https://"))
        except Exception:
            return False

    url_needles: Dict[str, List[str]] = {}
    for u in unique_urls:
        needles: List[str] = []
        if normalize_urls is not None:
            try:
                needles.extend([n for n in (normalize_urls(u) or []) if isinstance(n, str)])
            except Exception:
                needles = []
        if not needles:
            needles = [u]
        filtered: List[str] = []
        for n in needles:
            n2 = str(n or "").strip()
            if not n2:
                continue
            if not _httpish(n2):
                continue
            if n2 not in filtered:
                filtered.append(n2)
        url_needles[u] = filtered if filtered else [u]

    backend_names: List[str] = []
    try:
        backend_names_all = storage.list_searchable_backends()
    except Exception:
        backend_names_all = []

    for backend_name in backend_names_all:
        try:
            backend = storage[backend_name]
        except Exception:
            continue

        try:
            if str(backend_name).strip().lower() == "temp":
                continue
        except Exception:
            pass

        try:
            backend_location = getattr(backend, "_location", None)
            if backend_location and final_output_dir:
                backend_path = Path(str(backend_location)).expanduser().resolve()
                temp_path = Path(str(final_output_dir)).expanduser().resolve()
                if backend_path == temp_path:
                    continue
        except Exception:
            pass

        backend_names.append(backend_name)

    if not backend_names:
        debug("Bulk URL preflight skipped: no searchable backends")
        return True

    seen_pairs: set[tuple[str, str]] = set()
    matched_urls: set[str] = set()
    match_rows: List[Dict[str, Any]] = []
    max_rows = 200

    try:
        from Store.HydrusNetwork import HydrusNetwork
    except Exception:
        HydrusNetwork = None  # type: ignore

    for backend_name in backend_names:
        if len(match_rows) >= max_rows:
            break
        try:
            backend = storage[backend_name]
        except Exception:
            continue

        if HydrusNetwork is not None and isinstance(backend, HydrusNetwork):
            if not hydrus_available:
                continue

            client = getattr(backend, "_client", None)
            if client is None:
                continue

            for original_url, needles in url_needles.items():
                if len(match_rows) >= max_rows:
                    break
                if (original_url, str(backend_name)) in seen_pairs:
                    continue

                found_hash: Optional[str] = None
                found = False
                for needle in (needles or [])[:3]:
                    if not _httpish(needle):
                        continue
                    try:
                        from API.HydrusNetwork import HydrusRequestSpec

                        spec = HydrusRequestSpec(
                            method="GET",
                            endpoint="/add_urls/get_url_files",
                            query={"url": needle},
                        )
                        # Access internal client safely if possible, else skip check
                        if hasattr(client, "_perform_request"):
                            response = client._perform_request(spec)
                            raw_hashes = None
                            if isinstance(response, dict):
                                raw_hashes = response.get("hashes") or response.get("file_hashes")
                                raw_ids = response.get("file_ids")
                                has_ids = isinstance(raw_ids, list) and len(raw_ids) > 0
                                has_hashes = isinstance(raw_hashes, list) and len(raw_hashes) > 0
                                if has_hashes:
                                    try:
                                        found_hash = str(raw_hashes[0]).strip()
                                    except Exception:
                                        found_hash = None
                                if has_ids or has_hashes:
                                    found = True
                                    break
                    except Exception:
                        continue

                if not found:
                    continue

                seen_pairs.add((original_url, str(backend_name)))
                matched_urls.add(original_url)
                display_row = {
                    "title": "(exists)",
                    "store": str(backend_name),
                    "hash": found_hash or "",
                    "url": original_url,
                    "columns": [
                        ("Title", "(exists)"),
                        ("Store", str(backend_name)),
                        ("Hash", found_hash or ""),
                        ("URL", original_url),
                    ],
                }
                match_rows.append(display_row)
            continue

        for original_url, needles in url_needles.items():
            if len(match_rows) >= max_rows:
                break
            if (original_url, str(backend_name)) in seen_pairs:
                continue

            backend_hits: List[Dict[str, Any]] = []
            for needle in (needles or [])[:3]:
                try:
                    backend_hits = backend.search(f"url:{needle}", limit=1) or []
                    if backend_hits:
                        break
                except Exception:
                    continue

            if not backend_hits:
                continue

            seen_pairs.add((original_url, str(backend_name)))
            matched_urls.add(original_url)
            hit = backend_hits[0]
            title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)"
            file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or ""

            try:
                from SYS.result_table import build_display_row
                extracted = build_display_row(hit, keys=["title", "store", "hash", "ext", "size"])
            except Exception:
                extracted = {}

            extracted["title"] = str(title)
            extracted["store"] = str(hit.get("store") or backend_name)
            extracted["hash"] = str(file_hash or "")

            ext = extracted.get("ext")
            size_val = extracted.get("size")

            display_row = {
                "title": str(title),
                "store": str(hit.get("store") or backend_name),
                "hash": str(file_hash or ""),
                "ext": str(ext or ""),
                "size": size_val,
                "url": original_url,
                "columns": [
                    ("Title", str(title)),
                    ("Store", str(hit.get("store") or backend_name)),
                    ("Hash", str(file_hash or "")),
                    ("Ext", str(ext or "")),
                    ("Size", size_val),
                    ("URL", original_url),
                ],
            }
            match_rows.append(display_row)

    if not match_rows:
        debug("Bulk URL preflight: no matches")
        return True

    table = ResultTable(f"URL already exists ({len(matched_urls)} url(s))", max_columns=10)
    table.set_no_choice(True)
    try:
        table.set_preserve_order(True)
    except Exception:
        pass

    for row in match_rows:
        table.add_result(row)

    try:
        pipeline_context.set_last_result_table_overlay(table, match_rows)
    except Exception:
        pass

    suspend = getattr(pipeline_context, "suspend_live_progress", None)
    cm: AbstractContextManager[Any] = nullcontext()
    if callable(suspend):
        try:
            maybe_cm = suspend()
            if maybe_cm is not None:
                cm = maybe_cm  # type: ignore[assignment]
        except Exception:
            cm = nullcontext()

    with cm:
        get_stderr_console().print(table)
        setattr(table, "_rendered_by_cmdlet", True)
        answered_yes = bool(Confirm.ask("Continue anyway?", default=False, console=get_stderr_console()))

    if in_pipeline:
        try:
            existing = pipeline_context.load_value("preflight", default=None)
        except Exception:
            existing = None
        preflight_cache: Dict[str, Any] = existing if isinstance(existing, dict) else {}
        url_dup_cache = preflight_cache.get("url_duplicates")
        if not isinstance(url_dup_cache, dict):
            url_dup_cache = {}
        url_dup_cache["command"] = str(current_cmd_text or "")
        url_dup_cache["continue"] = bool(answered_yes)
        preflight_cache["url_duplicates"] = url_dup_cache
        try:
            pipeline_context.store_value("preflight", preflight_cache)
        except Exception:
            pass

    if not answered_yes:
        if in_pipeline:
            try:
                pipeline_context.request_pipeline_stop(reason="duplicate-url declined", exit_code=0)
            except Exception:
                pass
        return False
    return True
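
A hedged usage sketch of the preflight (the call site and the storage object here are assumptions; the function only requires that storage index backends by name and that each backend expose search()):

    # Hypothetical call before a bulk download:
    urls = ["https://example.com/track/1", "https://example.com/track/2"]
    ok = check_url_exists_in_storage(
        urls,
        storage=storage,         # mapping-like: storage[name] -> backend with .search()
        hydrus_available=False,  # skips the Hydrus client path
        final_output_dir=Path("~/Downloads").expanduser(),
    )
    if not ok:
        return  # user declined; inside a pipeline, a stop has already been requested

Inside a pipeline, the yes/no answer is cached under the "preflight" key together with the command text, so re-running the same command does not re-prompt.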