This commit is contained in:
2026-01-17 02:36:06 -08:00
parent 3a7c443004
commit c6fd6b4224
9 changed files with 440 additions and 226 deletions

View File

@@ -10,6 +10,7 @@ import shutil
import sys
import tempfile
from collections.abc import Iterable as IterableABC
from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
from SYS.logger import log, debug
from pathlib import Path
@@ -3074,6 +3075,19 @@ def check_url_exists_in_storage(
pass
return False
def _load_preflight_cache() -> Dict[str, Any]:
    """Fetch the pipeline's persisted preflight cache.

    Returns the stored dict, or an empty dict when nothing is stored,
    the stored value is not a dict, or the lookup itself fails.
    """
    cached: Any = None
    try:
        cached = pipeline_context.load_value("preflight", default=None)
    except Exception:
        # Cache lookup is best-effort; fall through to the empty default.
        cached = None
    if isinstance(cached, dict):
        return cached
    return {}
def _store_preflight_cache(cache: Dict[str, Any]) -> None:
    """Best-effort persist of the preflight cache into the pipeline context.

    Persistence is advisory: any failure while storing is swallowed so the
    preflight check itself never breaks on a cache write.
    """
    try:
        pipeline_context.store_value("preflight", cache)
    except Exception:
        # NOTE: deliberately ignored — see docstring.
        pass
unique_urls: List[str] = []
for u in urls or []:
s = str(u or "").strip()
@@ -3093,6 +3107,56 @@ def check_url_exists_in_storage(
except Exception:
return False
def _expand_url_variants(value: str) -> List[str]:
    """Return normalized lookup variants of an http(s) URL.

    Variants produced (in order), each deduplicated and never equal to the
    input itself:
      1. the URL with its fragment stripped, and
      2. the URL with timestamp params (t/start/time_continue/...) and
         ``utm_*`` tracking params removed (fragment also stripped).
    Non-http(s) or unparsable values yield an empty list.
    """
    if not _httpish(value):
        return []
    try:
        parsed = urlparse(value)
    except Exception:
        return []
    if parsed.scheme.lower() not in {"http", "https"}:
        return []

    variants: List[str] = []

    def _collect(candidate: str) -> None:
        # Skip empties, the original URL itself, and duplicates.
        if candidate and candidate != value and candidate not in variants:
            variants.append(candidate)

    if parsed.fragment:
        _collect(urlunparse(parsed._replace(fragment="")))

    timestamp_params = {"t", "start", "time_continue", "timestamp", "time", "begin"}
    tracking_prefixes = ("utm_",)
    try:
        pairs = parse_qsl(parsed.query, keep_blank_values=True)
    except Exception:
        pairs = []
    if pairs or parsed.fragment:
        kept = []
        dropped_any = False
        for key, val in pairs:
            lowered = str(key or "").lower()
            if lowered in timestamp_params or lowered.startswith(tracking_prefixes):
                dropped_any = True
            else:
                kept.append((key, val))
        # Only emit a query-filtered variant when something was actually removed.
        if dropped_any:
            rebuilt = urlencode(kept, doseq=True) if kept else ""
            _collect(urlunparse(parsed._replace(query=rebuilt, fragment="")))
    return variants
url_needles: Dict[str, List[str]] = {}
for u in unique_urls:
needles: List[str] = []
@@ -3112,7 +3176,88 @@ def check_url_exists_in_storage(
continue
if n2 not in filtered:
filtered.append(n2)
url_needles[u] = filtered if filtered else [u]
expanded: List[str] = []
for n2 in filtered:
for extra in _expand_url_variants(n2):
if extra not in expanded and extra not in filtered:
expanded.append(extra)
combined = filtered + expanded
url_needles[u] = combined if combined else [u]
if in_pipeline:
preflight_cache = _load_preflight_cache()
url_dup_cache = preflight_cache.get("url_duplicates")
if not isinstance(url_dup_cache, dict):
url_dup_cache = {}
cached_urls = url_dup_cache.get("urls")
cached_set = {str(u) for u in cached_urls} if isinstance(cached_urls, list) else set()
if cached_set:
all_cached = True
for original_url, needles in url_needles.items():
if original_url in cached_set:
continue
if any(n in cached_set for n in (needles or [])):
continue
all_cached = False
break
if all_cached:
debug("Bulk URL preflight: cached for pipeline; skipping duplicate check")
return True
def _search_backend_url_hits(
backend: Any,
backend_name: str,
original_url: str,
needles: Sequence[str],
) -> Optional[Dict[str, Any]]:
backend_hits: List[Dict[str, Any]] = []
for needle in (needles or [])[:3]:
try:
backend_hits = backend.search(f"url:{needle}", limit=1) or []
if backend_hits:
break
except Exception:
continue
if not backend_hits:
return None
hit = backend_hits[0]
title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)"
file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or ""
try:
from SYS.result_table import build_display_row
extracted = build_display_row(hit, keys=["title", "store", "hash", "ext", "size"])
except Exception:
extracted = {}
extracted["title"] = str(title)
extracted["store"] = str(hit.get("store") or backend_name)
extracted["hash"] = str(file_hash or "")
ext = extracted.get("ext")
size_val = extracted.get("size")
return {
"title": str(title),
"store": str(hit.get("store") or backend_name),
"hash": str(file_hash or ""),
"ext": str(ext or ""),
"size": size_val,
"url": original_url,
"columns": [
("Title", str(title)),
("Store", str(hit.get("store") or backend_name)),
("Hash", str(file_hash or "")),
("Ext", str(ext or "")),
("Size", size_val),
("URL", original_url),
],
}
backend_names: List[str] = []
try:
@@ -3167,12 +3312,11 @@ def check_url_exists_in_storage(
continue
if HydrusNetwork is not None and isinstance(backend, HydrusNetwork):
if not hydrus_available:
continue
client = getattr(backend, "_client", None)
if client is None:
continue
if not hydrus_available:
debug("Bulk URL preflight: hydrus availability check failed; attempting best-effort lookup")
for original_url, needles in url_needles.items():
if len(match_rows) >= max_rows:
@@ -3214,6 +3358,11 @@ def check_url_exists_in_storage(
continue
if not found:
fallback_row = _search_backend_url_hits(backend, str(backend_name), original_url, needles)
if fallback_row:
seen_pairs.add((original_url, str(backend_name)))
matched_urls.add(original_url)
match_rows.append(fallback_row)
continue
seen_pairs.add((original_url, str(backend_name)))
@@ -3239,57 +3388,33 @@ def check_url_exists_in_storage(
if (original_url, str(backend_name)) in seen_pairs:
continue
backend_hits: List[Dict[str, Any]] = []
for needle in (needles or [])[:3]:
try:
backend_hits = backend.search(f"url:{needle}", limit=1) or []
if backend_hits:
break
except Exception:
continue
if not backend_hits:
display_row = _search_backend_url_hits(backend, str(backend_name), original_url, needles)
if not display_row:
continue
seen_pairs.add((original_url, str(backend_name)))
matched_urls.add(original_url)
hit = backend_hits[0]
title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)"
file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or ""
try:
from SYS.result_table import build_display_row
extracted = build_display_row(hit, keys=["title", "store", "hash", "ext", "size"])
except Exception:
extracted = {}
extracted["title"] = str(title)
extracted["store"] = str(hit.get("store") or backend_name)
extracted["hash"] = str(file_hash or "")
ext = extracted.get("ext")
size_val = extracted.get("size")
display_row = {
"title": str(title),
"store": str(hit.get("store") or backend_name),
"hash": str(file_hash or ""),
"ext": str(ext or ""),
"size": size_val,
"url": original_url,
"columns": [
("Title", str(title)),
("Store", str(hit.get("store") or backend_name)),
("Hash", str(file_hash or "")),
("Ext", str(ext or "")),
("Size", size_val),
("URL", original_url),
],
}
match_rows.append(display_row)
if not match_rows:
debug("Bulk URL preflight: no matches")
if in_pipeline:
preflight_cache = _load_preflight_cache()
url_dup_cache = preflight_cache.get("url_duplicates")
if not isinstance(url_dup_cache, dict):
url_dup_cache = {}
cached_urls = url_dup_cache.get("urls")
cached_set = {str(u) for u in cached_urls} if isinstance(cached_urls, list) else set()
for original_url, needles in url_needles.items():
cached_set.add(original_url)
for needle in needles or []:
cached_set.add(str(needle))
url_dup_cache["urls"] = sorted(cached_set)
preflight_cache["url_duplicates"] = url_dup_cache
_store_preflight_cache(preflight_cache)
return True
table = ResultTable(f"URL already exists ({len(matched_urls)} url(s))", max_columns=10)
@@ -3333,6 +3458,13 @@ def check_url_exists_in_storage(
url_dup_cache = {}
url_dup_cache["command"] = str(current_cmd_text or "")
url_dup_cache["continue"] = bool(answered_yes)
cached_urls = url_dup_cache.get("urls")
cached_set = {str(u) for u in cached_urls} if isinstance(cached_urls, list) else set()
for original_url, needles in url_needles.items():
cached_set.add(original_url)
for needle in needles or []:
cached_set.add(str(needle))
url_dup_cache["urls"] = sorted(cached_set)
preflight_cache["url_duplicates"] = url_dup_cache
try:
pipeline_context.store_value("preflight", preflight_cache)