f
This commit is contained in:
@@ -10,6 +10,7 @@ import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
from collections.abc import Iterable as IterableABC
|
||||
from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
|
||||
|
||||
from SYS.logger import log, debug
|
||||
from pathlib import Path
|
||||
@@ -3074,6 +3075,19 @@ def check_url_exists_in_storage(
|
||||
pass
|
||||
return False
|
||||
|
||||
def _load_preflight_cache() -> Dict[str, Any]:
    """Return the pipeline's cached preflight state, or an empty dict.

    A failed context read, or a cached value that is not a dict, is
    treated as "no cache" rather than propagated to the caller.
    """
    cached: Any = None
    try:
        cached = pipeline_context.load_value("preflight", default=None)
    except Exception:
        cached = None
    if isinstance(cached, dict):
        return cached
    return {}
|
||||
|
||||
def _store_preflight_cache(cache: Dict[str, Any]) -> None:
    """Best-effort persist of *cache* into the pipeline context.

    Storage failures are deliberately swallowed: losing the cache only
    costs a repeated duplicate-check later, never a pipeline abort.
    """
    try:
        pipeline_context.store_value("preflight", cache)
    except Exception:
        # Best-effort write; never let persistence break the caller.
        pass
|
||||
|
||||
unique_urls: List[str] = []
|
||||
for u in urls or []:
|
||||
s = str(u or "").strip()
|
||||
@@ -3093,6 +3107,56 @@ def check_url_exists_in_storage(
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _expand_url_variants(value: str) -> List[str]:
    """Produce alternate spellings of *value* likely to name the same resource.

    Variants generated: the URL with its fragment stripped, and the URL
    with time-offset (``t``, ``start``, …) and ``utm_*`` tracking query
    parameters removed.  Only http(s) URLs are expanded; unparseable
    input yields no variants.  The original spelling is never included.
    """
    if not _httpish(value):
        return []
    try:
        parts = urlparse(value)
    except Exception:
        return []
    if parts.scheme.lower() not in {"http", "https"}:
        return []

    variants: List[str] = []

    def _collect(candidate: str) -> None:
        # Skip empties, the original spelling, and duplicates.
        if candidate and candidate != value and candidate not in variants:
            variants.append(candidate)

    # Variant 1: drop the fragment entirely.
    if parts.fragment:
        _collect(urlunparse(parts._replace(fragment="")))

    drop_exact = {"t", "start", "time_continue", "timestamp", "time", "begin"}
    drop_prefix = ("utm_",)

    try:
        pairs = parse_qsl(parts.query, keep_blank_values=True)
    except Exception:
        pairs = []

    # Variant 2: strip time-offset / tracking params (fragment also dropped).
    if pairs or parts.fragment:
        kept = [
            (key, val)
            for key, val in pairs
            if str(key or "").lower() not in drop_exact
            and not str(key or "").lower().startswith(drop_prefix)
        ]
        if len(kept) != len(pairs):
            rebuilt = urlencode(kept, doseq=True) if kept else ""
            _collect(urlunparse(parts._replace(query=rebuilt, fragment="")))

    return variants
|
||||
|
||||
url_needles: Dict[str, List[str]] = {}
|
||||
for u in unique_urls:
|
||||
needles: List[str] = []
|
||||
@@ -3112,7 +3176,88 @@ def check_url_exists_in_storage(
|
||||
continue
|
||||
if n2 not in filtered:
|
||||
filtered.append(n2)
|
||||
url_needles[u] = filtered if filtered else [u]
|
||||
expanded: List[str] = []
|
||||
for n2 in filtered:
|
||||
for extra in _expand_url_variants(n2):
|
||||
if extra not in expanded and extra not in filtered:
|
||||
expanded.append(extra)
|
||||
|
||||
combined = filtered + expanded
|
||||
url_needles[u] = combined if combined else [u]
|
||||
|
||||
if in_pipeline:
|
||||
preflight_cache = _load_preflight_cache()
|
||||
url_dup_cache = preflight_cache.get("url_duplicates")
|
||||
if not isinstance(url_dup_cache, dict):
|
||||
url_dup_cache = {}
|
||||
cached_urls = url_dup_cache.get("urls")
|
||||
cached_set = {str(u) for u in cached_urls} if isinstance(cached_urls, list) else set()
|
||||
|
||||
if cached_set:
|
||||
all_cached = True
|
||||
for original_url, needles in url_needles.items():
|
||||
if original_url in cached_set:
|
||||
continue
|
||||
if any(n in cached_set for n in (needles or [])):
|
||||
continue
|
||||
all_cached = False
|
||||
break
|
||||
|
||||
if all_cached:
|
||||
debug("Bulk URL preflight: cached for pipeline; skipping duplicate check")
|
||||
return True
|
||||
|
||||
def _search_backend_url_hits(
|
||||
backend: Any,
|
||||
backend_name: str,
|
||||
original_url: str,
|
||||
needles: Sequence[str],
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
backend_hits: List[Dict[str, Any]] = []
|
||||
for needle in (needles or [])[:3]:
|
||||
try:
|
||||
backend_hits = backend.search(f"url:{needle}", limit=1) or []
|
||||
if backend_hits:
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
if not backend_hits:
|
||||
return None
|
||||
|
||||
hit = backend_hits[0]
|
||||
title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)"
|
||||
file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or ""
|
||||
|
||||
try:
|
||||
from SYS.result_table import build_display_row
|
||||
extracted = build_display_row(hit, keys=["title", "store", "hash", "ext", "size"])
|
||||
except Exception:
|
||||
extracted = {}
|
||||
|
||||
extracted["title"] = str(title)
|
||||
extracted["store"] = str(hit.get("store") or backend_name)
|
||||
extracted["hash"] = str(file_hash or "")
|
||||
|
||||
ext = extracted.get("ext")
|
||||
size_val = extracted.get("size")
|
||||
|
||||
return {
|
||||
"title": str(title),
|
||||
"store": str(hit.get("store") or backend_name),
|
||||
"hash": str(file_hash or ""),
|
||||
"ext": str(ext or ""),
|
||||
"size": size_val,
|
||||
"url": original_url,
|
||||
"columns": [
|
||||
("Title", str(title)),
|
||||
("Store", str(hit.get("store") or backend_name)),
|
||||
("Hash", str(file_hash or "")),
|
||||
("Ext", str(ext or "")),
|
||||
("Size", size_val),
|
||||
("URL", original_url),
|
||||
],
|
||||
}
|
||||
|
||||
backend_names: List[str] = []
|
||||
try:
|
||||
@@ -3167,12 +3312,11 @@ def check_url_exists_in_storage(
|
||||
continue
|
||||
|
||||
if HydrusNetwork is not None and isinstance(backend, HydrusNetwork):
|
||||
if not hydrus_available:
|
||||
continue
|
||||
|
||||
client = getattr(backend, "_client", None)
|
||||
if client is None:
|
||||
continue
|
||||
if not hydrus_available:
|
||||
debug("Bulk URL preflight: hydrus availability check failed; attempting best-effort lookup")
|
||||
|
||||
for original_url, needles in url_needles.items():
|
||||
if len(match_rows) >= max_rows:
|
||||
@@ -3214,6 +3358,11 @@ def check_url_exists_in_storage(
|
||||
continue
|
||||
|
||||
if not found:
|
||||
fallback_row = _search_backend_url_hits(backend, str(backend_name), original_url, needles)
|
||||
if fallback_row:
|
||||
seen_pairs.add((original_url, str(backend_name)))
|
||||
matched_urls.add(original_url)
|
||||
match_rows.append(fallback_row)
|
||||
continue
|
||||
|
||||
seen_pairs.add((original_url, str(backend_name)))
|
||||
@@ -3239,57 +3388,33 @@ def check_url_exists_in_storage(
|
||||
if (original_url, str(backend_name)) in seen_pairs:
|
||||
continue
|
||||
|
||||
backend_hits: List[Dict[str, Any]] = []
|
||||
for needle in (needles or [])[:3]:
|
||||
try:
|
||||
backend_hits = backend.search(f"url:{needle}", limit=1) or []
|
||||
if backend_hits:
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
if not backend_hits:
|
||||
display_row = _search_backend_url_hits(backend, str(backend_name), original_url, needles)
|
||||
if not display_row:
|
||||
continue
|
||||
|
||||
seen_pairs.add((original_url, str(backend_name)))
|
||||
matched_urls.add(original_url)
|
||||
hit = backend_hits[0]
|
||||
title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)"
|
||||
file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or ""
|
||||
|
||||
try:
|
||||
from SYS.result_table import build_display_row
|
||||
extracted = build_display_row(hit, keys=["title", "store", "hash", "ext", "size"])
|
||||
except Exception:
|
||||
extracted = {}
|
||||
|
||||
extracted["title"] = str(title)
|
||||
extracted["store"] = str(hit.get("store") or backend_name)
|
||||
extracted["hash"] = str(file_hash or "")
|
||||
|
||||
ext = extracted.get("ext")
|
||||
size_val = extracted.get("size")
|
||||
|
||||
display_row = {
|
||||
"title": str(title),
|
||||
"store": str(hit.get("store") or backend_name),
|
||||
"hash": str(file_hash or ""),
|
||||
"ext": str(ext or ""),
|
||||
"size": size_val,
|
||||
"url": original_url,
|
||||
"columns": [
|
||||
("Title", str(title)),
|
||||
("Store", str(hit.get("store") or backend_name)),
|
||||
("Hash", str(file_hash or "")),
|
||||
("Ext", str(ext or "")),
|
||||
("Size", size_val),
|
||||
("URL", original_url),
|
||||
],
|
||||
}
|
||||
match_rows.append(display_row)
|
||||
|
||||
if not match_rows:
|
||||
debug("Bulk URL preflight: no matches")
|
||||
if in_pipeline:
|
||||
preflight_cache = _load_preflight_cache()
|
||||
url_dup_cache = preflight_cache.get("url_duplicates")
|
||||
if not isinstance(url_dup_cache, dict):
|
||||
url_dup_cache = {}
|
||||
|
||||
cached_urls = url_dup_cache.get("urls")
|
||||
cached_set = {str(u) for u in cached_urls} if isinstance(cached_urls, list) else set()
|
||||
|
||||
for original_url, needles in url_needles.items():
|
||||
cached_set.add(original_url)
|
||||
for needle in needles or []:
|
||||
cached_set.add(str(needle))
|
||||
|
||||
url_dup_cache["urls"] = sorted(cached_set)
|
||||
preflight_cache["url_duplicates"] = url_dup_cache
|
||||
_store_preflight_cache(preflight_cache)
|
||||
return True
|
||||
|
||||
table = ResultTable(f"URL already exists ({len(matched_urls)} url(s))", max_columns=10)
|
||||
@@ -3333,6 +3458,13 @@ def check_url_exists_in_storage(
|
||||
url_dup_cache = {}
|
||||
url_dup_cache["command"] = str(current_cmd_text or "")
|
||||
url_dup_cache["continue"] = bool(answered_yes)
|
||||
cached_urls = url_dup_cache.get("urls")
|
||||
cached_set = {str(u) for u in cached_urls} if isinstance(cached_urls, list) else set()
|
||||
for original_url, needles in url_needles.items():
|
||||
cached_set.add(original_url)
|
||||
for needle in needles or []:
|
||||
cached_set.add(str(needle))
|
||||
url_dup_cache["urls"] = sorted(cached_set)
|
||||
preflight_cache["url_duplicates"] = url_dup_cache
|
||||
try:
|
||||
pipeline_context.store_value("preflight", preflight_cache)
|
||||
|
||||
Reference in New Issue
Block a user