This commit is contained in:
2026-01-17 03:37:11 -08:00
parent c6fd6b4224
commit 5e76c44155
2 changed files with 294 additions and 22 deletions

View File

@@ -3060,20 +3060,17 @@ def check_url_exists_in_storage(
in_pipeline = bool(stage_ctx is not None or ("|" in str(current_cmd_text or ""))) in_pipeline = bool(stage_ctx is not None or ("|" in str(current_cmd_text or "")))
if in_pipeline: if in_pipeline:
try: try:
cached_cmd = pipeline_context.load_value("preflight.url_duplicates.command", default="") already_checked = bool(
cached_decision = pipeline_context.load_value("preflight.url_duplicates.continue", default=None) pipeline_context.load_value(
"preflight.url_duplicates.checked", default=False
)
)
except Exception: except Exception:
cached_cmd = "" already_checked = False
cached_decision = None
if cached_decision is not None and str(cached_cmd or "") == str(current_cmd_text or ""): if already_checked:
if bool(cached_decision): debug("Bulk URL preflight: already checked in pipeline; skipping duplicate check")
return True return True
try:
pipeline_context.request_pipeline_stop(reason="duplicate-url declined", exit_code=0)
except Exception:
pass
return False
def _load_preflight_cache() -> Dict[str, Any]: def _load_preflight_cache() -> Dict[str, Any]:
try: try:
@@ -3088,6 +3085,40 @@ def check_url_exists_in_storage(
except Exception: except Exception:
pass pass
def _mark_preflight_checked() -> None:
    """Flag the URL-duplicate preflight as done for the current pipeline.

    Does nothing outside a pipeline. Inside one, the flag is written to the
    pipeline context under ``preflight.url_duplicates.checked`` and mirrored
    into the preflight cache, both as the top-level ``url_duplicates_checked``
    key and as ``url_duplicates["checked"]``.
    """
    if in_pipeline:
        # Best effort: a failing context store must not abort the preflight.
        try:
            pipeline_context.store_value("preflight.url_duplicates.checked", True)
        except Exception:
            pass

        cache = _load_preflight_cache()
        cache["url_duplicates_checked"] = True

        # Reuse the existing per-feature section only if it is a dict;
        # anything else is replaced by a fresh one.
        section = cache.get("url_duplicates")
        section = section if isinstance(section, dict) else {}
        section["checked"] = True

        cache["url_duplicates"] = section
        _store_preflight_cache(cache)
if in_pipeline:
try:
cached_cmd = pipeline_context.load_value("preflight.url_duplicates.command", default="")
cached_decision = pipeline_context.load_value("preflight.url_duplicates.continue", default=None)
except Exception:
cached_cmd = ""
cached_decision = None
if cached_decision is not None and str(cached_cmd or "") == str(current_cmd_text or ""):
_mark_preflight_checked()
if bool(cached_decision):
return True
try:
pipeline_context.request_pipeline_stop(reason="duplicate-url declined", exit_code=0)
except Exception:
pass
return False
unique_urls: List[str] = [] unique_urls: List[str] = []
for u in urls or []: for u in urls or []:
s = str(u or "").strip() s = str(u or "").strip()
@@ -3107,6 +3138,46 @@ def check_url_exists_in_storage(
except Exception: except Exception:
return False return False
def _normalize_url_for_search(value: str) -> str:
url = str(value or "").strip()
# Strip fragment (e.g., #t=10) before matching
url = url.split("#", 1)[0]
# Strip common time/tracking query params for matching
try:
parsed = urlparse(url)
except Exception:
parsed = None
if parsed is not None and parsed.query:
time_keys = {"t", "start", "time_continue", "timestamp", "time", "begin"}
tracking_prefixes = ("utm_",)
try:
pairs = parse_qsl(parsed.query, keep_blank_values=True)
filtered = []
for key, val in pairs:
key_norm = str(key or "").lower()
if key_norm in time_keys:
continue
if key_norm.startswith(tracking_prefixes):
continue
filtered.append((key, val))
if filtered:
url = urlunparse(parsed._replace(query=urlencode(filtered, doseq=True)))
else:
url = urlunparse(parsed._replace(query=""))
except Exception:
pass
# Remove protocol (http://, https://, ftp://, etc.)
url = re.sub(r"^[a-z][a-z0-9+.-]*://", "", url, flags=re.IGNORECASE)
# Remove www. prefix (case-insensitive)
url = re.sub(r"^www\.", "", url, flags=re.IGNORECASE)
return url.lower()
def _expand_url_variants(value: str) -> List[str]: def _expand_url_variants(value: str) -> List[str]:
if not _httpish(value): if not _httpish(value):
return [] return []
@@ -3121,6 +3192,51 @@ def check_url_exists_in_storage(
out: List[str] = [] out: List[str] = []
def _add_variant(candidate: str) -> None:
    """Register *candidate* together with its lowercase and ``www.``-less forms.

    Every form is passed to ``_maybe_add``, which ignores empty candidates and
    candidates equal to the original value. The forms offered are:

    * the candidate as given,
    * the candidate lowercased,
    * the candidate with its host lowercased and a leading ``www.`` removed
      (port and userinfo are carried over), and
    * that host-normalized form lowercased.

    Args:
        candidate: URL text to derive variants from.
    """
    _maybe_add(candidate)
    try:
        lower = str(candidate or "").lower()
    except Exception:
        lower = ""
    if lower and lower != candidate:
        _maybe_add(lower)

    try:
        parsed_candidate = urlparse(candidate)
    except Exception:
        # Unparseable input: only the literal forms above are offered.
        return

    host = (parsed_candidate.hostname or "").strip().lower()
    if host.startswith("www."):
        host = host[4:]
    if not host:
        return

    # ``hostname`` returns IPv6 literals without their brackets; put them back
    # so that appending ":port" does not produce an ambiguous, invalid netloc.
    netloc = f"[{host}]" if ":" in host else host

    try:
        # ``.port`` raises ValueError on a malformed port; keep the bare host then.
        if parsed_candidate.port:
            netloc = f"{netloc}:{parsed_candidate.port}"
    except Exception:
        pass

    try:
        if parsed_candidate.username or parsed_candidate.password:
            userinfo = parsed_candidate.username or ""
            if parsed_candidate.password:
                userinfo = f"{userinfo}:{parsed_candidate.password}"
            if userinfo:
                netloc = f"{userinfo}@{netloc}"
    except Exception:
        pass

    alt = urlunparse(parsed_candidate._replace(netloc=netloc))
    _maybe_add(alt)

    lower_alt = alt.lower()
    if lower_alt and lower_alt != alt:
        _maybe_add(lower_alt)
def _maybe_add(candidate: str) -> None: def _maybe_add(candidate: str) -> None:
if not candidate or candidate == value: if not candidate or candidate == value:
return return
@@ -3128,7 +3244,7 @@ def check_url_exists_in_storage(
out.append(candidate) out.append(candidate)
if parsed.fragment: if parsed.fragment:
_maybe_add(urlunparse(parsed._replace(fragment=""))) _add_variant(urlunparse(parsed._replace(fragment="")))
time_keys = {"t", "start", "time_continue", "timestamp", "time", "begin"} time_keys = {"t", "start", "time_continue", "timestamp", "time", "begin"}
tracking_prefixes = ("utm_",) tracking_prefixes = ("utm_",)
@@ -3153,7 +3269,7 @@ def check_url_exists_in_storage(
if removed: if removed:
new_query = urlencode(filtered_pairs, doseq=True) if filtered_pairs else "" new_query = urlencode(filtered_pairs, doseq=True) if filtered_pairs else ""
_maybe_add(urlunparse(parsed._replace(query=new_query, fragment=""))) _add_variant(urlunparse(parsed._replace(query=new_query, fragment="")))
return out return out
@@ -3176,13 +3292,35 @@ def check_url_exists_in_storage(
continue continue
if n2 not in filtered: if n2 not in filtered:
filtered.append(n2) filtered.append(n2)
lowered: List[str] = []
for n2 in filtered:
try:
lower = n2.lower()
except Exception:
lower = ""
if lower and lower != n2 and lower not in filtered and lower not in lowered:
lowered.append(lower)
normalized: List[str] = []
for n2 in filtered:
norm = _normalize_url_for_search(n2)
if norm and norm not in normalized and norm not in filtered:
normalized.append(norm)
expanded: List[str] = [] expanded: List[str] = []
for n2 in filtered: for n2 in filtered:
for extra in _expand_url_variants(n2): for extra in _expand_url_variants(n2):
if extra not in expanded and extra not in filtered: if extra not in expanded and extra not in filtered and extra not in lowered:
expanded.append(extra) expanded.append(extra)
norm_extra = _normalize_url_for_search(extra)
if (
norm_extra
and norm_extra not in normalized
and norm_extra not in filtered
and norm_extra not in expanded
and norm_extra not in lowered
):
normalized.append(norm_extra)
combined = filtered + expanded combined = filtered + expanded + lowered + normalized
url_needles[u] = combined if combined else [u] url_needles[u] = combined if combined else [u]
if in_pipeline: if in_pipeline:
@@ -3196,15 +3334,28 @@ def check_url_exists_in_storage(
if cached_set: if cached_set:
all_cached = True all_cached = True
for original_url, needles in url_needles.items(): for original_url, needles in url_needles.items():
if original_url in cached_set: original_cached = str(original_url or "") in cached_set
needles_cached = True
if original_cached:
for needle in (needles or []):
needle_text = str(needle or "")
if not needle_text:
continue continue
if any(n in cached_set for n in (needles or [])): if needle_text not in cached_set:
needles_cached = False
break
else:
needles_cached = False
if original_cached and needles_cached:
continue continue
all_cached = False all_cached = False
break break
if all_cached: if all_cached:
debug("Bulk URL preflight: cached for pipeline; skipping duplicate check") debug("Bulk URL preflight: cached for pipeline; skipping duplicate check")
_mark_preflight_checked()
return True return True
def _search_backend_url_hits( def _search_backend_url_hits(
@@ -3215,14 +3366,103 @@ def check_url_exists_in_storage(
) -> Optional[Dict[str, Any]]: ) -> Optional[Dict[str, Any]]:
backend_hits: List[Dict[str, Any]] = [] backend_hits: List[Dict[str, Any]] = []
for needle in (needles or [])[:3]: for needle in (needles or [])[:3]:
needle_text = str(needle or "").strip()
if not needle_text:
continue
search_needle = _normalize_url_for_search(needle_text) or needle_text
query = f"url:*{search_needle}*"
try: try:
backend_hits = backend.search(f"url:{needle}", limit=1) or [] backend_hits = backend.search(query, limit=1) or []
if backend_hits: if backend_hits:
break break
except Exception: except Exception:
continue continue
if not backend_hits: if not backend_hits:
def _match_normalized_url(pattern_text: str, candidate_url: str) -> bool:
    """Tell whether the normalized pattern occurs inside the normalized candidate.

    Both arguments go through ``_normalize_url_for_search`` first. If either
    normalizes to an empty string the result is ``False``; otherwise the
    result is a substring test, which also covers the exact-match case.
    """
    needle = _normalize_url_for_search(pattern_text)
    haystack = _normalize_url_for_search(candidate_url)
    return bool(needle) and bool(haystack) and needle in haystack
fallback_hits: List[Dict[str, Any]] = []
try:
fallback_hits = backend.search("url:*", limit=200) or []
except Exception:
fallback_hits = []
for hit in fallback_hits:
url_values: List[str] = []
try:
raw_urls = get_field(hit, "known_urls") or get_field(hit, "urls") or get_field(hit, "url")
if isinstance(raw_urls, str) and raw_urls.strip():
url_values.append(raw_urls.strip())
elif isinstance(raw_urls, (list, tuple, set)):
for item in raw_urls:
if isinstance(item, str) and item.strip():
url_values.append(item.strip())
except Exception:
url_values = []
if not url_values:
try:
file_hash = hit.get("hash") if isinstance(hit, dict) else None
if file_hash:
fetched = backend.get_url(str(file_hash))
if isinstance(fetched, str) and fetched.strip():
url_values.append(fetched.strip())
elif isinstance(fetched, (list, tuple, set)):
for item in fetched:
if isinstance(item, str) and item.strip():
url_values.append(item.strip())
except Exception:
pass
if not url_values:
continue
matched = False
for url_value in url_values:
for needle in (needles or []):
if _match_normalized_url(str(needle or ""), str(url_value or "")):
matched = True
break
if matched:
break
if not matched:
continue
title = "(exists)"
try:
title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)"
except Exception:
title = "(exists)"
file_hash = ""
try:
file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or ""
except Exception:
file_hash = ""
return {
"title": str(title),
"store": str(hit.get("store") or backend_name),
"hash": str(file_hash or ""),
"ext": "",
"size": None,
"url": original_url,
"columns": [
("Title", str(title)),
("Store", str(hit.get("store") or backend_name)),
("Hash", str(file_hash or "")),
("URL", original_url),
],
}
return None return None
hit = backend_hits[0] hit = backend_hits[0]
@@ -3326,7 +3566,7 @@ def check_url_exists_in_storage(
found_hash: Optional[str] = None found_hash: Optional[str] = None
found = False found = False
for needle in (needles or [])[:3]: for needle in (needles or [])[:6]:
if not _httpish(needle): if not _httpish(needle):
continue continue
try: try:
@@ -3415,6 +3655,7 @@ def check_url_exists_in_storage(
url_dup_cache["urls"] = sorted(cached_set) url_dup_cache["urls"] = sorted(cached_set)
preflight_cache["url_duplicates"] = url_dup_cache preflight_cache["url_duplicates"] = url_dup_cache
_store_preflight_cache(preflight_cache) _store_preflight_cache(preflight_cache)
_mark_preflight_checked()
return True return True
table = ResultTable(f"URL already exists ({len(matched_urls)} url(s))", max_columns=10) table = ResultTable(f"URL already exists ({len(matched_urls)} url(s))", max_columns=10)
@@ -3477,6 +3718,8 @@ def check_url_exists_in_storage(
pipeline_context.request_pipeline_stop(reason="duplicate-url declined", exit_code=0) pipeline_context.request_pipeline_stop(reason="duplicate-url declined", exit_code=0)
except Exception: except Exception:
pass pass
_mark_preflight_checked()
return False return False
_mark_preflight_checked()
return True return True

View File

@@ -7,7 +7,7 @@ from typing import Any, Dict, List, Sequence, Optional, Set, Tuple
import sys import sys
import re import re
from fnmatch import fnmatch from fnmatch import fnmatch
from urllib.parse import urlparse from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse
from . import _shared as sh from . import _shared as sh
Cmdlet, SharedArgs, parse_cmdlet_args, get_field, normalize_hash = ( Cmdlet, SharedArgs, parse_cmdlet_args, get_field, normalize_hash = (
@@ -67,6 +67,35 @@ class Get_Url(Cmdlet):
""" """
url = str(url or "").strip() url = str(url or "").strip()
# Strip fragment (e.g., #t=10) before matching
url = url.split("#", 1)[0]
# Strip common time/tracking query params for matching
try:
parsed = urlparse(url)
except Exception:
parsed = None
if parsed is not None and parsed.query:
time_keys = {"t", "start", "time_continue", "timestamp", "time", "begin"}
tracking_prefixes = ("utm_",)
try:
pairs = parse_qsl(parsed.query, keep_blank_values=True)
filtered = []
for key, val in pairs:
key_norm = str(key or "").lower()
if key_norm in time_keys:
continue
if key_norm.startswith(tracking_prefixes):
continue
filtered.append((key, val))
if filtered:
url = urlunparse(parsed._replace(query=urlencode(filtered, doseq=True)))
else:
url = urlunparse(parsed._replace(query=""))
except Exception:
pass
# Remove protocol (http://, https://, ftp://, etc.) # Remove protocol (http://, https://, ftp://, etc.)
url = re.sub(r"^[a-z][a-z0-9+.-]*://", "", url, flags=re.IGNORECASE) url = re.sub(r"^[a-z][a-z0-9+.-]*://", "", url, flags=re.IGNORECASE)