d
This commit is contained in:
@@ -3060,20 +3060,17 @@ def check_url_exists_in_storage(
|
||||
in_pipeline = bool(stage_ctx is not None or ("|" in str(current_cmd_text or "")))
|
||||
if in_pipeline:
|
||||
try:
|
||||
cached_cmd = pipeline_context.load_value("preflight.url_duplicates.command", default="")
|
||||
cached_decision = pipeline_context.load_value("preflight.url_duplicates.continue", default=None)
|
||||
already_checked = bool(
|
||||
pipeline_context.load_value(
|
||||
"preflight.url_duplicates.checked", default=False
|
||||
)
|
||||
)
|
||||
except Exception:
|
||||
cached_cmd = ""
|
||||
cached_decision = None
|
||||
already_checked = False
|
||||
|
||||
if cached_decision is not None and str(cached_cmd or "") == str(current_cmd_text or ""):
|
||||
if bool(cached_decision):
|
||||
return True
|
||||
try:
|
||||
pipeline_context.request_pipeline_stop(reason="duplicate-url declined", exit_code=0)
|
||||
except Exception:
|
||||
pass
|
||||
return False
|
||||
if already_checked:
|
||||
debug("Bulk URL preflight: already checked in pipeline; skipping duplicate check")
|
||||
return True
|
||||
|
||||
def _load_preflight_cache() -> Dict[str, Any]:
|
||||
try:
|
||||
@@ -3088,6 +3085,40 @@ def check_url_exists_in_storage(
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _mark_preflight_checked() -> None:
|
||||
if not in_pipeline:
|
||||
return
|
||||
try:
|
||||
pipeline_context.store_value("preflight.url_duplicates.checked", True)
|
||||
except Exception:
|
||||
pass
|
||||
preflight_cache = _load_preflight_cache()
|
||||
preflight_cache["url_duplicates_checked"] = True
|
||||
url_dup_cache = preflight_cache.get("url_duplicates")
|
||||
if not isinstance(url_dup_cache, dict):
|
||||
url_dup_cache = {}
|
||||
url_dup_cache["checked"] = True
|
||||
preflight_cache["url_duplicates"] = url_dup_cache
|
||||
_store_preflight_cache(preflight_cache)
|
||||
|
||||
if in_pipeline:
|
||||
try:
|
||||
cached_cmd = pipeline_context.load_value("preflight.url_duplicates.command", default="")
|
||||
cached_decision = pipeline_context.load_value("preflight.url_duplicates.continue", default=None)
|
||||
except Exception:
|
||||
cached_cmd = ""
|
||||
cached_decision = None
|
||||
|
||||
if cached_decision is not None and str(cached_cmd or "") == str(current_cmd_text or ""):
|
||||
_mark_preflight_checked()
|
||||
if bool(cached_decision):
|
||||
return True
|
||||
try:
|
||||
pipeline_context.request_pipeline_stop(reason="duplicate-url declined", exit_code=0)
|
||||
except Exception:
|
||||
pass
|
||||
return False
|
||||
|
||||
unique_urls: List[str] = []
|
||||
for u in urls or []:
|
||||
s = str(u or "").strip()
|
||||
@@ -3107,6 +3138,46 @@ def check_url_exists_in_storage(
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _normalize_url_for_search(value: str) -> str:
|
||||
url = str(value or "").strip()
|
||||
|
||||
# Strip fragment (e.g., #t=10) before matching
|
||||
url = url.split("#", 1)[0]
|
||||
|
||||
# Strip common time/tracking query params for matching
|
||||
try:
|
||||
parsed = urlparse(url)
|
||||
except Exception:
|
||||
parsed = None
|
||||
|
||||
if parsed is not None and parsed.query:
|
||||
time_keys = {"t", "start", "time_continue", "timestamp", "time", "begin"}
|
||||
tracking_prefixes = ("utm_",)
|
||||
try:
|
||||
pairs = parse_qsl(parsed.query, keep_blank_values=True)
|
||||
filtered = []
|
||||
for key, val in pairs:
|
||||
key_norm = str(key or "").lower()
|
||||
if key_norm in time_keys:
|
||||
continue
|
||||
if key_norm.startswith(tracking_prefixes):
|
||||
continue
|
||||
filtered.append((key, val))
|
||||
if filtered:
|
||||
url = urlunparse(parsed._replace(query=urlencode(filtered, doseq=True)))
|
||||
else:
|
||||
url = urlunparse(parsed._replace(query=""))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Remove protocol (http://, https://, ftp://, etc.)
|
||||
url = re.sub(r"^[a-z][a-z0-9+.-]*://", "", url, flags=re.IGNORECASE)
|
||||
|
||||
# Remove www. prefix (case-insensitive)
|
||||
url = re.sub(r"^www\.", "", url, flags=re.IGNORECASE)
|
||||
|
||||
return url.lower()
|
||||
|
||||
def _expand_url_variants(value: str) -> List[str]:
|
||||
if not _httpish(value):
|
||||
return []
|
||||
@@ -3121,6 +3192,51 @@ def check_url_exists_in_storage(
|
||||
|
||||
out: List[str] = []
|
||||
|
||||
def _add_variant(candidate: str) -> None:
|
||||
_maybe_add(candidate)
|
||||
try:
|
||||
lower = str(candidate or "").lower()
|
||||
except Exception:
|
||||
lower = ""
|
||||
if lower and lower != candidate:
|
||||
_maybe_add(lower)
|
||||
|
||||
try:
|
||||
parsed_candidate = urlparse(candidate)
|
||||
except Exception:
|
||||
parsed_candidate = None
|
||||
|
||||
if parsed_candidate is None:
|
||||
return
|
||||
|
||||
host = (parsed_candidate.hostname or "").strip().lower()
|
||||
if host.startswith("www."):
|
||||
host = host[4:]
|
||||
if host:
|
||||
netloc = host
|
||||
try:
|
||||
if parsed_candidate.port:
|
||||
netloc = f"{netloc}:{parsed_candidate.port}"
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
if parsed_candidate.username or parsed_candidate.password:
|
||||
userinfo = parsed_candidate.username or ""
|
||||
if parsed_candidate.password:
|
||||
userinfo = f"{userinfo}:{parsed_candidate.password}"
|
||||
if userinfo:
|
||||
netloc = f"{userinfo}@{netloc}"
|
||||
except Exception:
|
||||
pass
|
||||
alt = urlunparse(parsed_candidate._replace(netloc=netloc))
|
||||
_maybe_add(alt)
|
||||
try:
|
||||
lower_alt = alt.lower()
|
||||
except Exception:
|
||||
lower_alt = ""
|
||||
if lower_alt and lower_alt != alt:
|
||||
_maybe_add(lower_alt)
|
||||
|
||||
def _maybe_add(candidate: str) -> None:
|
||||
if not candidate or candidate == value:
|
||||
return
|
||||
@@ -3128,7 +3244,7 @@ def check_url_exists_in_storage(
|
||||
out.append(candidate)
|
||||
|
||||
if parsed.fragment:
|
||||
_maybe_add(urlunparse(parsed._replace(fragment="")))
|
||||
_add_variant(urlunparse(parsed._replace(fragment="")))
|
||||
|
||||
time_keys = {"t", "start", "time_continue", "timestamp", "time", "begin"}
|
||||
tracking_prefixes = ("utm_",)
|
||||
@@ -3153,7 +3269,7 @@ def check_url_exists_in_storage(
|
||||
|
||||
if removed:
|
||||
new_query = urlencode(filtered_pairs, doseq=True) if filtered_pairs else ""
|
||||
_maybe_add(urlunparse(parsed._replace(query=new_query, fragment="")))
|
||||
_add_variant(urlunparse(parsed._replace(query=new_query, fragment="")))
|
||||
|
||||
return out
|
||||
|
||||
@@ -3176,13 +3292,35 @@ def check_url_exists_in_storage(
|
||||
continue
|
||||
if n2 not in filtered:
|
||||
filtered.append(n2)
|
||||
lowered: List[str] = []
|
||||
for n2 in filtered:
|
||||
try:
|
||||
lower = n2.lower()
|
||||
except Exception:
|
||||
lower = ""
|
||||
if lower and lower != n2 and lower not in filtered and lower not in lowered:
|
||||
lowered.append(lower)
|
||||
normalized: List[str] = []
|
||||
for n2 in filtered:
|
||||
norm = _normalize_url_for_search(n2)
|
||||
if norm and norm not in normalized and norm not in filtered:
|
||||
normalized.append(norm)
|
||||
expanded: List[str] = []
|
||||
for n2 in filtered:
|
||||
for extra in _expand_url_variants(n2):
|
||||
if extra not in expanded and extra not in filtered:
|
||||
if extra not in expanded and extra not in filtered and extra not in lowered:
|
||||
expanded.append(extra)
|
||||
norm_extra = _normalize_url_for_search(extra)
|
||||
if (
|
||||
norm_extra
|
||||
and norm_extra not in normalized
|
||||
and norm_extra not in filtered
|
||||
and norm_extra not in expanded
|
||||
and norm_extra not in lowered
|
||||
):
|
||||
normalized.append(norm_extra)
|
||||
|
||||
combined = filtered + expanded
|
||||
combined = filtered + expanded + lowered + normalized
|
||||
url_needles[u] = combined if combined else [u]
|
||||
|
||||
if in_pipeline:
|
||||
@@ -3196,15 +3334,28 @@ def check_url_exists_in_storage(
|
||||
if cached_set:
|
||||
all_cached = True
|
||||
for original_url, needles in url_needles.items():
|
||||
if original_url in cached_set:
|
||||
continue
|
||||
if any(n in cached_set for n in (needles or [])):
|
||||
original_cached = str(original_url or "") in cached_set
|
||||
needles_cached = True
|
||||
if original_cached:
|
||||
for needle in (needles or []):
|
||||
needle_text = str(needle or "")
|
||||
if not needle_text:
|
||||
continue
|
||||
if needle_text not in cached_set:
|
||||
needles_cached = False
|
||||
break
|
||||
else:
|
||||
needles_cached = False
|
||||
|
||||
if original_cached and needles_cached:
|
||||
continue
|
||||
|
||||
all_cached = False
|
||||
break
|
||||
|
||||
if all_cached:
|
||||
debug("Bulk URL preflight: cached for pipeline; skipping duplicate check")
|
||||
_mark_preflight_checked()
|
||||
return True
|
||||
|
||||
def _search_backend_url_hits(
|
||||
@@ -3215,14 +3366,103 @@ def check_url_exists_in_storage(
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
backend_hits: List[Dict[str, Any]] = []
|
||||
for needle in (needles or [])[:3]:
|
||||
needle_text = str(needle or "").strip()
|
||||
if not needle_text:
|
||||
continue
|
||||
search_needle = _normalize_url_for_search(needle_text) or needle_text
|
||||
query = f"url:*{search_needle}*"
|
||||
try:
|
||||
backend_hits = backend.search(f"url:{needle}", limit=1) or []
|
||||
backend_hits = backend.search(query, limit=1) or []
|
||||
if backend_hits:
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
if not backend_hits:
|
||||
def _match_normalized_url(pattern_text: str, candidate_url: str) -> bool:
|
||||
pattern_norm = _normalize_url_for_search(pattern_text)
|
||||
candidate_norm = _normalize_url_for_search(candidate_url)
|
||||
if not pattern_norm or not candidate_norm:
|
||||
return False
|
||||
if pattern_norm == candidate_norm:
|
||||
return True
|
||||
return pattern_norm in candidate_norm
|
||||
|
||||
fallback_hits: List[Dict[str, Any]] = []
|
||||
try:
|
||||
fallback_hits = backend.search("url:*", limit=200) or []
|
||||
except Exception:
|
||||
fallback_hits = []
|
||||
|
||||
for hit in fallback_hits:
|
||||
url_values: List[str] = []
|
||||
try:
|
||||
raw_urls = get_field(hit, "known_urls") or get_field(hit, "urls") or get_field(hit, "url")
|
||||
if isinstance(raw_urls, str) and raw_urls.strip():
|
||||
url_values.append(raw_urls.strip())
|
||||
elif isinstance(raw_urls, (list, tuple, set)):
|
||||
for item in raw_urls:
|
||||
if isinstance(item, str) and item.strip():
|
||||
url_values.append(item.strip())
|
||||
except Exception:
|
||||
url_values = []
|
||||
|
||||
if not url_values:
|
||||
try:
|
||||
file_hash = hit.get("hash") if isinstance(hit, dict) else None
|
||||
if file_hash:
|
||||
fetched = backend.get_url(str(file_hash))
|
||||
if isinstance(fetched, str) and fetched.strip():
|
||||
url_values.append(fetched.strip())
|
||||
elif isinstance(fetched, (list, tuple, set)):
|
||||
for item in fetched:
|
||||
if isinstance(item, str) and item.strip():
|
||||
url_values.append(item.strip())
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if not url_values:
|
||||
continue
|
||||
|
||||
matched = False
|
||||
for url_value in url_values:
|
||||
for needle in (needles or []):
|
||||
if _match_normalized_url(str(needle or ""), str(url_value or "")):
|
||||
matched = True
|
||||
break
|
||||
if matched:
|
||||
break
|
||||
|
||||
if not matched:
|
||||
continue
|
||||
|
||||
title = "(exists)"
|
||||
try:
|
||||
title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)"
|
||||
except Exception:
|
||||
title = "(exists)"
|
||||
|
||||
file_hash = ""
|
||||
try:
|
||||
file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or ""
|
||||
except Exception:
|
||||
file_hash = ""
|
||||
|
||||
return {
|
||||
"title": str(title),
|
||||
"store": str(hit.get("store") or backend_name),
|
||||
"hash": str(file_hash or ""),
|
||||
"ext": "",
|
||||
"size": None,
|
||||
"url": original_url,
|
||||
"columns": [
|
||||
("Title", str(title)),
|
||||
("Store", str(hit.get("store") or backend_name)),
|
||||
("Hash", str(file_hash or "")),
|
||||
("URL", original_url),
|
||||
],
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
hit = backend_hits[0]
|
||||
@@ -3326,7 +3566,7 @@ def check_url_exists_in_storage(
|
||||
|
||||
found_hash: Optional[str] = None
|
||||
found = False
|
||||
for needle in (needles or [])[:3]:
|
||||
for needle in (needles or [])[:6]:
|
||||
if not _httpish(needle):
|
||||
continue
|
||||
try:
|
||||
@@ -3415,6 +3655,7 @@ def check_url_exists_in_storage(
|
||||
url_dup_cache["urls"] = sorted(cached_set)
|
||||
preflight_cache["url_duplicates"] = url_dup_cache
|
||||
_store_preflight_cache(preflight_cache)
|
||||
_mark_preflight_checked()
|
||||
return True
|
||||
|
||||
table = ResultTable(f"URL already exists ({len(matched_urls)} url(s))", max_columns=10)
|
||||
@@ -3477,6 +3718,8 @@ def check_url_exists_in_storage(
|
||||
pipeline_context.request_pipeline_stop(reason="duplicate-url declined", exit_code=0)
|
||||
except Exception:
|
||||
pass
|
||||
_mark_preflight_checked()
|
||||
return False
|
||||
_mark_preflight_checked()
|
||||
return True
|
||||
|
||||
|
||||
Reference in New Issue
Block a user