f
This commit is contained in:
@@ -3253,6 +3253,20 @@ def check_url_exists_in_storage(
|
||||
|
||||
return out
|
||||
|
||||
def _dedupe_needles(raw_needles: Sequence[str]) -> List[str]:
|
||||
output: List[str] = []
|
||||
seen: set[str] = set()
|
||||
for candidate in (raw_needles or []):
|
||||
candidate_text = str(candidate or "").strip()
|
||||
if not candidate_text:
|
||||
continue
|
||||
key = candidate_text.lower()
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
output.append(candidate_text)
|
||||
return output
|
||||
|
||||
url_needles: Dict[str, List[str]] = {}
|
||||
for u in unique_urls:
|
||||
needles: List[str] = []
|
||||
@@ -3301,7 +3315,8 @@ def check_url_exists_in_storage(
|
||||
normalized.append(norm_extra)
|
||||
|
||||
combined = filtered + expanded + lowered + normalized
|
||||
url_needles[u] = combined if combined else [u]
|
||||
deduped = _dedupe_needles(combined)
|
||||
url_needles[u] = deduped if deduped else [u]
|
||||
|
||||
if in_pipeline:
|
||||
preflight_cache = _load_preflight_cache()
|
||||
@@ -3341,7 +3356,10 @@ def check_url_exists_in_storage(
|
||||
if _timed_out("before backend scan"):
|
||||
return True
|
||||
|
||||
bulk_mode = len(unique_urls) > 1
|
||||
# Use bulk mode only if we have a significant number of URLs.
|
||||
# For small sets (1-3 URLs), individual targeted searches are faster
|
||||
# and more accurate than scanning all files with URLs in the backend.
|
||||
bulk_mode = len(unique_urls) > 3
|
||||
|
||||
def _build_bulk_patterns(needles_map: Dict[str, List[str]], max_per_url: int = 3, max_total: int = 240) -> List[str]:
|
||||
patterns: List[str] = []
|
||||
@@ -3455,6 +3473,24 @@ def check_url_exists_in_storage(
|
||||
needles: Sequence[str],
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
backend_hits: List[Dict[str, Any]] = []
|
||||
|
||||
# 1) Try exact match first (no wildcards).
|
||||
# This is extremely fast for Hydrus and others that support direct URL lookup.
|
||||
for needle in (needles or [])[:5]:
|
||||
needle_stripped = str(needle or "").strip()
|
||||
if not needle_stripped or not _httpish(needle_stripped):
|
||||
continue
|
||||
try:
|
||||
# Use 'url:' prefix to ensure storage layers (like Hydrus) recognize it as a URL lookup
|
||||
query = f"url:{needle_stripped}"
|
||||
backend_hits = backend.search(query, limit=1, minimal=True) or []
|
||||
if backend_hits:
|
||||
return _build_display_row_for_hit(backend_hits[0], backend_name, original_url)
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# 2) Fallback to wildcard substring search for normalized variants.
|
||||
# This is for backends where the URL might be stored differently (partial match).
|
||||
for needle in (needles or [])[:3]:
|
||||
needle_text = str(needle or "").strip()
|
||||
if not needle_text:
|
||||
@@ -3462,7 +3498,7 @@ def check_url_exists_in_storage(
|
||||
search_needle = _normalize_url_for_search(needle_text) or needle_text
|
||||
query = f"url:*{search_needle}*"
|
||||
try:
|
||||
backend_hits = backend.search(query, limit=1) or []
|
||||
backend_hits = backend.search(query, limit=1, minimal=True) or []
|
||||
if backend_hits:
|
||||
break
|
||||
except Exception:
|
||||
@@ -3540,61 +3576,6 @@ def check_url_exists_in_storage(
|
||||
if _timed_out("hydrus scan"):
|
||||
return True
|
||||
|
||||
if bulk_mode and bulk_patterns:
|
||||
bulk_hits: Optional[List[Any]] = None
|
||||
bulk_limit = min(2000, max(200, len(unique_urls) * 8))
|
||||
try:
|
||||
bulk_hits = backend.search(
|
||||
"url:*",
|
||||
limit=bulk_limit,
|
||||
pattern_hint=bulk_patterns,
|
||||
) or []
|
||||
except Exception:
|
||||
try:
|
||||
bulk_hits = backend.search("url:*", limit=bulk_limit) or []
|
||||
except Exception:
|
||||
bulk_hits = None
|
||||
|
||||
if bulk_hits is None:
|
||||
debug("Bulk URL preflight: Hydrus bulk scan failed; skipping per-URL checks")
|
||||
continue
|
||||
|
||||
for hit in bulk_hits:
|
||||
if _timed_out("hydrus bulk scan"):
|
||||
return True
|
||||
if len(match_rows) >= max_rows:
|
||||
break
|
||||
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False)
|
||||
if not url_values:
|
||||
continue
|
||||
|
||||
for original_url, needles in url_needles.items():
|
||||
if _timed_out("hydrus bulk scan"):
|
||||
return True
|
||||
if len(match_rows) >= max_rows:
|
||||
break
|
||||
if (original_url, str(backend_name)) in seen_pairs:
|
||||
continue
|
||||
|
||||
matched = False
|
||||
for url_value in url_values:
|
||||
for needle in (needles or []):
|
||||
if _match_normalized_url(str(needle or ""), str(url_value or "")):
|
||||
matched = True
|
||||
break
|
||||
if matched:
|
||||
break
|
||||
|
||||
if not matched:
|
||||
continue
|
||||
|
||||
seen_pairs.add((original_url, str(backend_name)))
|
||||
matched_urls.add(original_url)
|
||||
match_rows.append(
|
||||
_build_display_row_for_hit(hit, str(backend_name), original_url)
|
||||
)
|
||||
continue
|
||||
|
||||
for original_url, needles in url_needles.items():
|
||||
if _timed_out("hydrus per-url scan"):
|
||||
return True
|
||||
@@ -3616,8 +3597,7 @@ def check_url_exists_in_storage(
|
||||
endpoint="/add_urls/get_url_files",
|
||||
query={"url": needle},
|
||||
)
|
||||
# Access internal client safely if possible, else skip check
|
||||
if hasattr(client, "_perform_request"):
|
||||
if hasattr(client, "_perform_request"):
|
||||
response = client._perform_request(spec)
|
||||
raw_hashes = None
|
||||
if isinstance(response, dict):
|
||||
@@ -3638,11 +3618,6 @@ def check_url_exists_in_storage(
|
||||
continue
|
||||
|
||||
if not found:
|
||||
fallback_row = _search_backend_url_hits(backend, str(backend_name), original_url, needles)
|
||||
if fallback_row:
|
||||
seen_pairs.add((original_url, str(backend_name)))
|
||||
matched_urls.add(original_url)
|
||||
match_rows.append(fallback_row)
|
||||
continue
|
||||
|
||||
seen_pairs.add((original_url, str(backend_name)))
|
||||
|
||||
Reference in New Issue
Block a user