This commit is contained in:
2026-01-17 21:32:44 -08:00
parent 193fa5aec3
commit 3f874af54a
4 changed files with 329 additions and 112 deletions

View File

@@ -3358,6 +3358,113 @@ def check_url_exists_in_storage(
_mark_preflight_checked()
return True
bulk_mode = len(unique_urls) >= 8
def _build_bulk_patterns(needles_map: Dict[str, List[str]], max_per_url: int = 3, max_total: int = 240) -> List[str]:
patterns: List[str] = []
for _original, needles in needles_map.items():
for needle in (needles or [])[:max_per_url]:
needle_text = str(needle or "").strip()
if not needle_text:
continue
if needle_text not in patterns:
patterns.append(needle_text)
if len(patterns) >= max_total:
return patterns
return patterns
bulk_patterns = _build_bulk_patterns(url_needles)
def _match_normalized_url(pattern_text: str, candidate_url: str) -> bool:
    """Return True when the normalized pattern equals, or is a substring of, the normalized candidate URL."""
    norm_pattern = _normalize_url_for_search(pattern_text)
    norm_candidate = _normalize_url_for_search(candidate_url)
    # Either normalization coming back empty means there is nothing to match on.
    if norm_pattern and norm_candidate:
        return norm_pattern == norm_candidate or norm_pattern in norm_candidate
    return False
def _extract_urls_from_hit(
    hit: Any,
    backend: Any,
    *,
    allow_backend_lookup: bool = True,
) -> List[str]:
    """Collect URL strings recorded on a search hit.

    Reads ``known_urls`` / ``urls`` / ``url`` fields first; when those yield
    nothing and ``allow_backend_lookup`` is True, falls back to asking the
    backend for URLs by the hit's hash. All failures degrade to returning
    whatever was collected so far (best-effort, never raises).
    """
    collected: List[str] = []
    try:
        raw = get_field(hit, "known_urls") or get_field(hit, "urls") or get_field(hit, "url")
        if isinstance(raw, str):
            if raw.strip():
                collected.append(raw.strip())
        elif isinstance(raw, (list, tuple, set)):
            collected.extend(
                entry.strip()
                for entry in raw
                if isinstance(entry, str) and entry.strip()
            )
    except Exception:
        collected = []
    if collected or not allow_backend_lookup:
        return collected
    try:
        hash_value = get_field(hit, "hash") or get_field(hit, "file_hash") or get_field(hit, "sha256") or ""
    except Exception:
        hash_value = ""
    if not hash_value:
        return collected
    try:
        looked_up = backend.get_url(str(hash_value))
        if isinstance(looked_up, str):
            if looked_up.strip():
                collected.append(looked_up.strip())
        elif isinstance(looked_up, (list, tuple, set)):
            for entry in looked_up:
                if isinstance(entry, str) and entry.strip():
                    collected.append(entry.strip())
    except Exception:
        pass
    return collected
def _build_display_row_for_hit(
    hit: Any,
    backend_name: str,
    original_url: str,
) -> Dict[str, Any]:
    """Build a uniform display row (flat keys plus ordered ``columns``) for a search hit.

    Prefers values extracted by ``SYS.result_table.build_display_row`` and
    falls back to direct field reads; every lookup is exception-guarded so a
    malformed hit still yields a usable row.
    """
    try:
        from SYS.result_table import build_display_row
        extracted = build_display_row(hit, keys=["title", "store", "hash", "ext", "size"])
    except Exception:
        extracted = {}
    if not isinstance(extracted, dict):
        # Normalize once so the .get() calls below are always safe.
        extracted = {}
    try:
        title = extracted.get("title") or get_field(hit, "title") or get_field(hit, "name") or get_field(hit, "target") or get_field(hit, "path") or "(exists)"
    except Exception:
        title = "(exists)"
    try:
        file_hash = extracted.get("hash") or get_field(hit, "hash") or get_field(hit, "file_hash") or get_field(hit, "sha256") or ""
    except Exception:
        file_hash = ""
    # Guard the store lookup like the other fields so one bad hit cannot
    # abort row building; compute it once instead of per-usage.
    try:
        store = str(get_field(hit, "store") or backend_name)
    except Exception:
        store = str(backend_name)
    title_text = str(title)
    hash_text = str(file_hash or "")
    ext_text = str(extracted.get("ext") or "")
    size_val = extracted.get("size")
    return {
        "title": title_text,
        "store": store,
        "hash": hash_text,
        "ext": ext_text,
        "size": size_val,
        "url": original_url,
        "columns": [
            ("Title", title_text),
            ("Store", store),
            ("Hash", hash_text),
            ("Ext", ext_text),
            ("Size", size_val),
            ("URL", original_url),
        ],
    }
def _search_backend_url_hits(
backend: Any,
backend_name: str,
@@ -3379,15 +3486,6 @@ def check_url_exists_in_storage(
continue
if not backend_hits:
def _match_normalized_url(pattern_text: str, candidate_url: str) -> bool:
pattern_norm = _normalize_url_for_search(pattern_text)
candidate_norm = _normalize_url_for_search(candidate_url)
if not pattern_norm or not candidate_norm:
return False
if pattern_norm == candidate_norm:
return True
return pattern_norm in candidate_norm
fallback_hits: List[Dict[str, Any]] = []
try:
fallback_hits = backend.search("url:*", limit=200) or []
@@ -3395,31 +3493,7 @@ def check_url_exists_in_storage(
fallback_hits = []
for hit in fallback_hits:
url_values: List[str] = []
try:
raw_urls = get_field(hit, "known_urls") or get_field(hit, "urls") or get_field(hit, "url")
if isinstance(raw_urls, str) and raw_urls.strip():
url_values.append(raw_urls.strip())
elif isinstance(raw_urls, (list, tuple, set)):
for item in raw_urls:
if isinstance(item, str) and item.strip():
url_values.append(item.strip())
except Exception:
url_values = []
if not url_values:
try:
file_hash = hit.get("hash") if isinstance(hit, dict) else None
if file_hash:
fetched = backend.get_url(str(file_hash))
if isinstance(fetched, str) and fetched.strip():
url_values.append(fetched.strip())
elif isinstance(fetched, (list, tuple, set)):
for item in fetched:
if isinstance(item, str) and item.strip():
url_values.append(item.strip())
except Exception:
pass
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=True)
if not url_values:
continue
@@ -3436,68 +3510,12 @@ def check_url_exists_in_storage(
if not matched:
continue
title = "(exists)"
try:
title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)"
except Exception:
title = "(exists)"
file_hash = ""
try:
file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or ""
except Exception:
file_hash = ""
return {
"title": str(title),
"store": str(hit.get("store") or backend_name),
"hash": str(file_hash or ""),
"ext": "",
"size": None,
"url": original_url,
"columns": [
("Title", str(title)),
("Store", str(hit.get("store") or backend_name)),
("Hash", str(file_hash or "")),
("URL", original_url),
],
}
return _build_display_row_for_hit(hit, backend_name, original_url)
return None
hit = backend_hits[0]
title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)"
file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or ""
try:
from SYS.result_table import build_display_row
extracted = build_display_row(hit, keys=["title", "store", "hash", "ext", "size"])
except Exception:
extracted = {}
extracted["title"] = str(title)
extracted["store"] = str(hit.get("store") or backend_name)
extracted["hash"] = str(file_hash or "")
ext = extracted.get("ext")
size_val = extracted.get("size")
return {
"title": str(title),
"store": str(hit.get("store") or backend_name),
"hash": str(file_hash or ""),
"ext": str(ext or ""),
"size": size_val,
"url": original_url,
"columns": [
("Title", str(title)),
("Store", str(hit.get("store") or backend_name)),
("Hash", str(file_hash or "")),
("Ext", str(ext or "")),
("Size", size_val),
("URL", original_url),
],
}
return _build_display_row_for_hit(hit, backend_name, original_url)
backend_names: List[str] = []
try:
@@ -3558,6 +3576,54 @@ def check_url_exists_in_storage(
if not hydrus_available:
debug("Bulk URL preflight: hydrus availability check failed; attempting best-effort lookup")
if bulk_mode and bulk_patterns:
bulk_hits: Optional[List[Any]] = None
bulk_limit = min(2000, max(200, len(unique_urls) * 8))
try:
bulk_hits = backend.search(
"url:*",
limit=bulk_limit,
pattern_hint=bulk_patterns,
) or []
except Exception:
try:
bulk_hits = backend.search("url:*", limit=bulk_limit) or []
except Exception:
bulk_hits = None
if bulk_hits is not None:
for hit in bulk_hits:
if len(match_rows) >= max_rows:
break
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False)
if not url_values:
continue
for original_url, needles in url_needles.items():
if len(match_rows) >= max_rows:
break
if (original_url, str(backend_name)) in seen_pairs:
continue
matched = False
for url_value in url_values:
for needle in (needles or []):
if _match_normalized_url(str(needle or ""), str(url_value or "")):
matched = True
break
if matched:
break
if not matched:
continue
seen_pairs.add((original_url, str(backend_name)))
matched_urls.add(original_url)
match_rows.append(
_build_display_row_for_hit(hit, str(backend_name), original_url)
)
continue
for original_url, needles in url_needles.items():
if len(match_rows) >= max_rows:
break
@@ -3622,6 +3688,54 @@ def check_url_exists_in_storage(
match_rows.append(display_row)
continue
if bulk_mode and bulk_patterns:
bulk_hits: Optional[List[Any]] = None
bulk_limit = min(2000, max(200, len(unique_urls) * 8))
try:
bulk_hits = backend.search(
"url:*",
limit=bulk_limit,
pattern_hint=bulk_patterns,
) or []
except Exception:
try:
bulk_hits = backend.search("url:*", limit=bulk_limit) or []
except Exception:
bulk_hits = None
if bulk_hits is not None:
for hit in bulk_hits:
if len(match_rows) >= max_rows:
break
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False)
if not url_values:
continue
for original_url, needles in url_needles.items():
if len(match_rows) >= max_rows:
break
if (original_url, str(backend_name)) in seen_pairs:
continue
matched = False
for url_value in url_values:
for needle in (needles or []):
if _match_normalized_url(str(needle or ""), str(url_value or "")):
matched = True
break
if matched:
break
if not matched:
continue
seen_pairs.add((original_url, str(backend_name)))
matched_urls.add(original_url)
match_rows.append(
_build_display_row_for_hit(hit, str(backend_name), original_url)
)
continue
for original_url, needles in url_needles.items():
if len(match_rows) >= max_rows:
break