f
This commit is contained in:
@@ -3358,6 +3358,113 @@ def check_url_exists_in_storage(
|
||||
_mark_preflight_checked()
|
||||
return True
|
||||
|
||||
bulk_mode = len(unique_urls) >= 8
|
||||
|
||||
def _build_bulk_patterns(needles_map: Dict[str, List[str]], max_per_url: int = 3, max_total: int = 240) -> List[str]:
|
||||
patterns: List[str] = []
|
||||
for _original, needles in needles_map.items():
|
||||
for needle in (needles or [])[:max_per_url]:
|
||||
needle_text = str(needle or "").strip()
|
||||
if not needle_text:
|
||||
continue
|
||||
if needle_text not in patterns:
|
||||
patterns.append(needle_text)
|
||||
if len(patterns) >= max_total:
|
||||
return patterns
|
||||
return patterns
|
||||
|
||||
bulk_patterns = _build_bulk_patterns(url_needles)
|
||||
|
||||
def _match_normalized_url(pattern_text: str, candidate_url: str) -> bool:
    """Report whether the normalized pattern equals or occurs inside the normalized candidate.

    Both arguments are passed through ``_normalize_url_for_search`` first.
    If either normalized value is falsy the result is ``False``.
    """
    needle = _normalize_url_for_search(pattern_text)
    haystack = _normalize_url_for_search(candidate_url)
    if not (needle and haystack):
        return False
    # Exact equality first, then substring containment.
    return needle == haystack or needle in haystack
|
||||
|
||||
def _extract_urls_from_hit(
    hit: Any,
    backend: Any,
    *,
    allow_backend_lookup: bool = True,
) -> List[str]:
    """Gather the URL strings attached to a search hit.

    The hit's ``known_urls``, ``urls`` and ``url`` fields are tried in that
    order via ``get_field``; the first truthy one is used. When nothing is
    found and ``allow_backend_lookup`` is true, the hit's hash (``hash``,
    ``file_hash`` or ``sha256``) is passed to ``backend.get_url``.

    Args:
        hit: Search result object read through ``get_field``.
        backend: Object providing ``get_url(hash)``; only used for the fallback.
        allow_backend_lookup: When false, the ``backend.get_url`` fallback is skipped.

    Returns:
        Stripped, non-empty URL strings (possibly an empty list).
    """

    def _append_clean(source: Any, sink: List[str]) -> None:
        # Accept a single string or a list/tuple/set of values; keep only
        # non-blank strings, stripped of surrounding whitespace.
        if isinstance(source, str):
            entries = [source]
        elif isinstance(source, (list, tuple, set)):
            entries = source
        else:
            return
        for entry in entries:
            if isinstance(entry, str):
                trimmed = entry.strip()
                if trimmed:
                    sink.append(trimmed)

    found: List[str] = []
    try:
        raw = get_field(hit, "known_urls") or get_field(hit, "urls") or get_field(hit, "url")
        _append_clean(raw, found)
    except Exception:
        # Any failure while reading the hit discards partial results.
        found = []

    if not allow_backend_lookup or found:
        return found

    try:
        digest = get_field(hit, "hash") or get_field(hit, "file_hash") or get_field(hit, "sha256") or ""
    except Exception:
        digest = ""

    if not digest:
        return found

    try:
        _append_clean(backend.get_url(str(digest)), found)
    except Exception:
        # Best-effort lookup: keep whatever was collected before the failure.
        pass

    return found
|
||||
|
||||
def _build_display_row_for_hit(
    hit: Any,
    backend_name: str,
    original_url: str,
) -> Dict[str, Any]:
    """Build the display-row dict describing a stored item that matched a URL.

    Args:
        hit: Search result object, read through ``build_display_row`` and ``get_field``.
        backend_name: Used as the store name when the hit has no ``store`` field.
        original_url: The URL this row is reported for; copied into the row as-is.

    Returns:
        A dict with ``title``, ``store``, ``hash``, ``ext``, ``size``, ``url``
        and a ``columns`` list of ``(label, value)`` tuples mirroring them.
    """
    try:
        from SYS.result_table import build_display_row

        extracted = build_display_row(hit, keys=["title", "store", "hash", "ext", "size"])
    except Exception:
        extracted = {}

    # Normalise once: a non-dict result would otherwise raise inside the
    # lookups below and throw away the hit's own title/hash fallbacks.
    if not isinstance(extracted, dict):
        extracted = {}

    try:
        title = (
            extracted.get("title")
            or get_field(hit, "title")
            or get_field(hit, "name")
            or get_field(hit, "target")
            or get_field(hit, "path")
            or "(exists)"
        )
    except Exception:
        title = "(exists)"

    try:
        file_hash = (
            extracted.get("hash")
            or get_field(hit, "hash")
            or get_field(hit, "file_hash")
            or get_field(hit, "sha256")
            or ""
        )
    except Exception:
        file_hash = ""

    ext = extracted.get("ext")
    size_val = extracted.get("size")

    # Compute each rendered value once so the flat keys and the column
    # tuples cannot drift apart.
    title_text = str(title)
    store_text = str(get_field(hit, "store") or backend_name)
    hash_text = str(file_hash or "")
    ext_text = str(ext or "")

    return {
        "title": title_text,
        "store": store_text,
        "hash": hash_text,
        "ext": ext_text,
        "size": size_val,
        "url": original_url,
        "columns": [
            ("Title", title_text),
            ("Store", store_text),
            ("Hash", hash_text),
            ("Ext", ext_text),
            ("Size", size_val),
            ("URL", original_url),
        ],
    }
|
||||
|
||||
def _search_backend_url_hits(
|
||||
backend: Any,
|
||||
backend_name: str,
|
||||
@@ -3379,15 +3486,6 @@ def check_url_exists_in_storage(
|
||||
continue
|
||||
|
||||
if not backend_hits:
|
||||
def _match_normalized_url(pattern_text: str, candidate_url: str) -> bool:
|
||||
pattern_norm = _normalize_url_for_search(pattern_text)
|
||||
candidate_norm = _normalize_url_for_search(candidate_url)
|
||||
if not pattern_norm or not candidate_norm:
|
||||
return False
|
||||
if pattern_norm == candidate_norm:
|
||||
return True
|
||||
return pattern_norm in candidate_norm
|
||||
|
||||
fallback_hits: List[Dict[str, Any]] = []
|
||||
try:
|
||||
fallback_hits = backend.search("url:*", limit=200) or []
|
||||
@@ -3395,31 +3493,7 @@ def check_url_exists_in_storage(
|
||||
fallback_hits = []
|
||||
|
||||
for hit in fallback_hits:
|
||||
url_values: List[str] = []
|
||||
try:
|
||||
raw_urls = get_field(hit, "known_urls") or get_field(hit, "urls") or get_field(hit, "url")
|
||||
if isinstance(raw_urls, str) and raw_urls.strip():
|
||||
url_values.append(raw_urls.strip())
|
||||
elif isinstance(raw_urls, (list, tuple, set)):
|
||||
for item in raw_urls:
|
||||
if isinstance(item, str) and item.strip():
|
||||
url_values.append(item.strip())
|
||||
except Exception:
|
||||
url_values = []
|
||||
|
||||
if not url_values:
|
||||
try:
|
||||
file_hash = hit.get("hash") if isinstance(hit, dict) else None
|
||||
if file_hash:
|
||||
fetched = backend.get_url(str(file_hash))
|
||||
if isinstance(fetched, str) and fetched.strip():
|
||||
url_values.append(fetched.strip())
|
||||
elif isinstance(fetched, (list, tuple, set)):
|
||||
for item in fetched:
|
||||
if isinstance(item, str) and item.strip():
|
||||
url_values.append(item.strip())
|
||||
except Exception:
|
||||
pass
|
||||
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=True)
|
||||
|
||||
if not url_values:
|
||||
continue
|
||||
@@ -3436,68 +3510,12 @@ def check_url_exists_in_storage(
|
||||
if not matched:
|
||||
continue
|
||||
|
||||
title = "(exists)"
|
||||
try:
|
||||
title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)"
|
||||
except Exception:
|
||||
title = "(exists)"
|
||||
|
||||
file_hash = ""
|
||||
try:
|
||||
file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or ""
|
||||
except Exception:
|
||||
file_hash = ""
|
||||
|
||||
return {
|
||||
"title": str(title),
|
||||
"store": str(hit.get("store") or backend_name),
|
||||
"hash": str(file_hash or ""),
|
||||
"ext": "",
|
||||
"size": None,
|
||||
"url": original_url,
|
||||
"columns": [
|
||||
("Title", str(title)),
|
||||
("Store", str(hit.get("store") or backend_name)),
|
||||
("Hash", str(file_hash or "")),
|
||||
("URL", original_url),
|
||||
],
|
||||
}
|
||||
return _build_display_row_for_hit(hit, backend_name, original_url)
|
||||
|
||||
return None
|
||||
|
||||
hit = backend_hits[0]
|
||||
title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)"
|
||||
file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or ""
|
||||
|
||||
try:
|
||||
from SYS.result_table import build_display_row
|
||||
extracted = build_display_row(hit, keys=["title", "store", "hash", "ext", "size"])
|
||||
except Exception:
|
||||
extracted = {}
|
||||
|
||||
extracted["title"] = str(title)
|
||||
extracted["store"] = str(hit.get("store") or backend_name)
|
||||
extracted["hash"] = str(file_hash or "")
|
||||
|
||||
ext = extracted.get("ext")
|
||||
size_val = extracted.get("size")
|
||||
|
||||
return {
|
||||
"title": str(title),
|
||||
"store": str(hit.get("store") or backend_name),
|
||||
"hash": str(file_hash or ""),
|
||||
"ext": str(ext or ""),
|
||||
"size": size_val,
|
||||
"url": original_url,
|
||||
"columns": [
|
||||
("Title", str(title)),
|
||||
("Store", str(hit.get("store") or backend_name)),
|
||||
("Hash", str(file_hash or "")),
|
||||
("Ext", str(ext or "")),
|
||||
("Size", size_val),
|
||||
("URL", original_url),
|
||||
],
|
||||
}
|
||||
return _build_display_row_for_hit(hit, backend_name, original_url)
|
||||
|
||||
backend_names: List[str] = []
|
||||
try:
|
||||
@@ -3558,6 +3576,54 @@ def check_url_exists_in_storage(
|
||||
if not hydrus_available:
|
||||
debug("Bulk URL preflight: hydrus availability check failed; attempting best-effort lookup")
|
||||
|
||||
if bulk_mode and bulk_patterns:
|
||||
bulk_hits: Optional[List[Any]] = None
|
||||
bulk_limit = min(2000, max(200, len(unique_urls) * 8))
|
||||
try:
|
||||
bulk_hits = backend.search(
|
||||
"url:*",
|
||||
limit=bulk_limit,
|
||||
pattern_hint=bulk_patterns,
|
||||
) or []
|
||||
except Exception:
|
||||
try:
|
||||
bulk_hits = backend.search("url:*", limit=bulk_limit) or []
|
||||
except Exception:
|
||||
bulk_hits = None
|
||||
|
||||
if bulk_hits is not None:
|
||||
for hit in bulk_hits:
|
||||
if len(match_rows) >= max_rows:
|
||||
break
|
||||
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False)
|
||||
if not url_values:
|
||||
continue
|
||||
|
||||
for original_url, needles in url_needles.items():
|
||||
if len(match_rows) >= max_rows:
|
||||
break
|
||||
if (original_url, str(backend_name)) in seen_pairs:
|
||||
continue
|
||||
|
||||
matched = False
|
||||
for url_value in url_values:
|
||||
for needle in (needles or []):
|
||||
if _match_normalized_url(str(needle or ""), str(url_value or "")):
|
||||
matched = True
|
||||
break
|
||||
if matched:
|
||||
break
|
||||
|
||||
if not matched:
|
||||
continue
|
||||
|
||||
seen_pairs.add((original_url, str(backend_name)))
|
||||
matched_urls.add(original_url)
|
||||
match_rows.append(
|
||||
_build_display_row_for_hit(hit, str(backend_name), original_url)
|
||||
)
|
||||
continue
|
||||
|
||||
for original_url, needles in url_needles.items():
|
||||
if len(match_rows) >= max_rows:
|
||||
break
|
||||
@@ -3622,6 +3688,54 @@ def check_url_exists_in_storage(
|
||||
match_rows.append(display_row)
|
||||
continue
|
||||
|
||||
if bulk_mode and bulk_patterns:
|
||||
bulk_hits: Optional[List[Any]] = None
|
||||
bulk_limit = min(2000, max(200, len(unique_urls) * 8))
|
||||
try:
|
||||
bulk_hits = backend.search(
|
||||
"url:*",
|
||||
limit=bulk_limit,
|
||||
pattern_hint=bulk_patterns,
|
||||
) or []
|
||||
except Exception:
|
||||
try:
|
||||
bulk_hits = backend.search("url:*", limit=bulk_limit) or []
|
||||
except Exception:
|
||||
bulk_hits = None
|
||||
|
||||
if bulk_hits is not None:
|
||||
for hit in bulk_hits:
|
||||
if len(match_rows) >= max_rows:
|
||||
break
|
||||
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False)
|
||||
if not url_values:
|
||||
continue
|
||||
|
||||
for original_url, needles in url_needles.items():
|
||||
if len(match_rows) >= max_rows:
|
||||
break
|
||||
if (original_url, str(backend_name)) in seen_pairs:
|
||||
continue
|
||||
|
||||
matched = False
|
||||
for url_value in url_values:
|
||||
for needle in (needles or []):
|
||||
if _match_normalized_url(str(needle or ""), str(url_value or "")):
|
||||
matched = True
|
||||
break
|
||||
if matched:
|
||||
break
|
||||
|
||||
if not matched:
|
||||
continue
|
||||
|
||||
seen_pairs.add((original_url, str(backend_name)))
|
||||
matched_urls.add(original_url)
|
||||
match_rows.append(
|
||||
_build_display_row_for_hit(hit, str(backend_name), original_url)
|
||||
)
|
||||
continue
|
||||
|
||||
for original_url, needles in url_needles.items():
|
||||
if len(match_rows) >= max_rows:
|
||||
break
|
||||
|
||||
Reference in New Issue
Block a user