d
This commit is contained in:
@@ -9,6 +9,7 @@ import re
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
from collections.abc import Iterable as IterableABC
|
||||
from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
|
||||
|
||||
@@ -2640,6 +2641,9 @@ def propagate_metadata(
|
||||
is_same_length = len(new_items) == len(prev_normalized)
|
||||
|
||||
for i, item in enumerate(new_items):
|
||||
if isinstance(item, dict) and item.get("_skip_metadata_propagation"):
|
||||
normalized.append(item)
|
||||
continue
|
||||
try:
|
||||
obj = coerce_to_pipe_object(item)
|
||||
except Exception:
|
||||
@@ -3058,6 +3062,9 @@ def check_url_exists_in_storage(
|
||||
stage_ctx = None
|
||||
|
||||
in_pipeline = bool(stage_ctx is not None or ("|" in str(current_cmd_text or "")))
|
||||
start_time = time.monotonic()
|
||||
time_budget = 45.0
|
||||
debug(f"[preflight] check_url_exists_in_storage: checking {len(urls)} url(s)")
|
||||
if in_pipeline:
|
||||
try:
|
||||
already_checked = bool(
|
||||
@@ -3101,6 +3108,18 @@ def check_url_exists_in_storage(
|
||||
preflight_cache["url_duplicates"] = url_dup_cache
|
||||
_store_preflight_cache(preflight_cache)
|
||||
|
||||
def _timed_out(reason: str) -> bool:
    """Report whether the preflight time budget has been used up.

    Compares the monotonic time elapsed since ``start_time`` against
    ``time_budget`` (both read from the enclosing scope). When the budget
    is exhausted, a debug line naming *reason* is emitted and the preflight
    is marked as checked before ``True`` is returned.

    Args:
        reason: Short label for the scan phase; interpolated into the
            debug message.

    Returns:
        ``True`` once the elapsed time reaches the budget, else ``False``.
    """
    try:
        expired = (time.monotonic() - start_time) >= time_budget
    except Exception:
        # If the elapsed time cannot be computed, keep the previous
        # behaviour and report "not timed out".
        return False

    if not expired:
        return False

    # Logging and marking are best-effort: a failure here must not hide the
    # fact that the budget is exhausted, otherwise callers would keep
    # scanning and the time limit would never take effect.
    try:
        debug(
            f"Bulk URL preflight timed out after {time_budget:.0f}s ({reason}); continuing"
        )
        _mark_preflight_checked()
    except Exception:
        pass
    return True
|
||||
|
||||
if in_pipeline:
|
||||
try:
|
||||
cached_cmd = pipeline_context.load_value("preflight.url_duplicates.command", default="")
|
||||
@@ -3358,7 +3377,10 @@ def check_url_exists_in_storage(
|
||||
_mark_preflight_checked()
|
||||
return True
|
||||
|
||||
bulk_mode = len(unique_urls) >= 8
|
||||
if _timed_out("before backend scan"):
|
||||
return True
|
||||
|
||||
bulk_mode = len(unique_urls) > 1
|
||||
|
||||
def _build_bulk_patterns(needles_map: Dict[str, List[str]], max_per_url: int = 3, max_total: int = 240) -> List[str]:
|
||||
patterns: List[str] = []
|
||||
@@ -3562,12 +3584,16 @@ def check_url_exists_in_storage(
|
||||
HydrusNetwork = None # type: ignore
|
||||
|
||||
for backend_name in backend_names:
|
||||
if _timed_out("backend scan"):
|
||||
return True
|
||||
if len(match_rows) >= max_rows:
|
||||
break
|
||||
try:
|
||||
backend = storage[backend_name]
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
debug(f"[preflight] Scanning backend: {backend_name}")
|
||||
|
||||
if HydrusNetwork is not None and isinstance(backend, HydrusNetwork):
|
||||
client = getattr(backend, "_client", None)
|
||||
@@ -3576,6 +3602,9 @@ def check_url_exists_in_storage(
|
||||
if not hydrus_available:
|
||||
debug("Bulk URL preflight: hydrus availability check failed; attempting best-effort lookup")
|
||||
|
||||
if _timed_out("hydrus scan"):
|
||||
return True
|
||||
|
||||
if bulk_mode and bulk_patterns:
|
||||
bulk_hits: Optional[List[Any]] = None
|
||||
bulk_limit = min(2000, max(200, len(unique_urls) * 8))
|
||||
@@ -3591,40 +3620,49 @@ def check_url_exists_in_storage(
|
||||
except Exception:
|
||||
bulk_hits = None
|
||||
|
||||
if bulk_hits is not None:
|
||||
for hit in bulk_hits:
|
||||
if len(match_rows) >= max_rows:
|
||||
break
|
||||
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False)
|
||||
if not url_values:
|
||||
continue
|
||||
|
||||
for original_url, needles in url_needles.items():
|
||||
if len(match_rows) >= max_rows:
|
||||
break
|
||||
if (original_url, str(backend_name)) in seen_pairs:
|
||||
continue
|
||||
|
||||
matched = False
|
||||
for url_value in url_values:
|
||||
for needle in (needles or []):
|
||||
if _match_normalized_url(str(needle or ""), str(url_value or "")):
|
||||
matched = True
|
||||
break
|
||||
if matched:
|
||||
break
|
||||
|
||||
if not matched:
|
||||
continue
|
||||
|
||||
seen_pairs.add((original_url, str(backend_name)))
|
||||
matched_urls.add(original_url)
|
||||
match_rows.append(
|
||||
_build_display_row_for_hit(hit, str(backend_name), original_url)
|
||||
)
|
||||
if bulk_hits is None:
|
||||
debug("Bulk URL preflight: Hydrus bulk scan failed; skipping per-URL checks")
|
||||
continue
|
||||
|
||||
for hit in bulk_hits:
|
||||
if _timed_out("hydrus bulk scan"):
|
||||
return True
|
||||
if len(match_rows) >= max_rows:
|
||||
break
|
||||
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False)
|
||||
if not url_values:
|
||||
continue
|
||||
|
||||
for original_url, needles in url_needles.items():
|
||||
if _timed_out("hydrus bulk scan"):
|
||||
return True
|
||||
if len(match_rows) >= max_rows:
|
||||
break
|
||||
if (original_url, str(backend_name)) in seen_pairs:
|
||||
continue
|
||||
|
||||
matched = False
|
||||
for url_value in url_values:
|
||||
for needle in (needles or []):
|
||||
if _match_normalized_url(str(needle or ""), str(url_value or "")):
|
||||
matched = True
|
||||
break
|
||||
if matched:
|
||||
break
|
||||
|
||||
if not matched:
|
||||
continue
|
||||
|
||||
seen_pairs.add((original_url, str(backend_name)))
|
||||
matched_urls.add(original_url)
|
||||
match_rows.append(
|
||||
_build_display_row_for_hit(hit, str(backend_name), original_url)
|
||||
)
|
||||
continue
|
||||
|
||||
for original_url, needles in url_needles.items():
|
||||
if _timed_out("hydrus per-url scan"):
|
||||
return True
|
||||
if len(match_rows) >= max_rows:
|
||||
break
|
||||
if (original_url, str(backend_name)) in seen_pairs:
|
||||
@@ -3705,6 +3743,8 @@ def check_url_exists_in_storage(
|
||||
|
||||
if bulk_hits is not None:
|
||||
for hit in bulk_hits:
|
||||
if _timed_out("backend bulk scan"):
|
||||
return True
|
||||
if len(match_rows) >= max_rows:
|
||||
break
|
||||
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False)
|
||||
@@ -3712,6 +3752,8 @@ def check_url_exists_in_storage(
|
||||
continue
|
||||
|
||||
for original_url, needles in url_needles.items():
|
||||
if _timed_out("backend bulk scan"):
|
||||
return True
|
||||
if len(match_rows) >= max_rows:
|
||||
break
|
||||
if (original_url, str(backend_name)) in seen_pairs:
|
||||
@@ -3737,6 +3779,8 @@ def check_url_exists_in_storage(
|
||||
continue
|
||||
|
||||
for original_url, needles in url_needles.items():
|
||||
if _timed_out("backend per-url scan"):
|
||||
return True
|
||||
if len(match_rows) >= max_rows:
|
||||
break
|
||||
if (original_url, str(backend_name)) in seen_pairs:
|
||||
|
||||
Reference in New Issue
Block a user