This commit is contained in:
2026-01-18 03:18:48 -08:00
parent 3f874af54a
commit aa675a625a
8 changed files with 508 additions and 161 deletions

View File

@@ -9,6 +9,7 @@ import re
import shutil
import sys
import tempfile
import time
from collections.abc import Iterable as IterableABC
from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
@@ -2640,6 +2641,9 @@ def propagate_metadata(
is_same_length = len(new_items) == len(prev_normalized)
for i, item in enumerate(new_items):
if isinstance(item, dict) and item.get("_skip_metadata_propagation"):
normalized.append(item)
continue
try:
obj = coerce_to_pipe_object(item)
except Exception:
@@ -3058,6 +3062,9 @@ def check_url_exists_in_storage(
stage_ctx = None
in_pipeline = bool(stage_ctx is not None or ("|" in str(current_cmd_text or "")))
start_time = time.monotonic()
time_budget = 45.0
debug(f"[preflight] check_url_exists_in_storage: checking {len(urls)} url(s)")
if in_pipeline:
try:
already_checked = bool(
@@ -3101,6 +3108,18 @@ def check_url_exists_in_storage(
preflight_cache["url_duplicates"] = url_dup_cache
_store_preflight_cache(preflight_cache)
def _timed_out(reason: str) -> bool:
    """Report whether the preflight time budget has been used up.

    Compares the monotonic time elapsed since ``start_time`` against
    ``time_budget`` (both taken from the enclosing scope). Once the budget is
    exceeded, a debug line is emitted and the preflight is marked as checked
    before ``True`` is returned.

    Args:
        reason: Short label of the phase that hit the checkpoint; it is only
            interpolated into the debug message.

    Returns:
        ``True`` when the budget is exhausted, ``False`` otherwise.
    """
    if (time.monotonic() - start_time) < time_budget:
        return False

    # The logging/marking below is best-effort bookkeeping. It must not be
    # able to flip the answer back to "not timed out": if it did, every later
    # checkpoint would fail the same way and the budget would never apply.
    try:
        debug(
            f"Bulk URL preflight timed out after {time_budget:.0f}s ({reason}); continuing"
        )
        _mark_preflight_checked()
    except Exception:
        pass
    return True
if in_pipeline:
try:
cached_cmd = pipeline_context.load_value("preflight.url_duplicates.command", default="")
@@ -3358,7 +3377,10 @@ def check_url_exists_in_storage(
_mark_preflight_checked()
return True
bulk_mode = len(unique_urls) >= 8
if _timed_out("before backend scan"):
return True
bulk_mode = len(unique_urls) > 1
def _build_bulk_patterns(needles_map: Dict[str, List[str]], max_per_url: int = 3, max_total: int = 240) -> List[str]:
patterns: List[str] = []
@@ -3562,12 +3584,16 @@ def check_url_exists_in_storage(
HydrusNetwork = None # type: ignore
for backend_name in backend_names:
if _timed_out("backend scan"):
return True
if len(match_rows) >= max_rows:
break
try:
backend = storage[backend_name]
except Exception:
continue
debug(f"[preflight] Scanning backend: {backend_name}")
if HydrusNetwork is not None and isinstance(backend, HydrusNetwork):
client = getattr(backend, "_client", None)
@@ -3576,6 +3602,9 @@ def check_url_exists_in_storage(
if not hydrus_available:
debug("Bulk URL preflight: hydrus availability check failed; attempting best-effort lookup")
if _timed_out("hydrus scan"):
return True
if bulk_mode and bulk_patterns:
bulk_hits: Optional[List[Any]] = None
bulk_limit = min(2000, max(200, len(unique_urls) * 8))
@@ -3591,40 +3620,49 @@ def check_url_exists_in_storage(
except Exception:
bulk_hits = None
if bulk_hits is not None:
for hit in bulk_hits:
if len(match_rows) >= max_rows:
break
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False)
if not url_values:
continue
for original_url, needles in url_needles.items():
if len(match_rows) >= max_rows:
break
if (original_url, str(backend_name)) in seen_pairs:
continue
matched = False
for url_value in url_values:
for needle in (needles or []):
if _match_normalized_url(str(needle or ""), str(url_value or "")):
matched = True
break
if matched:
break
if not matched:
continue
seen_pairs.add((original_url, str(backend_name)))
matched_urls.add(original_url)
match_rows.append(
_build_display_row_for_hit(hit, str(backend_name), original_url)
)
if bulk_hits is None:
debug("Bulk URL preflight: Hydrus bulk scan failed; skipping per-URL checks")
continue
for hit in bulk_hits:
if _timed_out("hydrus bulk scan"):
return True
if len(match_rows) >= max_rows:
break
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False)
if not url_values:
continue
for original_url, needles in url_needles.items():
if _timed_out("hydrus bulk scan"):
return True
if len(match_rows) >= max_rows:
break
if (original_url, str(backend_name)) in seen_pairs:
continue
matched = False
for url_value in url_values:
for needle in (needles or []):
if _match_normalized_url(str(needle or ""), str(url_value or "")):
matched = True
break
if matched:
break
if not matched:
continue
seen_pairs.add((original_url, str(backend_name)))
matched_urls.add(original_url)
match_rows.append(
_build_display_row_for_hit(hit, str(backend_name), original_url)
)
continue
for original_url, needles in url_needles.items():
if _timed_out("hydrus per-url scan"):
return True
if len(match_rows) >= max_rows:
break
if (original_url, str(backend_name)) in seen_pairs:
@@ -3705,6 +3743,8 @@ def check_url_exists_in_storage(
if bulk_hits is not None:
for hit in bulk_hits:
if _timed_out("backend bulk scan"):
return True
if len(match_rows) >= max_rows:
break
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False)
@@ -3712,6 +3752,8 @@ def check_url_exists_in_storage(
continue
for original_url, needles in url_needles.items():
if _timed_out("backend bulk scan"):
return True
if len(match_rows) >= max_rows:
break
if (original_url, str(backend_name)) in seen_pairs:
@@ -3737,6 +3779,8 @@ def check_url_exists_in_storage(
continue
for original_url, needles in url_needles.items():
if _timed_out("backend per-url scan"):
return True
if len(match_rows) >= max_rows:
break
if (original_url, str(backend_name)) in seen_pairs:

View File

@@ -387,18 +387,23 @@ class Download_File(Cmdlet):
total_items = len(expanded_items)
processed_items = 0
debug(f"[download-file] Processing {total_items} piped item(s)...")
try:
if total_items:
progress.set_percent(0)
except Exception:
pass
for item in expanded_items:
for idx, item in enumerate(expanded_items, 1):
try:
label = "item"
table = get_field(item, "table")
title = get_field(item, "title")
target = get_field(item, "path") or get_field(item, "url")
debug(f"[download-file] Item {idx}/{total_items}: {title or target or 'unnamed'}")
media_kind = get_field(item, "media_kind")
tags_val = get_field(item, "tag")
tags_list: Optional[List[str]]
@@ -931,15 +936,26 @@ class Download_File(Cmdlet):
@staticmethod
def _init_storage(config: Dict[str, Any]) -> tuple[Optional[Any], bool]:
    """Return ``(storage, hydrus_available)`` for *config*, reusing a cached pair.

    When *config* is a dict that already holds a 2-tuple under
    ``"_storage_cache"``, that tuple is returned unchanged. Otherwise a
    ``Store`` is created and Hydrus availability is probed; on success the
    resulting pair is written back to ``config["_storage_cache"]``. A failed
    initialization is logged and is not cached, so the next call tries again.

    Args:
        config: Configuration mapping handed to ``Store`` and
            ``is_hydrus_available``; when it is a dict it also carries the
            cache entry.

    Returns:
        A tuple of the storage object (``None`` if initialization raised)
        and the Hydrus availability flag. The flag stays at its default of
        ``True`` when the failure happens before the availability probe
        has produced a value.
    """
    config_is_dict = isinstance(config, dict)

    # Cache storage object in config to avoid excessive DB initialization in loops
    if config_is_dict:
        cached = config.get("_storage_cache")
        if isinstance(cached, tuple) and len(cached) == 2:
            return cached  # type: ignore

    storage = None
    hydrus_available = True
    try:
        from Store import Store
        from API.HydrusNetwork import is_hydrus_available

        debug("[download-file] Initializing storage interface...")
        storage = Store(config=config or {}, suppress_debug=True)
        hydrus_available = bool(is_hydrus_available(config or {}))
        # Only a fully initialized pair is worth remembering.
        if config_is_dict:
            config["_storage_cache"] = (storage, hydrus_available)
    except Exception as e:
        # Single handler: a second `except Exception` clause would never run
        # because the first matching clause always wins.
        debug(f"[download-file] Storage initialization error: {e}")
        storage = None
    return storage, hydrus_available
@@ -1052,6 +1068,7 @@ class Download_File(Cmdlet):
@staticmethod
def _canonicalize_url_for_storage(*, requested_url: str, ytdlp_tool: YtDlpTool, playlist_items: Optional[str]) -> str:
if playlist_items:
debug(f"[download-file] Skipping canonicalization for playlist item(s): {playlist_items}")
return str(requested_url)
try:
cf = None
@@ -1061,14 +1078,19 @@ class Download_File(Cmdlet):
cf = str(cookie_path)
except Exception:
cf = None
debug(f"[download-file] Canonicalizing URL: {requested_url}")
pr = probe_url(requested_url, no_playlist=False, timeout_seconds=15, cookiefile=cf)
if isinstance(pr, dict):
for key in ("webpage_url", "original_url", "url", "requested_url"):
value = pr.get(key)
if isinstance(value, str) and value.strip():
return value.strip()
except Exception:
pass
canon = value.strip()
if canon != requested_url:
debug(f"[download-file] Resolved canonical URL: {requested_url} -> {canon}")
return canon
except Exception as e:
debug(f"[download-file] Canonicalization error for {requested_url}: {e}")
return str(requested_url)
@@ -1113,6 +1135,10 @@ class Download_File(Cmdlet):
def _maybe_show_playlist_table(self, *, url: str, ytdlp_tool: YtDlpTool) -> bool:
ctx = pipeline_context.get_stage_context()
if ctx is not None and getattr(ctx, "total_stages", 0) > 1:
return False
try:
cf = self._cookiefile_str(ytdlp_tool)
pr = probe_url(url, no_playlist=False, timeout_seconds=15, cookiefile=cf)
@@ -1240,6 +1266,13 @@ class Download_File(Cmdlet):
args: Sequence[str],
skip_preflight: bool = False,
) -> Optional[int]:
try:
ctx = pipeline_context.get_stage_context()
if ctx is not None and getattr(ctx, "total_stages", 0) > 1:
# In pipelines, skip interactive format tables; require explicit -query format.
return None
except Exception:
pass
if (
mode != "audio"
and not clip_spec
@@ -1415,7 +1448,7 @@ class Download_File(Cmdlet):
for url in supported_url:
try:
debug(f"Processing: {url}")
debug(f"[download-file] Processing URL in loop (1/3 stage 1): {url}")
canonical_url = self._canonicalize_url_for_storage(
requested_url=url,
@@ -1424,6 +1457,7 @@ class Download_File(Cmdlet):
)
if not skip_per_url_preflight:
debug(f"[download-file] Running duplicate preflight for: {canonical_url}")
if not self._preflight_url_duplicate(
storage=storage,
hydrus_available=hydrus_available,
@@ -1431,7 +1465,7 @@ class Download_File(Cmdlet):
candidate_url=canonical_url,
extra_urls=[url],
):
log(f"Skipping download: {url}", file=sys.stderr)
log(f"Skipping download (duplicate found): {url}", file=sys.stderr)
continue
PipelineProgress(pipeline_context).begin_steps(2)
@@ -1510,9 +1544,9 @@ class Download_File(Cmdlet):
)
PipelineProgress(pipeline_context).step("downloading")
debug(f"Starting download with 5-minute timeout...")
debug(f"Starting download for {url} (format: {actual_format or 'default'}) with {download_timeout_seconds}s activity timeout...")
result_obj = _download_with_timeout(opts, timeout_seconds=download_timeout_seconds)
debug(f"Download completed, building pipe object...")
debug(f"Download completed for {url}, building pipe object...")
break
except DownloadError as e:
cause = getattr(e, "__cause__", None)
@@ -1816,14 +1850,21 @@ class Download_File(Cmdlet):
debug(f"Output directory: {final_output_dir}")
try:
PipelineProgress(pipeline_context).ensure_local_ui(
label="download-file",
total_items=len(supported_url),
items_preview=supported_url,
)
except Exception:
pass
# If we are already in a pipeline stage, the parent UI is already handling progress.
# Calling ensure_local_ui can cause re-initialization hangs on some platforms.
if pipeline_context.get_stage_context() is None:
debug("[download-file] Initializing local UI...")
PipelineProgress(pipeline_context).ensure_local_ui(
label="download-file",
total_items=len(supported_url),
items_preview=supported_url,
)
else:
debug("[download-file] Skipping local UI: running inside pipeline stage")
except Exception as e:
debug(f"[download-file] PipelineProgress update error: {e}")
debug("[download-file] Parsing clip and query specs...")
clip_spec = parsed.get("clip")
query_spec = parsed.get("query")
@@ -1914,6 +1955,7 @@ class Download_File(Cmdlet):
if query_format and not query_wants_audio:
try:
debug(f"[download-file] Resolving numeric format for {candidate_url}...")
idx_fmt = self._format_id_for_query_index(query_format, candidate_url, formats_cache, ytdlp_tool)
except ValueError as e:
log(f"Error parsing format selection: {e}", file=sys.stderr)
@@ -1923,6 +1965,7 @@ class Download_File(Cmdlet):
ytdl_format = idx_fmt
if not ytdl_format:
debug(f"[download-file] Checking for playlist at {candidate_url}...")
if self._maybe_show_playlist_table(url=candidate_url, ytdlp_tool=ytdlp_tool):
playlist_selection_handled = True
try:
@@ -1996,6 +2039,7 @@ class Download_File(Cmdlet):
forced_single_format_id = None
forced_single_format_for_batch = False
debug(f"[download-file] Checking if format table should be shown...")
early_ret = self._maybe_show_format_table_for_single_url(
mode=mode,
clip_spec=clip_spec,
@@ -2023,6 +2067,7 @@ class Download_File(Cmdlet):
except Exception:
timeout_seconds = 300
debug(f"[download-file] Proceeding to final download call for {len(supported_url)} URL(s)...")
return self._download_supported_urls(
supported_url=supported_url,
ytdlp_tool=ytdlp_tool,
@@ -2693,14 +2738,17 @@ class Download_File(Cmdlet):
config["_skip_direct_on_streaming_failure"] = True
if isinstance(config, dict) and config.get("_pipeobject_timeout_seconds") is None:
config["_pipeobject_timeout_seconds"] = 60
# Use a generous default for individual items
config["_pipeobject_timeout_seconds"] = 600
successes = 0
failures = 0
last_code = 0
for run_args in selection_runs:
debug(f"[ytdlp] Detected selection args from table selection: {run_args}")
debug(f"[ytdlp] Re-invoking download-file with: {run_args}")
total_selection = len(selection_runs)
debug(f"[download-file] Processing {total_selection} selected item(s) from table...")
for idx, run_args in enumerate(selection_runs, 1):
debug(f"[download-file] Item {idx}/{total_selection}: {run_args}")
debug(f"[download-file] Re-invoking download-file for selected item...")
exit_code = self._run_impl(None, run_args, config)
if exit_code == 0:
successes += 1