This commit is contained in:
2026-01-24 01:38:12 -08:00
parent 4e4c374908
commit 3a4d3f029d
5 changed files with 210 additions and 229 deletions

View File

@@ -3253,6 +3253,20 @@ def check_url_exists_in_storage(
return out
def _dedupe_needles(raw_needles: Sequence[str]) -> List[str]:
output: List[str] = []
seen: set[str] = set()
for candidate in (raw_needles or []):
candidate_text = str(candidate or "").strip()
if not candidate_text:
continue
key = candidate_text.lower()
if key in seen:
continue
seen.add(key)
output.append(candidate_text)
return output
url_needles: Dict[str, List[str]] = {}
for u in unique_urls:
needles: List[str] = []
@@ -3301,7 +3315,8 @@ def check_url_exists_in_storage(
normalized.append(norm_extra)
combined = filtered + expanded + lowered + normalized
url_needles[u] = combined if combined else [u]
deduped = _dedupe_needles(combined)
url_needles[u] = deduped if deduped else [u]
if in_pipeline:
preflight_cache = _load_preflight_cache()
@@ -3341,7 +3356,10 @@ def check_url_exists_in_storage(
if _timed_out("before backend scan"):
return True
bulk_mode = len(unique_urls) > 1
# Use bulk mode only if we have a significant number of URLs.
# For small sets (1-3 URLs), individual targeted searches are faster
# and more accurate than scanning all files with URLs in the backend.
bulk_mode = len(unique_urls) > 3
def _build_bulk_patterns(needles_map: Dict[str, List[str]], max_per_url: int = 3, max_total: int = 240) -> List[str]:
patterns: List[str] = []
@@ -3455,6 +3473,24 @@ def check_url_exists_in_storage(
needles: Sequence[str],
) -> Optional[Dict[str, Any]]:
backend_hits: List[Dict[str, Any]] = []
# 1) Try exact match first (no wildcards).
# This is extremely fast for Hydrus and others that support direct URL lookup.
for needle in (needles or [])[:5]:
needle_stripped = str(needle or "").strip()
if not needle_stripped or not _httpish(needle_stripped):
continue
try:
# Use 'url:' prefix to ensure storage layers (like Hydrus) recognize it as a URL lookup
query = f"url:{needle_stripped}"
backend_hits = backend.search(query, limit=1, minimal=True) or []
if backend_hits:
return _build_display_row_for_hit(backend_hits[0], backend_name, original_url)
except Exception:
continue
# 2) Fallback to wildcard substring search for normalized variants.
# This is for backends where the URL might be stored differently (partial match).
for needle in (needles or [])[:3]:
needle_text = str(needle or "").strip()
if not needle_text:
@@ -3462,7 +3498,7 @@ def check_url_exists_in_storage(
search_needle = _normalize_url_for_search(needle_text) or needle_text
query = f"url:*{search_needle}*"
try:
backend_hits = backend.search(query, limit=1) or []
backend_hits = backend.search(query, limit=1, minimal=True) or []
if backend_hits:
break
except Exception:
@@ -3540,61 +3576,6 @@ def check_url_exists_in_storage(
if _timed_out("hydrus scan"):
return True
if bulk_mode and bulk_patterns:
bulk_hits: Optional[List[Any]] = None
bulk_limit = min(2000, max(200, len(unique_urls) * 8))
try:
bulk_hits = backend.search(
"url:*",
limit=bulk_limit,
pattern_hint=bulk_patterns,
) or []
except Exception:
try:
bulk_hits = backend.search("url:*", limit=bulk_limit) or []
except Exception:
bulk_hits = None
if bulk_hits is None:
debug("Bulk URL preflight: Hydrus bulk scan failed; skipping per-URL checks")
continue
for hit in bulk_hits:
if _timed_out("hydrus bulk scan"):
return True
if len(match_rows) >= max_rows:
break
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False)
if not url_values:
continue
for original_url, needles in url_needles.items():
if _timed_out("hydrus bulk scan"):
return True
if len(match_rows) >= max_rows:
break
if (original_url, str(backend_name)) in seen_pairs:
continue
matched = False
for url_value in url_values:
for needle in (needles or []):
if _match_normalized_url(str(needle or ""), str(url_value or "")):
matched = True
break
if matched:
break
if not matched:
continue
seen_pairs.add((original_url, str(backend_name)))
matched_urls.add(original_url)
match_rows.append(
_build_display_row_for_hit(hit, str(backend_name), original_url)
)
continue
for original_url, needles in url_needles.items():
if _timed_out("hydrus per-url scan"):
return True
@@ -3616,8 +3597,7 @@ def check_url_exists_in_storage(
endpoint="/add_urls/get_url_files",
query={"url": needle},
)
# Access internal client safely if possible, else skip check
if hasattr(client, "_perform_request"):
if hasattr(client, "_perform_request"):
response = client._perform_request(spec)
raw_hashes = None
if isinstance(response, dict):
@@ -3638,11 +3618,6 @@ def check_url_exists_in_storage(
continue
if not found:
fallback_row = _search_backend_url_hits(backend, str(backend_name), original_url, needles)
if fallback_row:
seen_pairs.add((original_url, str(backend_name)))
matched_urls.add(original_url)
match_rows.append(fallback_row)
continue
seen_pairs.add((original_url, str(backend_name)))

View File

@@ -7,6 +7,7 @@ import sys
import shutil
import tempfile
import re
from urllib.parse import urlparse
from SYS import models
from SYS import pipeline as ctx
@@ -14,6 +15,7 @@ from SYS.logger import log, debug, is_debug_enabled
from SYS.pipeline_progress import PipelineProgress
from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS
from Store import Store
from API.HTTP import _download_direct_file
from . import _shared as sh
Cmdlet = sh.Cmdlet
@@ -34,7 +36,7 @@ coerce_to_path = sh.coerce_to_path
build_pipeline_preview = sh.build_pipeline_preview
get_field = sh.get_field
from SYS.utils import sha256_file, unique_path
from SYS.utils import sha256_file, unique_path, sanitize_filename
from SYS.metadata import write_metadata
# Canonical supported filetypes for all stores/cmdlets
@@ -1079,6 +1081,62 @@ class Add_File(Cmdlet):
pass
return None, None
@staticmethod
def _build_provider_filename(
    pipe_obj: models.PipeObject,
    fallback_hash: Optional[str] = None,
    source_url: Optional[str] = None,
) -> str:
    """Derive a sanitized filename for a provider download.

    Title is taken from the first non-blank candidate among the pipe
    object's ``title``, ``extra['name'|'title']``, and
    ``metadata['title'|'name']``; otherwise the first 8 chars of
    *fallback_hash*, otherwise the literal "download". An extension is
    appended from metadata/extra, or parsed from *source_url*'s path.
    """
    extra = getattr(pipe_obj, "extra", {})
    metadata = getattr(pipe_obj, "metadata", {})

    candidates: List[str] = []
    direct_title = getattr(pipe_obj, "title", "")
    if direct_title:
        candidates.append(str(direct_title))
    if isinstance(extra, dict):
        extra_title = extra.get("name") or extra.get("title")
        if extra_title:
            candidates.append(str(extra_title))
    if isinstance(metadata, dict):
        meta_title = metadata.get("title") or metadata.get("name")
        if meta_title:
            candidates.append(str(meta_title))

    # First candidate that is non-blank after stripping wins.
    text = next((c.strip() for c in candidates if c and c.strip()), "")
    if not text and fallback_hash:
        text = fallback_hash[:8]
    safe_name = sanitize_filename(text or "download")

    ext = ""
    if isinstance(metadata, dict):
        ext = metadata.get("ext") or metadata.get("extension") or ""
    if not ext and isinstance(extra, dict):
        ext = extra.get("ext") or ""
    if not ext and source_url:
        try:
            ext = Path(urlparse(source_url).path).suffix.lstrip(".")
        except Exception:
            ext = ""

    if ext:
        suffix = str(ext)
        if not suffix.startswith("."):
            suffix = "." + suffix.lstrip(".")
        # Avoid doubling the extension when the title already carries it.
        if not safe_name.lower().endswith(suffix.lower()):
            safe_name = f"{safe_name}{suffix}"
    return safe_name or "download"
@staticmethod
def _resolve_backend_by_name(store: Any, backend_name: str) -> Optional[Any]:
if not store or not backend_name:
@@ -1219,6 +1277,32 @@ class Add_File(Cmdlet):
)
if dl_path and dl_path.exists():
return dl_path, str(r_hash), tmp_dir
source_url = str(source).strip()
if source_url.lower().startswith(("http://", "https://")):
download_dir = Path(tempfile.mkdtemp(prefix="add-file-src-"))
try:
filename = Add_File._build_provider_filename(
pipe_obj,
str(r_hash),
source_url,
)
downloaded = _download_direct_file(
source_url,
download_dir,
quiet=True,
suggested_filename=filename,
)
downloaded_path = downloaded.path
if downloaded_path and downloaded_path.exists():
pipe_obj.is_temp = True
pipe_obj.path = str(downloaded_path)
return downloaded_path, str(r_hash), download_dir
except Exception as exc:
debug(f"[add-file] Provider download failed: {exc}")
try:
shutil.rmtree(download_dir, ignore_errors=True)
except Exception:
pass
except Exception:
pass

View File

@@ -241,95 +241,32 @@ class Get_Url(Cmdlet):
return None
@staticmethod
def _resolve_title_for_hash(backend: Any, file_hash: str, hit: Any = None) -> str:
    """Best-effort title resolution for a found hash.

    Tries, in order: the hit's own title fields, the backend's direct
    ``get_metadata(hash)`` lookup, and finally a ``hash:<sha>`` search.
    Returns "" when no strategy yields a non-empty title; every step is
    best-effort and swallows its own failures.
    """
    # 1) The hit itself may already carry a usable title.
    try:
        if hit is not None:
            hit_title = Get_Url._extract_title_from_result(hit)
            if hit_title:
                return hit_title
    except Exception:
        pass
    # 2) Direct metadata lookup, when the backend supports it.
    try:
        if hasattr(backend, "get_metadata"):
            meta = backend.get_metadata(file_hash)
            if isinstance(meta, dict):
                title_value = meta.get("title")
                if isinstance(title_value, str) and title_value.strip():
                    return title_value.strip()
    except Exception:
        pass
    # 3) Last resort: search for the hash and read the first result.
    try:
        if hasattr(backend, "search"):
            results = backend.search(f"hash:{file_hash}", limit=1)
            if isinstance(results, list) and results:
                found_title = Get_Url._extract_title_from_result(results[0])
                if found_title:
                    return found_title
    except Exception:
        pass
    return ""
def _extract_size_from_hit(hit: Any) -> int | None:
    """Pull a byte-size from a search hit, trying the common field names.

    Returns the first field that converts to ``int``, or ``None`` when no
    size-like field is present or convertible.
    """
    size_keys = ("size", "file_size", "filesize", "size_bytes")
    for size_key in size_keys:
        try:
            raw = get_field(hit, size_key)
        except Exception:
            raw = None
        if raw is None:
            continue
        # Numeric values convert directly (NOTE: int(nan/inf) would raise
        # here, matching the original's unguarded numeric branch).
        if isinstance(raw, (int, float)):
            return int(raw)
        # Strings and other types: best-effort conversion, else next key.
        try:
            return int(raw)
        except Exception:
            pass
    return None
@staticmethod
def _resolve_size_ext_for_hash(backend: Any, file_hash: str, hit: Any = None) -> tuple[int | None, str]:
    """Best-effort (size, ext) resolution for a found hash."""
    # First: see if the hit already includes these fields.
    # NOTE: one try around the whole size chain — if any get_field call
    # raises, the remaining size fallbacks are skipped and size_int is None.
    try:
        size_val = get_field(hit, "size")
        if size_val is None:
            size_val = get_field(hit, "file_size")
        if size_val is None:
            size_val = get_field(hit, "filesize")
        if size_val is None:
            size_val = get_field(hit, "size_bytes")
        # Only numeric values count; strings are ignored here (unlike the
        # metadata fallback below, which also only accepts numerics).
        size_int = int(size_val) if isinstance(size_val, (int, float)) else None
    except Exception:
        size_int = None
    try:
        ext_val = get_field(hit, "ext")
        if ext_val is None:
            ext_val = get_field(hit, "extension")
        # Normalize to a bare extension without a leading dot.
        ext = str(ext_val).strip().lstrip(".") if isinstance(ext_val, str) else ""
    except Exception:
        ext = ""
    # Either field found on the hit is enough to skip the backend call.
    if size_int is not None or ext:
        return size_int, ext
    # Next: backend.get_metadata(hash) when available.
    try:
        if hasattr(backend, "get_metadata"):
            meta = backend.get_metadata(file_hash)
            if isinstance(meta, dict):
                size_val2 = meta.get("size")
                if size_val2 is None:
                    size_val2 = meta.get("file_size")
                if size_val2 is None:
                    size_val2 = meta.get("filesize")
                if size_val2 is None:
                    size_val2 = meta.get("size_bytes")
                if isinstance(size_val2, (int, float)):
                    size_int = int(size_val2)
                ext_val2 = meta.get("ext")
                if ext_val2 is None:
                    ext_val2 = meta.get("extension")
                if isinstance(ext_val2, str) and ext_val2.strip():
                    ext = ext_val2.strip().lstrip(".")
    except Exception:
        # Best-effort: a failing metadata lookup leaves (None, "").
        pass
    return size_int, ext
def _extract_ext_from_hit(hit: Any) -> str:
    """Pull a normalized extension (no leading dot) from a search hit.

    Checks the "ext" then "extension" fields; returns "" when neither
    holds a non-blank string.
    """
    for field_name in ("ext", "extension"):
        try:
            value = get_field(hit, field_name)
        except Exception:
            value = None
        if isinstance(value, str) and value.strip():
            return value.strip().lstrip(".")
    return ""
def _search_urls_across_stores(self,
pattern: str,
@@ -360,9 +297,6 @@ class Get_Url(Cmdlet):
try:
backend = storage[store_name]
title_cache: Dict[str, str] = {}
meta_cache: Dict[str, tuple[int | None, str]] = {}
# Search only URL-bearing records using the backend's URL search capability.
# This avoids the expensive/incorrect "search('*')" scan.
try:
@@ -431,22 +365,12 @@ class Get_Url(Cmdlet):
search_limit,
store_name,
pattern_hint=target_pattern,
minimal=True,
)
if search_results is None:
continue
search_results = search_results or []
if not search_results and target_pattern and not has_wildcards:
fallback_results = self._execute_search_with_timeout(
backend,
"url:*",
search_limit,
store_name,
pattern_hint=target_pattern,
)
if fallback_results is None:
continue
search_results = fallback_results or []
for hit in (search_results or []):
if len(items) >= MAX_RESULTS:
@@ -459,44 +383,9 @@ class Get_Url(Cmdlet):
file_hash = str(file_hash)
title = title_cache.get(file_hash, "")
if not title:
try:
title = (
get_field(hit, "title")
or get_field(hit, "name")
or get_field(hit, "file_title")
or ""
)
except Exception:
title = ""
if not title:
title = self._resolve_title_for_hash(backend, file_hash, hit)
title_cache[file_hash] = title
size, ext = meta_cache.get(file_hash, (None, ""))
if size is None and not ext:
try:
size = get_field(hit, "size")
if size is None:
size = get_field(hit, "size_bytes")
if size is None:
size = get_field(hit, "file_size")
if size is None:
size = get_field(hit, "filesize")
size = int(size) if isinstance(size, (int, float)) else None
except Exception:
size = None
try:
ext = get_field(hit, "ext") or get_field(hit, "extension")
ext = str(ext).strip().lstrip(".") if isinstance(ext, str) else ""
except Exception:
ext = ""
if size is None and not ext:
size, ext = self._resolve_size_ext_for_hash(backend, file_hash, hit)
meta_cache[file_hash] = (size, ext)
title = self._extract_title_from_result(hit) or ""
size = self._extract_size_from_hit(hit)
ext = self._extract_ext_from_hit(hit)
urls = self._extract_urls_from_hit(hit)
if not urls:
@@ -505,6 +394,7 @@ class Get_Url(Cmdlet):
except Exception:
urls = []
hit_added = False
for url in (urls or []):
if len(items) >= MAX_RESULTS:
break
@@ -526,7 +416,9 @@ class Get_Url(Cmdlet):
ext=str(ext or ""),
)
)
found_stores.add(str(store_name))
hit_added = True
if hit_added:
found_stores.add(str(store_name))
if len(items) >= MAX_RESULTS:
break
except Exception as exc: