This commit is contained in:
2026-01-24 01:38:12 -08:00
parent 4e4c374908
commit 3a4d3f029d
5 changed files with 210 additions and 229 deletions

View File

@@ -3253,6 +3253,20 @@ def check_url_exists_in_storage(
return out
def _dedupe_needles(raw_needles: Sequence[str]) -> List[str]:
output: List[str] = []
seen: set[str] = set()
for candidate in (raw_needles or []):
candidate_text = str(candidate or "").strip()
if not candidate_text:
continue
key = candidate_text.lower()
if key in seen:
continue
seen.add(key)
output.append(candidate_text)
return output
url_needles: Dict[str, List[str]] = {}
for u in unique_urls:
needles: List[str] = []
@@ -3301,7 +3315,8 @@ def check_url_exists_in_storage(
normalized.append(norm_extra)
combined = filtered + expanded + lowered + normalized
url_needles[u] = combined if combined else [u]
deduped = _dedupe_needles(combined)
url_needles[u] = deduped if deduped else [u]
if in_pipeline:
preflight_cache = _load_preflight_cache()
@@ -3341,7 +3356,10 @@ def check_url_exists_in_storage(
if _timed_out("before backend scan"):
return True
bulk_mode = len(unique_urls) > 1
# Use bulk mode only if we have a significant number of URLs.
# For small sets (1-3 URLs), individual targeted searches are faster
# and more accurate than scanning all files with URLs in the backend.
bulk_mode = len(unique_urls) > 3
def _build_bulk_patterns(needles_map: Dict[str, List[str]], max_per_url: int = 3, max_total: int = 240) -> List[str]:
patterns: List[str] = []
@@ -3455,6 +3473,24 @@ def check_url_exists_in_storage(
needles: Sequence[str],
) -> Optional[Dict[str, Any]]:
backend_hits: List[Dict[str, Any]] = []
# 1) Try exact match first (no wildcards).
# This is extremely fast for Hydrus and others that support direct URL lookup.
for needle in (needles or [])[:5]:
needle_stripped = str(needle or "").strip()
if not needle_stripped or not _httpish(needle_stripped):
continue
try:
# Use 'url:' prefix to ensure storage layers (like Hydrus) recognize it as a URL lookup
query = f"url:{needle_stripped}"
backend_hits = backend.search(query, limit=1, minimal=True) or []
if backend_hits:
return _build_display_row_for_hit(backend_hits[0], backend_name, original_url)
except Exception:
continue
# 2) Fallback to wildcard substring search for normalized variants.
# This is for backends where the URL might be stored differently (partial match).
for needle in (needles or [])[:3]:
needle_text = str(needle or "").strip()
if not needle_text:
@@ -3462,7 +3498,7 @@ def check_url_exists_in_storage(
search_needle = _normalize_url_for_search(needle_text) or needle_text
query = f"url:*{search_needle}*"
try:
backend_hits = backend.search(query, limit=1) or []
backend_hits = backend.search(query, limit=1, minimal=True) or []
if backend_hits:
break
except Exception:
@@ -3540,61 +3576,6 @@ def check_url_exists_in_storage(
if _timed_out("hydrus scan"):
return True
if bulk_mode and bulk_patterns:
bulk_hits: Optional[List[Any]] = None
bulk_limit = min(2000, max(200, len(unique_urls) * 8))
try:
bulk_hits = backend.search(
"url:*",
limit=bulk_limit,
pattern_hint=bulk_patterns,
) or []
except Exception:
try:
bulk_hits = backend.search("url:*", limit=bulk_limit) or []
except Exception:
bulk_hits = None
if bulk_hits is None:
debug("Bulk URL preflight: Hydrus bulk scan failed; skipping per-URL checks")
continue
for hit in bulk_hits:
if _timed_out("hydrus bulk scan"):
return True
if len(match_rows) >= max_rows:
break
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False)
if not url_values:
continue
for original_url, needles in url_needles.items():
if _timed_out("hydrus bulk scan"):
return True
if len(match_rows) >= max_rows:
break
if (original_url, str(backend_name)) in seen_pairs:
continue
matched = False
for url_value in url_values:
for needle in (needles or []):
if _match_normalized_url(str(needle or ""), str(url_value or "")):
matched = True
break
if matched:
break
if not matched:
continue
seen_pairs.add((original_url, str(backend_name)))
matched_urls.add(original_url)
match_rows.append(
_build_display_row_for_hit(hit, str(backend_name), original_url)
)
continue
for original_url, needles in url_needles.items():
if _timed_out("hydrus per-url scan"):
return True
@@ -3616,8 +3597,7 @@ def check_url_exists_in_storage(
endpoint="/add_urls/get_url_files",
query={"url": needle},
)
# Access internal client safely if possible, else skip check
if hasattr(client, "_perform_request"):
if hasattr(client, "_perform_request"):
response = client._perform_request(spec)
raw_hashes = None
if isinstance(response, dict):
@@ -3638,11 +3618,6 @@ def check_url_exists_in_storage(
continue
if not found:
fallback_row = _search_backend_url_hits(backend, str(backend_name), original_url, needles)
if fallback_row:
seen_pairs.add((original_url, str(backend_name)))
matched_urls.add(original_url)
match_rows.append(fallback_row)
continue
seen_pairs.add((original_url, str(backend_name)))

View File

@@ -7,6 +7,7 @@ import sys
import shutil
import tempfile
import re
from urllib.parse import urlparse
from SYS import models
from SYS import pipeline as ctx
@@ -14,6 +15,7 @@ from SYS.logger import log, debug, is_debug_enabled
from SYS.pipeline_progress import PipelineProgress
from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS
from Store import Store
from API.HTTP import _download_direct_file
from . import _shared as sh
Cmdlet = sh.Cmdlet
@@ -34,7 +36,7 @@ coerce_to_path = sh.coerce_to_path
build_pipeline_preview = sh.build_pipeline_preview
get_field = sh.get_field
from SYS.utils import sha256_file, unique_path
from SYS.utils import sha256_file, unique_path, sanitize_filename
from SYS.metadata import write_metadata
# Canonical supported filetypes for all stores/cmdlets
@@ -1079,6 +1081,62 @@ class Add_File(Cmdlet):
pass
return None, None
@staticmethod
def _build_provider_filename(
    pipe_obj: models.PipeObject,
    fallback_hash: Optional[str] = None,
    source_url: Optional[str] = None,
) -> str:
    """Derive a sanitized filename for a provider download.

    Title is taken from the first non-blank candidate among the pipe
    object's ``title``, ``extra['name'|'title']``, and
    ``metadata['title'|'name']``; otherwise the first 8 chars of
    *fallback_hash*, otherwise the literal "download". An extension is
    appended from metadata/extra, or parsed from *source_url*'s path.
    """
    extra = getattr(pipe_obj, "extra", {})
    metadata = getattr(pipe_obj, "metadata", {})

    candidates: List[str] = []
    direct_title = getattr(pipe_obj, "title", "")
    if direct_title:
        candidates.append(str(direct_title))
    if isinstance(extra, dict):
        extra_title = extra.get("name") or extra.get("title")
        if extra_title:
            candidates.append(str(extra_title))
    if isinstance(metadata, dict):
        meta_title = metadata.get("title") or metadata.get("name")
        if meta_title:
            candidates.append(str(meta_title))

    # First candidate that is non-blank after stripping wins.
    text = next((c.strip() for c in candidates if c and c.strip()), "")
    if not text and fallback_hash:
        text = fallback_hash[:8]
    safe_name = sanitize_filename(text or "download")

    ext = ""
    if isinstance(metadata, dict):
        ext = metadata.get("ext") or metadata.get("extension") or ""
    if not ext and isinstance(extra, dict):
        ext = extra.get("ext") or ""
    if not ext and source_url:
        try:
            ext = Path(urlparse(source_url).path).suffix.lstrip(".")
        except Exception:
            ext = ""

    if ext:
        suffix = str(ext)
        if not suffix.startswith("."):
            suffix = "." + suffix.lstrip(".")
        # Avoid doubling the extension when the title already carries it.
        if not safe_name.lower().endswith(suffix.lower()):
            safe_name = f"{safe_name}{suffix}"
    return safe_name or "download"
@staticmethod
def _resolve_backend_by_name(store: Any, backend_name: str) -> Optional[Any]:
if not store or not backend_name:
@@ -1219,6 +1277,32 @@ class Add_File(Cmdlet):
)
if dl_path and dl_path.exists():
return dl_path, str(r_hash), tmp_dir
source_url = str(source).strip()
if source_url.lower().startswith(("http://", "https://")):
download_dir = Path(tempfile.mkdtemp(prefix="add-file-src-"))
try:
filename = Add_File._build_provider_filename(
pipe_obj,
str(r_hash),
source_url,
)
downloaded = _download_direct_file(
source_url,
download_dir,
quiet=True,
suggested_filename=filename,
)
downloaded_path = downloaded.path
if downloaded_path and downloaded_path.exists():
pipe_obj.is_temp = True
pipe_obj.path = str(downloaded_path)
return downloaded_path, str(r_hash), download_dir
except Exception as exc:
debug(f"[add-file] Provider download failed: {exc}")
try:
shutil.rmtree(download_dir, ignore_errors=True)
except Exception:
pass
except Exception:
pass

View File

@@ -241,95 +241,32 @@ class Get_Url(Cmdlet):
return None
@staticmethod
def _resolve_title_for_hash(backend: Any, file_hash: str, hit: Any = None) -> str:
    """Best-effort title resolution for a found hash.

    Tries, in order: the hit's own title fields, the backend's direct
    ``get_metadata(hash)`` lookup, and finally a ``hash:<sha>`` search.
    Returns "" when no strategy yields a non-empty title; every step is
    best-effort and swallows its own failures.
    """
    # 1) The hit itself may already carry a usable title.
    try:
        if hit is not None:
            hit_title = Get_Url._extract_title_from_result(hit)
            if hit_title:
                return hit_title
    except Exception:
        pass
    # 2) Direct metadata lookup, when the backend supports it.
    try:
        if hasattr(backend, "get_metadata"):
            meta = backend.get_metadata(file_hash)
            if isinstance(meta, dict):
                title_value = meta.get("title")
                if isinstance(title_value, str) and title_value.strip():
                    return title_value.strip()
    except Exception:
        pass
    # 3) Last resort: search for the hash and read the first result.
    try:
        if hasattr(backend, "search"):
            results = backend.search(f"hash:{file_hash}", limit=1)
            if isinstance(results, list) and results:
                found_title = Get_Url._extract_title_from_result(results[0])
                if found_title:
                    return found_title
    except Exception:
        pass
    return ""
def _extract_size_from_hit(hit: Any) -> int | None:
    """Pull a byte-size from a search hit, trying the common field names.

    Returns the first field that converts to ``int``, or ``None`` when no
    size-like field is present or convertible.
    """
    size_keys = ("size", "file_size", "filesize", "size_bytes")
    for size_key in size_keys:
        try:
            raw = get_field(hit, size_key)
        except Exception:
            raw = None
        if raw is None:
            continue
        # Numeric values convert directly (NOTE: int(nan/inf) would raise
        # here, matching the original's unguarded numeric branch).
        if isinstance(raw, (int, float)):
            return int(raw)
        # Strings and other types: best-effort conversion, else next key.
        try:
            return int(raw)
        except Exception:
            pass
    return None
@staticmethod
def _resolve_size_ext_for_hash(backend: Any, file_hash: str, hit: Any = None) -> tuple[int | None, str]:
    """Best-effort (size, ext) resolution for a found hash."""
    # First: see if the hit already includes these fields.
    # NOTE: one try around the whole size chain — if any get_field call
    # raises, the remaining size fallbacks are skipped and size_int is None.
    try:
        size_val = get_field(hit, "size")
        if size_val is None:
            size_val = get_field(hit, "file_size")
        if size_val is None:
            size_val = get_field(hit, "filesize")
        if size_val is None:
            size_val = get_field(hit, "size_bytes")
        # Only numeric values count; strings are ignored here (unlike the
        # metadata fallback below, which also only accepts numerics).
        size_int = int(size_val) if isinstance(size_val, (int, float)) else None
    except Exception:
        size_int = None
    try:
        ext_val = get_field(hit, "ext")
        if ext_val is None:
            ext_val = get_field(hit, "extension")
        # Normalize to a bare extension without a leading dot.
        ext = str(ext_val).strip().lstrip(".") if isinstance(ext_val, str) else ""
    except Exception:
        ext = ""
    # Either field found on the hit is enough to skip the backend call.
    if size_int is not None or ext:
        return size_int, ext
    # Next: backend.get_metadata(hash) when available.
    try:
        if hasattr(backend, "get_metadata"):
            meta = backend.get_metadata(file_hash)
            if isinstance(meta, dict):
                size_val2 = meta.get("size")
                if size_val2 is None:
                    size_val2 = meta.get("file_size")
                if size_val2 is None:
                    size_val2 = meta.get("filesize")
                if size_val2 is None:
                    size_val2 = meta.get("size_bytes")
                if isinstance(size_val2, (int, float)):
                    size_int = int(size_val2)
                ext_val2 = meta.get("ext")
                if ext_val2 is None:
                    ext_val2 = meta.get("extension")
                if isinstance(ext_val2, str) and ext_val2.strip():
                    ext = ext_val2.strip().lstrip(".")
    except Exception:
        # Best-effort: a failing metadata lookup leaves (None, "").
        pass
    return size_int, ext
def _extract_ext_from_hit(hit: Any) -> str:
    """Pull a normalized extension (no leading dot) from a search hit.

    Checks the "ext" then "extension" fields; returns "" when neither
    holds a non-blank string.
    """
    for field_name in ("ext", "extension"):
        try:
            value = get_field(hit, field_name)
        except Exception:
            value = None
        if isinstance(value, str) and value.strip():
            return value.strip().lstrip(".")
    return ""
def _search_urls_across_stores(self,
pattern: str,
@@ -360,9 +297,6 @@ class Get_Url(Cmdlet):
try:
backend = storage[store_name]
title_cache: Dict[str, str] = {}
meta_cache: Dict[str, tuple[int | None, str]] = {}
# Search only URL-bearing records using the backend's URL search capability.
# This avoids the expensive/incorrect "search('*')" scan.
try:
@@ -431,22 +365,12 @@ class Get_Url(Cmdlet):
search_limit,
store_name,
pattern_hint=target_pattern,
minimal=True,
)
if search_results is None:
continue
search_results = search_results or []
if not search_results and target_pattern and not has_wildcards:
fallback_results = self._execute_search_with_timeout(
backend,
"url:*",
search_limit,
store_name,
pattern_hint=target_pattern,
)
if fallback_results is None:
continue
search_results = fallback_results or []
for hit in (search_results or []):
if len(items) >= MAX_RESULTS:
@@ -459,44 +383,9 @@ class Get_Url(Cmdlet):
file_hash = str(file_hash)
title = title_cache.get(file_hash, "")
if not title:
try:
title = (
get_field(hit, "title")
or get_field(hit, "name")
or get_field(hit, "file_title")
or ""
)
except Exception:
title = ""
if not title:
title = self._resolve_title_for_hash(backend, file_hash, hit)
title_cache[file_hash] = title
size, ext = meta_cache.get(file_hash, (None, ""))
if size is None and not ext:
try:
size = get_field(hit, "size")
if size is None:
size = get_field(hit, "size_bytes")
if size is None:
size = get_field(hit, "file_size")
if size is None:
size = get_field(hit, "filesize")
size = int(size) if isinstance(size, (int, float)) else None
except Exception:
size = None
try:
ext = get_field(hit, "ext") or get_field(hit, "extension")
ext = str(ext).strip().lstrip(".") if isinstance(ext, str) else ""
except Exception:
ext = ""
if size is None and not ext:
size, ext = self._resolve_size_ext_for_hash(backend, file_hash, hit)
meta_cache[file_hash] = (size, ext)
title = self._extract_title_from_result(hit) or ""
size = self._extract_size_from_hit(hit)
ext = self._extract_ext_from_hit(hit)
urls = self._extract_urls_from_hit(hit)
if not urls:
@@ -505,6 +394,7 @@ class Get_Url(Cmdlet):
except Exception:
urls = []
hit_added = False
for url in (urls or []):
if len(items) >= MAX_RESULTS:
break
@@ -526,7 +416,9 @@ class Get_Url(Cmdlet):
ext=str(ext or ""),
)
)
found_stores.add(str(store_name))
hit_added = True
if hit_added:
found_stores.add(str(store_name))
if len(items) >= MAX_RESULTS:
break
except Exception as exc: