f

2026-01-24 01:38:12 -08:00
parent 4e4c374908
commit 3a4d3f029d
5 changed files with 210 additions and 229 deletions
@@ -1,5 +1,5 @@
 import re
-from typing import Any, Dict, List, Set
+from typing import Any, Dict, List, Optional, Set
 def value_normalize(value: Any) -> str:
@@ -19,6 +19,18 @@ def _add_tag(tags: List[str], namespace: str, value: str) -> None:
        tags.append(candidate)
 def _extract_channel_from_tag(tag_value: str) -> Optional[str]:
    """Return the channel value if tag_value is namespaced with channel."""
    if not tag_value:
        return None
    normalized = tag_value.strip().lower()
    if not normalized.startswith("channel:"):
        return None
    _, _, remainder = normalized.partition(":")
    remainder = remainder.strip()
    return remainder or None
 def extract_ytdlp_tags(entry: Dict[str, Any]) -> List[str]:
    """ """
    tags: List[str] = []
@@ -67,7 +79,12 @@ def extract_ytdlp_tags(entry: Dict[str, Any]) -> List[str]:
            for tag_value in tags_field:
                if tag_value:
                    normalized = value_normalize(str(tag_value))
-                    if normalized and normalized not in tags:
+                    if not normalized:
                        continue
                    channel_candidate = _extract_channel_from_tag(normalized)
                    if channel_candidate:
                        _add_tag(tags, "channel", channel_candidate)
                    if normalized not in tags:
                        tags.append(normalized)
        elif isinstance(tags_field, dict):
            # Tags is dict: {"key": "val"} → tag:key:val
@@ -83,10 +100,16 @@ def extract_ytdlp_tags(entry: Dict[str, Any]) -> List[str]:
            if tag_str:
                for tag_value in re.split(r'[,\s]+', tag_str):
                    tag_value = tag_value.strip()
-                    if tag_value:
+                    if not tag_value:
-                        normalized = value_normalize(tag_value)
+                        continue
-                        if normalized and normalized not in tags:
+                    normalized = value_normalize(tag_value)
-                            tags.append(normalized)
+                    if not normalized:
                        continue
                    channel_candidate = _extract_channel_from_tag(normalized)
                    if channel_candidate:
                        _add_tag(tags, "channel", channel_candidate)
                    if normalized not in tags:
                        tags.append(normalized)
    # Extract chapters as tags if present
    chapters = entry.get("chapters")
@@ -454,6 +454,7 @@ class HydrusNetwork(Store):
            results = storage["hydrus"].search("Simple Man")
        """
        limit = kwargs.get("limit", 100)
        minimal = bool(kwargs.get("minimal", False))
        try:
            client = self._client
@@ -518,6 +519,8 @@ class HydrusNetwork(Store):
                fetch_limit: int,
                scan_limit: int | None = None,
                needles: Optional[Sequence[str]] = None,
                *,
                minimal: bool = False,
            ) -> list[dict[str, Any]]:
                """Best-effort URL search by scanning Hydrus metadata with include_file_url=True."""
@@ -620,19 +623,19 @@ class HydrusNetwork(Store):
                                payload = client.fetch_file_metadata(
                                    hashes=chunk,
                                    include_file_url=True,
-                                    include_service_keys_to_tags=True,
+                                    include_service_keys_to_tags=not minimal,
-                                    include_duration=True,
+                                    include_duration=not minimal,
-                                    include_size=True,
+                                    include_size=not minimal,
-                                    include_mime=True,
+                                    include_mime=not minimal,
                                )
                            else:
                                payload = client.fetch_file_metadata(
                                    file_ids=chunk,
                                    include_file_url=True,
-                                    include_service_keys_to_tags=True,
+                                    include_service_keys_to_tags=not minimal,
-                                    include_duration=True,
+                                    include_duration=not minimal,
-                                    include_size=True,
+                                    include_size=not minimal,
-                                    include_mime=True,
+                                    include_mime=not minimal,
                                )
                        except Exception:
                            continue
@@ -739,12 +742,14 @@ class HydrusNetwork(Store):
                                want_any=False,
                                fetch_limit=int(limit) if limit else 100,
                                needles=pattern_hints,
                                minimal=minimal,
                            )
                        else:
                            metadata_list = _iter_url_filtered_metadata(
                                None,
                                want_any=True,
-                                fetch_limit=int(limit) if limit else 100
+                                fetch_limit=int(limit) if limit else 100,
                                minimal=minimal,
                            )
                    else:
                        def _clean_url_search_token(value: str | None) -> str:
@@ -792,10 +797,10 @@ class HydrusNetwork(Store):
                                    payload = client.fetch_file_metadata(
                                        file_ids=file_ids,
                                        include_file_url=True,
-                                        include_service_keys_to_tags=True,
+                                        include_service_keys_to_tags=not minimal,
-                                        include_duration=True,
+                                        include_duration=not minimal,
-                                        include_size=True,
+                                        include_size=not minimal,
-                                        include_mime=True,
+                                        include_mime=not minimal,
                                    )
                                    metas = (
                                        payload.get("metadata",
@@ -810,10 +815,10 @@ class HydrusNetwork(Store):
                                    payload = client.fetch_file_metadata(
                                        hashes=hashes,
                                        include_file_url=True,
-                                        include_service_keys_to_tags=True,
+                                        include_service_keys_to_tags=not minimal,
-                                        include_duration=True,
+                                        include_duration=not minimal,
-                                        include_size=True,
+                                        include_size=not minimal,
-                                        include_mime=True,
+                                        include_mime=not minimal,
                                    )
                                    metas = (
                                        payload.get("metadata",
@@ -844,6 +849,7 @@ class HydrusNetwork(Store):
                                fetch_limit=int(limit) if limit else 100,
                                scan_limit=scan_limit_override,
                                needles=pattern_hints if pattern_hints else None,
                                minimal=minimal,
                            )
                elif namespace == "system":
                    normalized_system_predicate = pattern.strip()
@@ -857,6 +863,7 @@ class HydrusNetwork(Store):
                            want_any=not bool(pattern_hints),
                            fetch_limit=fetch_limit,
                            needles=pattern_hints if pattern_hints else None,
                            minimal=minimal,
                        )
            # Parse the query into tags
@@ -3253,6 +3253,20 @@ def check_url_exists_in_storage(
        return out
    def _dedupe_needles(raw_needles: Sequence[str]) -> List[str]:
        output: List[str] = []
        seen: set[str] = set()
        for candidate in (raw_needles or []):
            candidate_text = str(candidate or "").strip()
            if not candidate_text:
                continue
            key = candidate_text.lower()
            if key in seen:
                continue
            seen.add(key)
            output.append(candidate_text)
        return output
    url_needles: Dict[str, List[str]] = {}
    for u in unique_urls:
        needles: List[str] = []
@@ -3301,7 +3315,8 @@ def check_url_exists_in_storage(
                    normalized.append(norm_extra)
        combined = filtered + expanded + lowered + normalized
-        url_needles[u] = combined if combined else [u]
+        deduped = _dedupe_needles(combined)
        url_needles[u] = deduped if deduped else [u]
    if in_pipeline:
        preflight_cache = _load_preflight_cache()
@@ -3341,7 +3356,10 @@ def check_url_exists_in_storage(
    if _timed_out("before backend scan"):
        return True
-    bulk_mode = len(unique_urls) > 1
+    # Use bulk mode only if we have a significant number of URLs.
    # For small sets (1-3 URLs), individual targeted searches are faster
    # and more accurate than scanning all files with URLs in the backend.
    bulk_mode = len(unique_urls) > 3
    def _build_bulk_patterns(needles_map: Dict[str, List[str]], max_per_url: int = 3, max_total: int = 240) -> List[str]:
        patterns: List[str] = []
@@ -3455,6 +3473,24 @@ def check_url_exists_in_storage(
        needles: Sequence[str],
    ) -> Optional[Dict[str, Any]]:
        backend_hits: List[Dict[str, Any]] = []
        # 1) Try exact match first (no wildcards).
        # This is extremely fast for Hydrus and others that support direct URL lookup.
        for needle in (needles or [])[:5]:
            needle_stripped = str(needle or "").strip()
            if not needle_stripped or not _httpish(needle_stripped):
                continue
            try:
                # Use 'url:' prefix to ensure storage layers (like Hydrus) recognize it as a URL lookup
                query = f"url:{needle_stripped}"
                backend_hits = backend.search(query, limit=1, minimal=True) or []
                if backend_hits:
                    return _build_display_row_for_hit(backend_hits[0], backend_name, original_url)
            except Exception:
                continue
        # 2) Fallback to wildcard substring search for normalized variants.
        # This is for backends where the URL might be stored differently (partial match).
        for needle in (needles or [])[:3]:
            needle_text = str(needle or "").strip()
            if not needle_text:
@@ -3462,7 +3498,7 @@ def check_url_exists_in_storage(
            search_needle = _normalize_url_for_search(needle_text) or needle_text
            query = f"url:*{search_needle}*"
            try:
-                backend_hits = backend.search(query, limit=1) or []
+                backend_hits = backend.search(query, limit=1, minimal=True) or []
                if backend_hits:
                    break
            except Exception:
@@ -3540,61 +3576,6 @@ def check_url_exists_in_storage(
            if _timed_out("hydrus scan"):
                return True
            if bulk_mode and bulk_patterns:
                bulk_hits: Optional[List[Any]] = None
                bulk_limit = min(2000, max(200, len(unique_urls) * 8))
                try:
                    bulk_hits = backend.search(
                        "url:*",
                        limit=bulk_limit,
                        pattern_hint=bulk_patterns,
                    ) or []
                except Exception:
                    try:
                        bulk_hits = backend.search("url:*", limit=bulk_limit) or []
                    except Exception:
                        bulk_hits = None
                if bulk_hits is None:
                    debug("Bulk URL preflight: Hydrus bulk scan failed; skipping per-URL checks")
                    continue
                for hit in bulk_hits:
                    if _timed_out("hydrus bulk scan"):
                        return True
                    if len(match_rows) >= max_rows:
                        break
                    url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False)
                    if not url_values:
                        continue
                    for original_url, needles in url_needles.items():
                        if _timed_out("hydrus bulk scan"):
                            return True
                        if len(match_rows) >= max_rows:
                            break
                        if (original_url, str(backend_name)) in seen_pairs:
                            continue
                        matched = False
                        for url_value in url_values:
                            for needle in (needles or []):
                                if _match_normalized_url(str(needle or ""), str(url_value or "")):
                                    matched = True
                                    break
                            if matched:
                                break
                        if not matched:
                            continue
                        seen_pairs.add((original_url, str(backend_name)))
                        matched_urls.add(original_url)
                        match_rows.append(
                            _build_display_row_for_hit(hit, str(backend_name), original_url)
                        )
                continue
            for original_url, needles in url_needles.items():
                if _timed_out("hydrus per-url scan"):
                    return True
@@ -3616,7 +3597,6 @@ def check_url_exists_in_storage(
                            endpoint="/add_urls/get_url_files",
                            query={"url": needle},
                        )
                        # Access internal client safely if possible, else skip check
                        if hasattr(client, "_perform_request"):
                            response = client._perform_request(spec)
                            raw_hashes = None
@@ -3638,11 +3618,6 @@ def check_url_exists_in_storage(
                        continue
                if not found:
                    fallback_row = _search_backend_url_hits(backend, str(backend_name), original_url, needles)
                    if fallback_row:
                        seen_pairs.add((original_url, str(backend_name)))
                        matched_urls.add(original_url)
                        match_rows.append(fallback_row)
                    continue
                seen_pairs.add((original_url, str(backend_name)))
@@ -7,6 +7,7 @@ import sys
 import shutil
 import tempfile
 import re
 from urllib.parse import urlparse
 from SYS import models
 from SYS import pipeline as ctx
@@ -14,6 +15,7 @@ from SYS.logger import log, debug, is_debug_enabled
 from SYS.pipeline_progress import PipelineProgress
 from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS
 from Store import Store
 from API.HTTP import _download_direct_file
 from . import _shared as sh
 Cmdlet = sh.Cmdlet
@@ -34,7 +36,7 @@ coerce_to_path = sh.coerce_to_path
 build_pipeline_preview = sh.build_pipeline_preview
 get_field = sh.get_field
-from SYS.utils import sha256_file, unique_path
+from SYS.utils import sha256_file, unique_path, sanitize_filename
 from SYS.metadata import write_metadata
 # Canonical supported filetypes for all stores/cmdlets
@@ -1079,6 +1081,62 @@ class Add_File(Cmdlet):
                pass
        return None, None
    @staticmethod
    def _build_provider_filename(
        pipe_obj: models.PipeObject,
        fallback_hash: Optional[str] = None,
        source_url: Optional[str] = None,
    ) -> str:
        """Build a filename (stem plus extension) for a provider download.

        Stem: the first non-blank title found, checked in this order:
        ``pipe_obj.title``, ``extra["name"]`` / ``extra["title"]``, then
        ``metadata["title"]`` / ``metadata["name"]``.  When no title is
        available the first eight characters of ``fallback_hash`` are used,
        and failing that the literal ``"download"``.  The stem is passed
        through ``sanitize_filename`` (presumably this strips characters that
        are unsafe in filenames; confirm against ``SYS.utils``).

        Extension: ``metadata["ext"]`` / ``metadata["extension"]``, then
        ``extra["ext"]``, then the suffix of the path component of
        ``source_url``.  It is reduced to alphanumerics plus ``._-`` before
        being appended, and is skipped when the stem already ends with it
        (case-insensitive) or when nothing usable remains.

        Args:
            pipe_obj: Object whose ``title``, ``extra`` and ``metadata``
                attributes are read via ``getattr``; missing attributes are
                tolerated.
            fallback_hash: Optional hash used for the stem when no title exists.
            source_url: Optional URL used as the last source for the extension.

        Returns:
            The resulting filename, never empty.
        """
        extra = getattr(pipe_obj, "extra", {})
        metadata = getattr(pipe_obj, "metadata", {})
        extra_map = extra if isinstance(extra, dict) else {}
        meta_map = metadata if isinstance(metadata, dict) else {}

        raw_titles = (
            getattr(pipe_obj, "title", ""),
            extra_map.get("name") or extra_map.get("title"),
            meta_map.get("title") or meta_map.get("name"),
        )
        text = ""
        for raw_title in raw_titles:
            if not raw_title:
                continue
            text = str(raw_title).strip()
            if text:
                break

        if not text and fallback_hash:
            text = fallback_hash[:8]

        safe_name = sanitize_filename(text or "download")

        ext = (
            meta_map.get("ext")
            or meta_map.get("extension")
            or extra_map.get("ext")
            or ""
        )
        if not ext and source_url:
            try:
                ext = Path(urlparse(source_url).path).suffix
            except Exception:
                # Deliberately best-effort: a malformed URL just means the
                # filename carries no extension.
                ext = ""

        # The extension comes from provider metadata or a URL and is appended
        # AFTER the stem was sanitized, so it must be cleaned separately:
        # otherwise path separators, whitespace or other unsafe characters in
        # ``ext`` would reach the filename handed to the downloader.
        ext_clean = "".join(
            ch for ch in str(ext) if ch.isalnum() or ch in "._-"
        ).lstrip(".")
        if ext_clean:
            suffix = f".{ext_clean}"
            if not safe_name.lower().endswith(suffix.lower()):
                safe_name = f"{safe_name}{suffix}"

        return safe_name or "download"
    @staticmethod
    def _resolve_backend_by_name(store: Any, backend_name: str) -> Optional[Any]:
        if not store or not backend_name:
@@ -1219,6 +1277,32 @@ class Add_File(Cmdlet):
                )
                if dl_path and dl_path.exists():
                    return dl_path, str(r_hash), tmp_dir
                source_url = str(source).strip()
                if source_url.lower().startswith(("http://", "https://")):
                    download_dir = Path(tempfile.mkdtemp(prefix="add-file-src-"))
                    try:
                        filename = Add_File._build_provider_filename(
                            pipe_obj,
                            str(r_hash),
                            source_url,
                        )
                        downloaded = _download_direct_file(
                            source_url,
                            download_dir,
                            quiet=True,
                            suggested_filename=filename,
                        )
                        downloaded_path = downloaded.path
                        if downloaded_path and downloaded_path.exists():
                            pipe_obj.is_temp = True
                            pipe_obj.path = str(downloaded_path)
                            return downloaded_path, str(r_hash), download_dir
                    except Exception as exc:
                        debug(f"[add-file] Provider download failed: {exc}")
                    try:
                        shutil.rmtree(download_dir, ignore_errors=True)
                    except Exception:
                        pass
        except Exception:
            pass
@@ -241,95 +241,32 @@ class Get_Url(Cmdlet):
        return None
    @staticmethod
-    def _resolve_title_for_hash(backend: Any, file_hash: str, hit: Any = None) -> str:
+    def _extract_size_from_hit(hit: Any) -> int | None:
-        """Best-effort title resolution for a found hash.
+        for key in ("size", "file_size", "filesize", "size_bytes"):
-
+            try:
-        Strategy:
+                val = get_field(hit, key)
-        - Use the hit's existing title/columns when present.
+            except Exception:
-        - Prefer backend.get_metadata(hash) when available (direct lookup).
+                val = None
-        - Fallback to backend.search('hash:<sha>', limit=1) and read title.
+            if val is None:
-        """
+                continue
-        try:
+            if isinstance(val, (int, float)):
-            if hit is not None:
+                return int(val)
-                from_hit = Get_Url._extract_title_from_result(hit)
+            try:
-                if from_hit:
+                return int(val)
-                    return from_hit
+            except Exception:
-        except Exception:
+                continue
-            pass
+        return None
        try:
            if hasattr(backend, "get_metadata"):
                meta = backend.get_metadata(file_hash)
                if isinstance(meta, dict):
                    t = meta.get("title")
                    if isinstance(t, str) and t.strip():
                        return t.strip()
        except Exception:
            pass
        try:
            if hasattr(backend, "search"):
                hits = backend.search(f"hash:{file_hash}", limit=1)
                if isinstance(hits, list) and hits:
                    t2 = Get_Url._extract_title_from_result(hits[0])
                    if t2:
                        return t2
        except Exception:
            pass
        return ""
    @staticmethod
-    def _resolve_size_ext_for_hash(backend: Any, file_hash: str, hit: Any = None) -> tuple[int | None, str]:
+    def _extract_ext_from_hit(hit: Any) -> str:
-        """Best-effort (size, ext) resolution for a found hash."""
+        for key in ("ext", "extension"):
-        # First: see if the hit already includes these fields.
+            try:
-        try:
+                ext_val = get_field(hit, key)
-            size_val = get_field(hit, "size")
+            except Exception:
-            if size_val is None:
+                ext_val = None
-                size_val = get_field(hit, "file_size")
+            if isinstance(ext_val, str) and ext_val.strip():
-            if size_val is None:
+                return ext_val.strip().lstrip(".")
-                size_val = get_field(hit, "filesize")
+        return ""
            if size_val is None:
                size_val = get_field(hit, "size_bytes")
            size_int = int(size_val) if isinstance(size_val, (int, float)) else None
        except Exception:
            size_int = None
        try:
            ext_val = get_field(hit, "ext")
            if ext_val is None:
                ext_val = get_field(hit, "extension")
            ext = str(ext_val).strip().lstrip(".") if isinstance(ext_val, str) else ""
        except Exception:
            ext = ""
        if size_int is not None or ext:
            return size_int, ext
        # Next: backend.get_metadata(hash) when available.
        try:
            if hasattr(backend, "get_metadata"):
                meta = backend.get_metadata(file_hash)
                if isinstance(meta, dict):
                    size_val2 = meta.get("size")
                    if size_val2 is None:
                        size_val2 = meta.get("file_size")
                    if size_val2 is None:
                        size_val2 = meta.get("filesize")
                    if size_val2 is None:
                        size_val2 = meta.get("size_bytes")
                    if isinstance(size_val2, (int, float)):
                        size_int = int(size_val2)
                    ext_val2 = meta.get("ext")
                    if ext_val2 is None:
                        ext_val2 = meta.get("extension")
                    if isinstance(ext_val2, str) and ext_val2.strip():
                        ext = ext_val2.strip().lstrip(".")
        except Exception:
            pass
        return size_int, ext
    def _search_urls_across_stores(self,
                                   pattern: str,
@@ -360,9 +297,6 @@ class Get_Url(Cmdlet):
                try:
                    backend = storage[store_name]
                    title_cache: Dict[str, str] = {}
                    meta_cache: Dict[str, tuple[int | None, str]] = {}
                    # Search only URL-bearing records using the backend's URL search capability.
                    # This avoids the expensive/incorrect "search('*')" scan.
                    try:
@@ -431,22 +365,12 @@ class Get_Url(Cmdlet):
                            search_limit,
                            store_name,
                            pattern_hint=target_pattern,
                            minimal=True,
                        )
                        if search_results is None:
                            continue
                        search_results = search_results or []
                        if not search_results and target_pattern and not has_wildcards:
                            fallback_results = self._execute_search_with_timeout(
                                backend,
                                "url:*",
                                search_limit,
                                store_name,
                                pattern_hint=target_pattern,
                            )
                            if fallback_results is None:
                                continue
                            search_results = fallback_results or []
                        for hit in (search_results or []):
                            if len(items) >= MAX_RESULTS:
@@ -459,44 +383,9 @@ class Get_Url(Cmdlet):
                            file_hash = str(file_hash)
-                            title = title_cache.get(file_hash, "")
+                            title = self._extract_title_from_result(hit) or ""
-                            if not title:
+                            size = self._extract_size_from_hit(hit)
-                                try:
+                            ext = self._extract_ext_from_hit(hit)
                                    title = (
                                        get_field(hit, "title")
                                        or get_field(hit, "name")
                                        or get_field(hit, "file_title")
                                        or ""
                                    )
                                except Exception:
                                    title = ""
                            if not title:
                                title = self._resolve_title_for_hash(backend, file_hash, hit)
                            title_cache[file_hash] = title
                            size, ext = meta_cache.get(file_hash, (None, ""))
                            if size is None and not ext:
                                try:
                                    size = get_field(hit, "size")
                                    if size is None:
                                        size = get_field(hit, "size_bytes")
                                    if size is None:
                                        size = get_field(hit, "file_size")
                                    if size is None:
                                        size = get_field(hit, "filesize")
                                    size = int(size) if isinstance(size, (int, float)) else None
                                except Exception:
                                    size = None
                                try:
                                    ext = get_field(hit, "ext") or get_field(hit, "extension")
                                    ext = str(ext).strip().lstrip(".") if isinstance(ext, str) else ""
                                except Exception:
                                    ext = ""
                            if size is None and not ext:
                                size, ext = self._resolve_size_ext_for_hash(backend, file_hash, hit)
                            meta_cache[file_hash] = (size, ext)
                            urls = self._extract_urls_from_hit(hit)
                            if not urls:
@@ -505,6 +394,7 @@ class Get_Url(Cmdlet):
                                except Exception:
                                    urls = []
                            hit_added = False
                            for url in (urls or []):
                                if len(items) >= MAX_RESULTS:
                                    break
@@ -526,7 +416,9 @@ class Get_Url(Cmdlet):
                                        ext=str(ext or ""),
                                    )
                                )
-                            found_stores.add(str(store_name))
+                                hit_added = True
                            if hit_added:
                                found_stores.add(str(store_name))
                            if len(items) >= MAX_RESULTS:
                                break
                    except Exception as exc: