This commit is contained in:
2026-01-16 01:47:00 -08:00
parent 41e95d0360
commit 12436e5a6a
4 changed files with 492 additions and 130 deletions

View File

@@ -4,7 +4,7 @@ import json
import re
import shutil
import sys
from fnmatch import translate
from fnmatch import fnmatch, translate
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
@@ -30,6 +30,28 @@ def _resolve_file_hash(db_hash: Optional[str], file_path: Path) -> Optional[str]
return _normalize_hash(file_path.stem)
def _normalize_url_for_search(url: str) -> str:
value = str(url or "").strip()
value = re.sub(r"^[a-z][a-z0-9+.-]*://", "", value, flags=re.IGNORECASE)
value = re.sub(r"^www\.", "", value, flags=re.IGNORECASE)
return value.lower()
def _match_url_pattern(url: str, pattern: str) -> bool:
    """Return True when *url* matches *pattern*.

    Matching is tried in order: glob (if the pattern contains wildcards),
    exact match ignoring a trailing slash, then plain substring containment.
    An empty pattern never matches.
    """
    haystack = _normalize_url_for_search(url)
    needle = _normalize_url_for_search(pattern)
    if not needle:
        return False
    if "*" in needle or "?" in needle:
        return fnmatch(haystack, needle)
    trimmed_needle = needle.rstrip("/")
    if trimmed_needle and trimmed_needle == haystack.rstrip("/"):
        return True
    return needle in haystack
class Folder(Store):
""""""
@@ -690,6 +712,12 @@ class Folder(Store):
match_all = query == "*" or (not query and bool(ext_filter))
results = []
search_dir = expand_path(self._location)
backend_label = str(
getattr(self, "_name", "") or getattr(self, "NAME", "") or "folder"
)
debug(
f"[folder:{backend_label}] search start: query={query} limit={limit} root={search_dir}"
)
def _url_like_pattern(value: str) -> str:
# Interpret user patterns as substring matches (with optional glob wildcards).
@@ -1002,7 +1030,7 @@ class Folder(Store):
namespace, pattern = query.split(":", 1)
namespace = namespace.strip().lower()
pattern = pattern.strip().lower()
debug(f"Performing namespace search: {namespace}:{pattern}")
debug(f"[folder:{backend_label}] namespace search: {namespace}:{pattern}")
if namespace == "hash":
normalized_hash = _normalize_hash(pattern)
@@ -1041,14 +1069,50 @@ class Folder(Store):
return results
if namespace == "url":
pattern_hint = kwargs.get("pattern_hint")
def _parse_url_value(raw: Any) -> list[str]:
if raw is None:
return []
if isinstance(raw, list):
return [str(u).strip() for u in raw if str(u).strip()]
if isinstance(raw, str):
text = raw.strip()
if not text:
return []
try:
parsed = json.loads(text)
if isinstance(parsed, list):
return [
str(u).strip()
for u in parsed
if str(u).strip()
]
except Exception:
pass
return [text]
return []
def _matches_pattern(url_list: list[str]) -> bool:
    """True when no pattern hint is set, or any URL in *url_list* matches it."""
    # An absent hint means every file qualifies for the url: search.
    if not pattern_hint:
        return True
    return any(_match_url_pattern(candidate, pattern_hint) for candidate in url_list)
if not pattern or pattern == "*":
debug(f"[folder:{backend_label}] url search: any-url (limit={limit})")
rows = api.get_files_with_any_url(limit)
else:
debug(
f"[folder:{backend_label}] url search: like={pattern} (limit={limit})"
)
rows = api.get_files_by_url_like(
_url_like_pattern(pattern),
limit
)
for file_hash, file_path_str, size_bytes, ext in rows:
for file_hash, file_path_str, size_bytes, ext, url_raw in rows:
if not file_path_str:
continue
file_path = search_dir / str(file_path_str)
@@ -1059,6 +1123,9 @@ class Folder(Store):
size_bytes = file_path.stat().st_size
except OSError:
size_bytes = None
urls = _parse_url_value(url_raw)
if not urls or not _matches_pattern(urls):
continue
tags = api.get_tags_for_file(file_hash)
entry = _create_entry(
file_path,
@@ -1066,6 +1133,7 @@ class Folder(Store):
size_bytes,
file_hash
)
entry["urls"] = urls
results.append(entry)
if limit is not None and len(results) >= limit:
return results

View File

@@ -466,7 +466,9 @@ class HydrusNetwork(Store):
def _extract_urls(meta_obj: Any) -> list[str]:
if not isinstance(meta_obj, dict):
return []
raw = meta_obj.get("url")
raw = meta_obj.get("known_urls")
if raw is None:
raw = meta_obj.get("url")
if raw is None:
raw = meta_obj.get("urls")
if isinstance(raw, str):
@@ -483,100 +485,178 @@ class HydrusNetwork(Store):
return out
return []
def _extract_search_ids(payload: Any) -> tuple[list[int], list[str]]:
if not isinstance(payload, dict):
return [], []
raw_ids = payload.get("file_ids", [])
raw_hashes = payload.get("hashes", [])
ids_out: list[int] = []
hashes_out: list[str] = []
if isinstance(raw_ids, list):
for item in raw_ids:
try:
if isinstance(item, (int, float)):
ids_out.append(int(item))
continue
if isinstance(item, str) and item.strip().isdigit():
ids_out.append(int(item.strip()))
except Exception:
continue
if isinstance(raw_hashes, list):
for item in raw_hashes:
try:
candidate = str(item or "").strip().lower()
if candidate:
hashes_out.append(candidate)
except Exception:
continue
return ids_out, hashes_out
def _iter_url_filtered_metadata(
url_value: str | None,
want_any: bool,
fetch_limit: int
) -> list[dict[str,
Any]]:
fetch_limit: int,
scan_limit: int | None = None
) -> list[dict[str, Any]]:
"""Best-effort URL search by scanning Hydrus metadata with include_file_url=True."""
# First try a fast system predicate if Hydrus supports it.
candidate_file_ids: list[int] = []
try:
if want_any:
candidate_hashes: list[str] = []
seen_file_ids: set[int] = set()
seen_hashes: set[str] = set()
def _add_candidates(ids: list[int], hashes: list[str]) -> None:
for fid in ids:
if fid in seen_file_ids:
continue
seen_file_ids.add(fid)
candidate_file_ids.append(fid)
for hh in hashes:
if hh in seen_hashes:
continue
seen_hashes.add(hh)
candidate_hashes.append(hh)
predicate_supported = getattr(self, "_has_url_predicate", None)
if predicate_supported is not False:
try:
predicate = "system:has url"
url_search = client.search_files(
tags=[predicate],
return_hashes=False,
return_file_ids=True,
return_hashes=True,
return_file_ids=False,
return_file_count=False,
)
ids = url_search.get("file_ids",
[]) if isinstance(url_search,
dict) else []
if isinstance(ids, list):
candidate_file_ids = [
int(x) for x in ids
if isinstance(x, (int, float,
str)) and str(x).strip().isdigit()
]
except Exception:
candidate_file_ids = []
ids, hashes = _extract_search_ids(url_search)
_add_candidates(ids, hashes)
self._has_url_predicate = True
except Exception as exc:
try:
from API.HydrusNetwork import HydrusRequestError
if not candidate_file_ids:
# Fallback: scan from system:everything and filter by URL substring.
if isinstance(exc, HydrusRequestError) and getattr(exc, "status", None) == 400:
self._has_url_predicate = False
except Exception:
pass
if not candidate_file_ids and not candidate_hashes:
everything = client.search_files(
tags=["system:everything"],
return_hashes=False,
return_file_ids=True,
return_hashes=True,
return_file_ids=False,
return_file_count=False,
)
ids = everything.get("file_ids",
[]) if isinstance(everything,
dict) else []
if isinstance(ids, list):
candidate_file_ids = [
int(x) for x in ids if isinstance(x, (int, float))
]
ids, hashes = _extract_search_ids(everything)
_add_candidates(ids, hashes)
if not candidate_file_ids:
if not candidate_file_ids and not candidate_hashes:
return []
needle = (url_value or "").strip().lower()
chunk_size = 200
out: list[dict[str, Any]] = []
if scan_limit is None:
try:
if not want_any and url_value:
scan_limit = max(200, min(int(fetch_limit), 400))
else:
scan_limit = max(int(fetch_limit) * 5, 1000)
except Exception:
scan_limit = 400 if (not want_any and url_value) else 1000
if scan_limit is not None:
scan_limit = min(int(scan_limit), 10000)
scanned = 0
for start in range(0, len(candidate_file_ids), chunk_size):
def _process_source(items: list[Any], kind: str) -> None:
nonlocal scanned
for start in range(0, len(items), chunk_size):
if len(out) >= fetch_limit:
return
if scan_limit is not None and scanned >= scan_limit:
return
chunk = items[start:start + chunk_size]
if scan_limit is not None:
remaining = scan_limit - scanned
if remaining <= 0:
return
if len(chunk) > remaining:
chunk = chunk[:remaining]
scanned += len(chunk)
try:
if kind == "hashes":
payload = client.fetch_file_metadata(
hashes=chunk,
include_file_url=True,
include_service_keys_to_tags=True,
include_duration=True,
include_size=True,
include_mime=True,
)
else:
payload = client.fetch_file_metadata(
file_ids=chunk,
include_file_url=True,
include_service_keys_to_tags=True,
include_duration=True,
include_size=True,
include_mime=True,
)
except Exception:
continue
metas = payload.get("metadata",
[]) if isinstance(payload,
dict) else []
if not isinstance(metas, list):
continue
for meta in metas:
if len(out) >= fetch_limit:
break
if not isinstance(meta, dict):
continue
urls = _extract_urls(meta)
if not urls:
continue
if want_any:
out.append(meta)
continue
if not needle:
continue
if any(needle in u.lower() for u in urls):
out.append(meta)
continue
sources: list[tuple[str, list[Any]]] = []
if candidate_hashes:
sources.append(("hashes", candidate_hashes))
elif candidate_file_ids:
sources.append(("file_ids", candidate_file_ids))
for kind, items in sources:
if len(out) >= fetch_limit:
break
chunk = candidate_file_ids[start:start + chunk_size]
try:
payload = client.fetch_file_metadata(
file_ids=chunk,
include_file_url=True,
include_service_keys_to_tags=True,
include_duration=True,
include_size=True,
include_mime=True,
)
except Exception:
continue
metas = payload.get("metadata",
[]) if isinstance(payload,
dict) else []
if not isinstance(metas, list):
continue
for meta in metas:
if not isinstance(meta, dict):
continue
urls = _extract_urls(meta)
if not urls:
continue
if want_any:
out.append(meta)
if len(out) >= fetch_limit:
break
continue
if not needle:
continue
if any(needle in u.lower() for u in urls):
out.append(meta)
if len(out) >= fetch_limit:
break
_process_source(items, kind)
return out
@@ -618,6 +698,7 @@ class HydrusNetwork(Store):
# Special case: url:* and url:<value>
metadata_list: list[dict[str, Any]] | None = None
pattern_hint = str(kwargs.get("pattern_hint") or "").strip().lower()
if ":" in query_lower and not query_lower.startswith(":"):
namespace, pattern = query_lower.split(":", 1)
namespace = namespace.strip().lower()
@@ -630,6 +711,12 @@ class HydrusNetwork(Store):
fetch_limit=int(limit) if limit else 100
)
else:
def _clean_url_search_token(value: str | None) -> str:
token = str(value or "").strip().lower()
if not token:
return ""
return token.replace("*", "").replace("?", "")
# Fast-path: exact URL via /add_urls/get_url_files when a full URL is provided.
try:
if pattern.startswith("http://") or pattern.startswith(
@@ -706,10 +793,20 @@ class HydrusNetwork(Store):
# Fallback: substring scan
if metadata_list is None:
search_token = _clean_url_search_token(pattern_hint or pattern)
scan_limit_override: int | None = None
if search_token:
is_domain_only = ("://" not in search_token and "/" not in search_token)
if is_domain_only:
try:
scan_limit_override = max(int(limit or 100) * 20, 2000)
except Exception:
scan_limit_override = 2000
metadata_list = _iter_url_filtered_metadata(
pattern,
search_token,
want_any=False,
fetch_limit=int(limit) if limit else 100
fetch_limit=int(limit) if limit else 100,
scan_limit=scan_limit_override,
)
# Parse the query into tags
@@ -742,26 +839,6 @@ class HydrusNetwork(Store):
# Search files with the tags (unless url: search already produced metadata)
results = []
def _extract_search_ids(payload: Any) -> tuple[list[int], list[str]]:
if not isinstance(payload, dict):
return [], []
raw_ids = payload.get("file_ids", [])
raw_hashes = payload.get("hashes", [])
ids_out: list[int] = []
hashes_out: list[str] = []
if isinstance(raw_ids, list):
for item in raw_ids:
try:
ids_out.append(int(item))
except (TypeError, ValueError):
continue
if isinstance(raw_hashes, list):
hashes_out = [
str(h).strip() for h in raw_hashes
if isinstance(h, str) and str(h).strip()
]
return ids_out, hashes_out
if metadata_list is None:
file_ids: list[int] = []
hashes: list[str] = []