diff --git a/API/folder.py b/API/folder.py index 5604fa9..7ec1f02 100644 --- a/API/folder.py +++ b/API/folder.py @@ -217,6 +217,7 @@ class API_folder_store: self.connection: Optional[sqlite3.Connection] = None # Use the shared lock self._db_lock = self._shared_db_lock + mm_debug(f"[folder-db] init: root={self.library_root} db={self.db_path}") self._init_db() @contextmanager @@ -284,6 +285,7 @@ class API_folder_store: """Initialize database connection and create tables if needed.""" with self._with_db_lock(): try: + mm_debug(f"[folder-db] opening sqlite db: {self.db_path}") # Ensure the library root exists; sqlite cannot create parent dirs. try: # User safety: Folder store must be created in a blank folder/no files in it. @@ -326,6 +328,7 @@ class API_folder_store: timeout=20.0 ) self.connection.row_factory = sqlite3.Row + mm_debug(f"[folder-db] sqlite connection opened: {self.db_path}") # Ensure busy_timeout is set immediately for all subsequent ops (including pragmas) try: @@ -337,7 +340,14 @@ class API_folder_store: # 1. WAL mode for better concurrency and fewer locks self.connection.execute("PRAGMA journal_mode=WAL") # 2. auto_vacuum=FULL to automatically reclaim space from deleted rows/logs - self.connection.execute("PRAGMA auto_vacuum = FULL") + try: + self.connection.execute("PRAGMA auto_vacuum = FULL") + except sqlite3.OperationalError as exc: + if "locked" not in str(exc).lower(): + raise + logger.warning( + "Database locked; skipping PRAGMA auto_vacuum setup for this session." + ) # 3. Increase page size for modern file systems self.connection.execute("PRAGMA page_size = 4096") # 4. Memory and Sync optimizations @@ -2657,12 +2667,30 @@ class DatabaseAPI: def __init__(self, search_dir: Path): self.search_dir = expand_path(search_dir).resolve() self.db = API_folder_store(self.search_dir) + try: + mm_debug( + f"[folder-db] DatabaseAPI init: root={self.search_dir} db={self.db.db_path}" + ) + except Exception: + pass def __enter__(self): + try: + mm_debug( + f"[folder-db] DatabaseAPI enter: root={self.search_dir} db={self.db.db_path}" + ) + except Exception: + pass self.db.__enter__() return self def __exit__(self, *args): + try: + mm_debug( + f"[folder-db] DatabaseAPI exit: root={self.search_dir} db={self.db.db_path}" + ) + except Exception: + pass return self.db.__exit__(*args) def get_cursor(self): @@ -2730,6 +2758,9 @@ class DatabaseAPI: def get_file_hashes_with_any_url(self, limit: Optional[int] = None) -> Set[str]: """Get hashes of files that have any non-empty URL metadata.""" + mm_debug( + f"[folder-db] get_file_hashes_with_any_url start: limit={limit or 10000}" + ) cursor = self.get_cursor() cursor.execute( """ @@ -2744,8 +2775,11 @@ class DatabaseAPI: (limit or 10000, ), ) - return {row[0] - for row in cursor.fetchall()} + rows = cursor.fetchall() + mm_debug( + f"[folder-db] get_file_hashes_with_any_url done: {len(rows)} row(s)" + ) + return {row[0] for row in rows} def get_file_hashes_by_url_like( self, @@ -2753,6 +2787,9 @@ class DatabaseAPI: limit: Optional[int] = None ) -> Set[str]: """Get hashes of files whose URL metadata contains a substring (case-insensitive).""" + mm_debug( + f"[folder-db] get_file_hashes_by_url_like start: pattern={like_pattern} limit={limit or 10000}" + ) cursor = self.get_cursor() cursor.execute( """ @@ -2766,8 +2803,11 @@ class DatabaseAPI: (like_pattern.lower(), limit or 10000), ) - return {row[0] - for row in cursor.fetchall()} + rows = cursor.fetchall() + mm_debug( + f"[folder-db] get_file_hashes_by_url_like done: {len(rows)} row(s)" + ) + return {row[0] for row in rows} def get_file_hashes_by_ext(self, ext_value: str, @@ -2847,14 +2887,18 @@ class DatabaseAPI: def get_files_with_any_url(self, limit: Optional[int] = None) -> List[tuple]: """Get files that have any non-empty URL metadata. - Returns (hash, file_path, size, ext) tuples. + Returns (hash, file_path, size, ext, url) tuples. """ + mm_debug( + f"[folder-db] get_files_with_any_url start: limit={limit or 10000}" + ) cursor = self.get_cursor() cursor.execute( """ - SELECT f.hash, f.file_path, - COALESCE((SELECT size FROM metadata WHERE hash = f.hash), 0) as size, - COALESCE((SELECT ext FROM metadata WHERE hash = f.hash), '') as ext + SELECT f.hash, f.file_path, + COALESCE((SELECT size FROM metadata WHERE hash = f.hash), 0) as size, + COALESCE((SELECT ext FROM metadata WHERE hash = f.hash), '') as ext, + COALESCE(m.url, '') as url FROM file f JOIN metadata m ON f.hash = m.hash WHERE m.url IS NOT NULL @@ -2866,21 +2910,29 @@ class DatabaseAPI: (limit or 10000, ), ) - return cursor.fetchall() + rows = cursor.fetchall() + mm_debug( + f"[folder-db] get_files_with_any_url done: {len(rows)} row(s)" + ) + return rows def get_files_by_url_like(self, like_pattern: str, limit: Optional[int] = None) -> List[tuple]: """Get files whose URL metadata contains a substring (case-insensitive). - Returns (hash, file_path, size, ext) tuples. + Returns (hash, file_path, size, ext, url) tuples. """ + mm_debug( + f"[folder-db] get_files_by_url_like start: pattern={like_pattern} limit={limit or 10000}" + ) cursor = self.get_cursor() cursor.execute( """ - SELECT f.hash, f.file_path, - COALESCE((SELECT size FROM metadata WHERE hash = f.hash), 0) as size, - COALESCE((SELECT ext FROM metadata WHERE hash = f.hash), '') as ext + SELECT f.hash, f.file_path, + COALESCE((SELECT size FROM metadata WHERE hash = f.hash), 0) as size, + COALESCE((SELECT ext FROM metadata WHERE hash = f.hash), '') as ext, + COALESCE(m.url, '') as url FROM file f JOIN metadata m ON f.hash = m.hash WHERE m.url IS NOT NULL @@ -2891,7 +2943,11 @@ class DatabaseAPI: (like_pattern.lower(), limit or 10000), ) - return cursor.fetchall() + rows = cursor.fetchall() + mm_debug( + f"[folder-db] get_files_by_url_like done: {len(rows)} row(s)" + ) + return rows def get_file_metadata(self, file_hashes: Set[str], @@ -2899,6 +2955,9 @@ class DatabaseAPI: """Get metadata for files given their hashes. Returns (hash, file_path, size, extension) tuples.""" if not file_hashes: return [] + mm_debug( + f"[folder-db] get_file_metadata start: hashes={len(file_hashes)} limit={limit or len(file_hashes)}" + ) cursor = self.get_cursor() placeholders = ",".join(["?"] * len(file_hashes)) fetch_sql = f""" @@ -2911,7 +2970,11 @@ class DatabaseAPI: LIMIT ? """ cursor.execute(fetch_sql, (*file_hashes, limit or len(file_hashes))) - return cursor.fetchall() + rows = cursor.fetchall() + mm_debug( + f"[folder-db] get_file_metadata done: {len(rows)} row(s)" + ) + return rows def get_all_files(self, limit: Optional[int] = None) -> List[tuple]: """Get all files in database. Returns (hash, file_path, size, ext) tuples.""" @@ -2932,11 +2995,18 @@ class DatabaseAPI: def get_tags_for_file(self, file_hash: str) -> List[str]: """Get all tags for a file given its hash.""" + mm_debug( + f"[folder-db] get_tags_for_file start: hash={file_hash}" + ) cursor = self.get_cursor() cursor.execute("SELECT tag FROM tag WHERE hash = ?", (file_hash, )) - return [row[0] for row in cursor.fetchall()] + rows = cursor.fetchall() + mm_debug( + f"[folder-db] get_tags_for_file done: {len(rows)} row(s)" + ) + return [row[0] for row in rows] def get_tags_by_namespace_and_file(self, file_hash: str, diff --git a/Store/Folder.py b/Store/Folder.py index 3878338..91914d3 100644 --- a/Store/Folder.py +++ b/Store/Folder.py @@ -4,7 +4,7 @@ import json import re import shutil import sys -from fnmatch import translate +from fnmatch import fnmatch, translate from pathlib import Path from typing import Any, Dict, List, Optional, Tuple @@ -30,6 +30,28 @@ def _resolve_file_hash(db_hash: Optional[str], file_path: Path) -> Optional[str] return _normalize_hash(file_path.stem) +def _normalize_url_for_search(url: str) -> str: + value = str(url or "").strip() + value = re.sub(r"^[a-z][a-z0-9+.-]*://", "", value, flags=re.IGNORECASE) + value = re.sub(r"^www\.", "", value, flags=re.IGNORECASE) + return value.lower() + + +def _match_url_pattern(url: str, pattern: str) -> bool: + normalized_url = _normalize_url_for_search(url) + normalized_pattern = _normalize_url_for_search(pattern) + if not normalized_pattern: + return False + has_wildcards = any(ch in normalized_pattern for ch in ("*", "?")) + if has_wildcards: + return fnmatch(normalized_url, normalized_pattern) + normalized_url_no_slash = normalized_url.rstrip("/") + normalized_pattern_no_slash = normalized_pattern.rstrip("/") + if normalized_pattern_no_slash and normalized_pattern_no_slash == normalized_url_no_slash: + return True + return normalized_pattern in normalized_url + + class Folder(Store): """""" @@ -690,6 +712,12 @@ class Folder(Store): match_all = query == "*" or (not query and bool(ext_filter)) results = [] search_dir = expand_path(self._location) + backend_label = str( + getattr(self, "_name", "") or getattr(self, "NAME", "") or "folder" + ) + debug( + f"[folder:{backend_label}] search start: query={query} limit={limit} root={search_dir}" + ) def _url_like_pattern(value: str) -> str: # Interpret user patterns as substring matches (with optional glob wildcards). @@ -1002,7 +1030,7 @@ class Folder(Store): namespace, pattern = query.split(":", 1) namespace = namespace.strip().lower() pattern = pattern.strip().lower() - debug(f"Performing namespace search: {namespace}:{pattern}") + debug(f"[folder:{backend_label}] namespace search: {namespace}:{pattern}") if namespace == "hash": normalized_hash = _normalize_hash(pattern) @@ -1041,14 +1069,50 @@ class Folder(Store): return results if namespace == "url": + pattern_hint = kwargs.get("pattern_hint") + + def _parse_url_value(raw: Any) -> list[str]: + if raw is None: + return [] + if isinstance(raw, list): + return [str(u).strip() for u in raw if str(u).strip()] + if isinstance(raw, str): + text = raw.strip() + if not text: + return [] + try: + parsed = json.loads(text) + if isinstance(parsed, list): + return [ + str(u).strip() + for u in parsed + if str(u).strip() + ] + except Exception: + pass + return [text] + return [] + + def _matches_pattern(url_list: list[str]) -> bool: + if not pattern_hint: + return True + for candidate_url in url_list: + if _match_url_pattern(candidate_url, pattern_hint): + return True + return False + if not pattern or pattern == "*": + debug(f"[folder:{backend_label}] url search: any-url (limit={limit})") rows = api.get_files_with_any_url(limit) else: + debug( + f"[folder:{backend_label}] url search: like={pattern} (limit={limit})" + ) rows = api.get_files_by_url_like( _url_like_pattern(pattern), limit ) - for file_hash, file_path_str, size_bytes, ext in rows: + for file_hash, file_path_str, size_bytes, ext, url_raw in rows: if not file_path_str: continue file_path = search_dir / str(file_path_str) @@ -1059,6 +1123,9 @@ class Folder(Store): size_bytes = file_path.stat().st_size except OSError: size_bytes = None + urls = _parse_url_value(url_raw) + if not urls or not _matches_pattern(urls): + continue tags = api.get_tags_for_file(file_hash) entry = _create_entry( file_path, @@ -1066,6 +1133,7 @@ class Folder(Store): size_bytes, file_hash ) + entry["urls"] = urls results.append(entry) if limit is not None and len(results) >= limit: return results diff --git a/Store/HydrusNetwork.py b/Store/HydrusNetwork.py index eb205c3..076f76c 100644 --- a/Store/HydrusNetwork.py +++ b/Store/HydrusNetwork.py @@ -466,7 +466,9 @@ class HydrusNetwork(Store): def _extract_urls(meta_obj: Any) -> list[str]: if not isinstance(meta_obj, dict): return [] - raw = meta_obj.get("url") + raw = meta_obj.get("known_urls") + if raw is None: + raw = meta_obj.get("url") if raw is None: raw = meta_obj.get("urls") if isinstance(raw, str): @@ -483,100 +485,178 @@ class HydrusNetwork(Store): return out return [] + def _extract_search_ids(payload: Any) -> tuple[list[int], list[str]]: + if not isinstance(payload, dict): + return [], [] + raw_ids = payload.get("file_ids", []) + raw_hashes = payload.get("hashes", []) + ids_out: list[int] = [] + hashes_out: list[str] = [] + if isinstance(raw_ids, list): + for item in raw_ids: + try: + if isinstance(item, (int, float)): + ids_out.append(int(item)) + continue + if isinstance(item, str) and item.strip().isdigit(): + ids_out.append(int(item.strip())) + except Exception: + continue + if isinstance(raw_hashes, list): + for item in raw_hashes: + try: + candidate = str(item or "").strip().lower() + if candidate: + hashes_out.append(candidate) + except Exception: + continue + return ids_out, hashes_out + def _iter_url_filtered_metadata( url_value: str | None, want_any: bool, - fetch_limit: int - ) -> list[dict[str, - Any]]: + fetch_limit: int, + scan_limit: int | None = None + ) -> list[dict[str, Any]]: """Best-effort URL search by scanning Hydrus metadata with include_file_url=True.""" - # First try a fast system predicate if Hydrus supports it. candidate_file_ids: list[int] = [] - try: - if want_any: + candidate_hashes: list[str] = [] + seen_file_ids: set[int] = set() + seen_hashes: set[str] = set() + + def _add_candidates(ids: list[int], hashes: list[str]) -> None: + for fid in ids: + if fid in seen_file_ids: + continue + seen_file_ids.add(fid) + candidate_file_ids.append(fid) + for hh in hashes: + if hh in seen_hashes: + continue + seen_hashes.add(hh) + candidate_hashes.append(hh) + + predicate_supported = getattr(self, "_has_url_predicate", None) + if predicate_supported is not False: + try: predicate = "system:has url" url_search = client.search_files( tags=[predicate], - return_hashes=False, - return_file_ids=True, + return_hashes=True, + return_file_ids=False, return_file_count=False, ) - ids = url_search.get("file_ids", - []) if isinstance(url_search, - dict) else [] - if isinstance(ids, list): - candidate_file_ids = [ - int(x) for x in ids - if isinstance(x, (int, float, - str)) and str(x).strip().isdigit() - ] - except Exception: - candidate_file_ids = [] + ids, hashes = _extract_search_ids(url_search) + _add_candidates(ids, hashes) + self._has_url_predicate = True + except Exception as exc: + try: + from API.HydrusNetwork import HydrusRequestError - if not candidate_file_ids: - # Fallback: scan from system:everything and filter by URL substring. + if isinstance(exc, HydrusRequestError) and getattr(exc, "status", None) == 400: + self._has_url_predicate = False + except Exception: + pass + + if not candidate_file_ids and not candidate_hashes: everything = client.search_files( tags=["system:everything"], - return_hashes=False, - return_file_ids=True, + return_hashes=True, + return_file_ids=False, return_file_count=False, ) - ids = everything.get("file_ids", - []) if isinstance(everything, - dict) else [] - if isinstance(ids, list): - candidate_file_ids = [ - int(x) for x in ids if isinstance(x, (int, float)) - ] + ids, hashes = _extract_search_ids(everything) + _add_candidates(ids, hashes) - if not candidate_file_ids: + if not candidate_file_ids and not candidate_hashes: return [] needle = (url_value or "").strip().lower() chunk_size = 200 out: list[dict[str, Any]] = [] + if scan_limit is None: + try: + if not want_any and url_value: + scan_limit = max(200, min(int(fetch_limit), 400)) + else: + scan_limit = max(int(fetch_limit) * 5, 1000) + except Exception: + scan_limit = 400 if (not want_any and url_value) else 1000 + if scan_limit is not None: + scan_limit = min(int(scan_limit), 10000) + scanned = 0 - for start in range(0, len(candidate_file_ids), chunk_size): + def _process_source(items: list[Any], kind: str) -> None: + nonlocal scanned + for start in range(0, len(items), chunk_size): + if len(out) >= fetch_limit: + return + if scan_limit is not None and scanned >= scan_limit: + return + chunk = items[start:start + chunk_size] + if scan_limit is not None: + remaining = scan_limit - scanned + if remaining <= 0: + return + if len(chunk) > remaining: + chunk = chunk[:remaining] + scanned += len(chunk) + try: + if kind == "hashes": + payload = client.fetch_file_metadata( + hashes=chunk, + include_file_url=True, + include_service_keys_to_tags=True, + include_duration=True, + include_size=True, + include_mime=True, + ) + else: + payload = client.fetch_file_metadata( + file_ids=chunk, + include_file_url=True, + include_service_keys_to_tags=True, + include_duration=True, + include_size=True, + include_mime=True, + ) + except Exception: + continue + + metas = payload.get("metadata", + []) if isinstance(payload, + dict) else [] + if not isinstance(metas, list): + continue + + for meta in metas: + if len(out) >= fetch_limit: + break + if not isinstance(meta, dict): + continue + urls = _extract_urls(meta) + if not urls: + continue + if want_any: + out.append(meta) + continue + if not needle: + continue + if any(needle in u.lower() for u in urls): + out.append(meta) + continue + + sources: list[tuple[str, list[Any]]] = [] + if candidate_hashes: + sources.append(("hashes", candidate_hashes)) + elif candidate_file_ids: + sources.append(("file_ids", candidate_file_ids)) + + for kind, items in sources: if len(out) >= fetch_limit: break - chunk = candidate_file_ids[start:start + chunk_size] - try: - payload = client.fetch_file_metadata( - file_ids=chunk, - include_file_url=True, - include_service_keys_to_tags=True, - include_duration=True, - include_size=True, - include_mime=True, - ) - except Exception: - continue - - metas = payload.get("metadata", - []) if isinstance(payload, - dict) else [] - if not isinstance(metas, list): - continue - - for meta in metas: - if not isinstance(meta, dict): - continue - urls = _extract_urls(meta) - if not urls: - continue - if want_any: - out.append(meta) - if len(out) >= fetch_limit: - break - continue - - if not needle: - continue - if any(needle in u.lower() for u in urls): - out.append(meta) - if len(out) >= fetch_limit: - break + _process_source(items, kind) return out @@ -618,6 +698,7 @@ class HydrusNetwork(Store): # Special case: url:* and url: metadata_list: list[dict[str, Any]] | None = None + pattern_hint = str(kwargs.get("pattern_hint") or "").strip().lower() if ":" in query_lower and not query_lower.startswith(":"): namespace, pattern = query_lower.split(":", 1) namespace = namespace.strip().lower() @@ -630,6 +711,12 @@ class HydrusNetwork(Store): fetch_limit=int(limit) if limit else 100 ) else: + def _clean_url_search_token(value: str | None) -> str: + token = str(value or "").strip().lower() + if not token: + return "" + return token.replace("*", "").replace("?", "") + # Fast-path: exact URL via /add_urls/get_url_files when a full URL is provided. try: if pattern.startswith("http://") or pattern.startswith( @@ -706,10 +793,20 @@ class HydrusNetwork(Store): # Fallback: substring scan if metadata_list is None: + search_token = _clean_url_search_token(pattern_hint or pattern) + scan_limit_override: int | None = None + if search_token: + is_domain_only = ("://" not in search_token and "/" not in search_token) + if is_domain_only: + try: + scan_limit_override = max(int(limit or 100) * 20, 2000) + except Exception: + scan_limit_override = 2000 metadata_list = _iter_url_filtered_metadata( - pattern, + search_token, want_any=False, - fetch_limit=int(limit) if limit else 100 + fetch_limit=int(limit) if limit else 100, + scan_limit=scan_limit_override, ) # Parse the query into tags @@ -742,26 +839,6 @@ class HydrusNetwork(Store): # Search files with the tags (unless url: search already produced metadata) results = [] - def _extract_search_ids(payload: Any) -> tuple[list[int], list[str]]: - if not isinstance(payload, dict): - return [], [] - raw_ids = payload.get("file_ids", []) - raw_hashes = payload.get("hashes", []) - ids_out: list[int] = [] - hashes_out: list[str] = [] - if isinstance(raw_ids, list): - for item in raw_ids: - try: - ids_out.append(int(item)) - except (TypeError, ValueError): - continue - if isinstance(raw_hashes, list): - hashes_out = [ - str(h).strip() for h in raw_hashes - if isinstance(h, str) and str(h).strip() - ] - return ids_out, hashes_out - if metadata_list is None: file_ids: list[int] = [] hashes: list[str] = [] diff --git a/cmdlet/get_url.py b/cmdlet/get_url.py index cb6f593..009fdb3 100644 --- a/cmdlet/get_url.py +++ b/cmdlet/get_url.py @@ -1,5 +1,7 @@ from __future__ import annotations +from queue import SimpleQueue +from threading import Thread from dataclasses import dataclass from typing import Any, Dict, List, Sequence, Optional, Set, Tuple import sys @@ -34,6 +36,8 @@ class UrlItem: class Get_Url(Cmdlet): """Get url associated with files via hash+store, or search urls by pattern.""" + STORE_SEARCH_TIMEOUT_SECONDS = 6.0 + def __init__(self) -> None: super().__init__( name="get-url", @@ -81,8 +85,56 @@ class Get_Url(Cmdlet): normalized_url = Get_Url._normalize_url_for_search(url) normalized_pattern = Get_Url._normalize_url_for_search(pattern) - # Use fnmatch for wildcard matching (* and ?) - return fnmatch(normalized_url, normalized_pattern) + has_wildcards = any(ch in normalized_pattern for ch in ("*", "?")) + if has_wildcards: + return fnmatch(normalized_url, normalized_pattern) + + normalized_url_no_slash = normalized_url.rstrip("/") + normalized_pattern_no_slash = normalized_pattern.rstrip("/") + if normalized_pattern_no_slash and normalized_pattern_no_slash == normalized_url_no_slash: + return True + + return normalized_pattern in normalized_url + + def _execute_search_with_timeout( + self, + backend: Any, + query: str, + limit: int, + store_name: str, + **kwargs: Any, + ) -> Optional[List[Any]]: + queue: SimpleQueue[tuple[str, Any]] = SimpleQueue() + + def _worker() -> None: + try: + queue.put(("ok", backend.search(query, limit=limit, **kwargs))) + except Exception as exc: + queue.put(("err", exc)) + + worker = Thread(target=_worker, daemon=True) + worker.start() + worker.join(timeout=self.STORE_SEARCH_TIMEOUT_SECONDS) + + if worker.is_alive(): + debug( + f"Store '{store_name}' search timed out after {self.STORE_SEARCH_TIMEOUT_SECONDS}s", + file=sys.stderr, + ) + return None + + if queue.empty(): + return [] + + status, payload = queue.get() + if status == "err": + debug( + f"Store '{store_name}' search failed: {payload}", + file=sys.stderr, + ) + return [] + + return payload or [] @staticmethod def _extract_first_url(value: Any) -> Optional[str]: @@ -95,6 +147,35 @@ class Get_Url(Cmdlet): return item.strip() return None + @staticmethod + def _extract_urls_from_hit(hit: Any) -> List[str]: + """Extract candidate URLs directly from a search hit, if present.""" + raw = None + try: + raw = get_field(hit, "known_urls") + if not raw: + raw = get_field(hit, "urls") + if not raw: + raw = get_field(hit, "url") + if not raw: + raw = get_field(hit, "source_url") or get_field(hit, "source_urls") + except Exception: + raw = None + + if isinstance(raw, str): + val = raw.strip() + return [val] if val else [] + if isinstance(raw, (list, tuple)): + out: list[str] = [] + for item in raw: + if not isinstance(item, str): + continue + v = item.strip() + if v: + out.append(v) + return out + return [] + @staticmethod def _extract_title_from_result(result: Any) -> Optional[str]: # Prefer explicit title field. @@ -219,6 +300,7 @@ class Get_Url(Cmdlet): """ items: List[UrlItem] = [] found_stores: Set[str] = set() + MAX_RESULTS = 256 try: storage = Store(config) @@ -230,6 +312,8 @@ class Get_Url(Cmdlet): return items, list(found_stores) for store_name in store_names: + if len(items) >= MAX_RESULTS: + break try: backend = storage[store_name] @@ -243,9 +327,12 @@ class Get_Url(Cmdlet): has_wildcards = any(ch in raw_pattern for ch in ("*", "?")) # If this is a Hydrus backend and the pattern is a single URL, - # normalize it through the official API. + # normalize it through the official API. Skip for bare domains. normalized_url = None - if not has_wildcards and hasattr(backend, "get_url_info"): + looks_like_url = ( + "://" in raw_pattern or raw_pattern.startswith("magnet:") + ) + if not has_wildcards and looks_like_url and hasattr(backend, "get_url_info"): try: info = backend.get_url_info(raw_pattern) # type: ignore[attr-defined] if isinstance(info, dict): @@ -255,13 +342,39 @@ class Get_Url(Cmdlet): except Exception: normalized_url = None - search_query = "url:*" if has_wildcards else f"url:{normalized_url or raw_pattern}" - try: - search_results = backend.search(search_query, limit=1000) - except Exception: - search_results = [] + target_pattern = normalized_url or raw_pattern + if has_wildcards or not target_pattern: + search_query = "url:*" + else: + wrapped_pattern = f"*{target_pattern}*" + search_query = f"url:{wrapped_pattern}" + search_limit = max(1, min(MAX_RESULTS, 1000)) + search_results = self._execute_search_with_timeout( + backend, + search_query, + search_limit, + store_name, + pattern_hint=target_pattern, + ) + if search_results is None: + continue + + search_results = search_results or [] + if not search_results and target_pattern and not has_wildcards: + fallback_results = self._execute_search_with_timeout( + backend, + "url:*", + search_limit, + store_name, + pattern_hint=target_pattern, + ) + if fallback_results is None: + continue + search_results = fallback_results or [] for hit in (search_results or []): + if len(items) >= MAX_RESULTS: + break file_hash = None if isinstance(hit, dict): file_hash = hit.get("hash") or hit.get("file_hash") @@ -271,25 +384,57 @@ class Get_Url(Cmdlet): file_hash = str(file_hash) title = title_cache.get(file_hash, "") + if not title: + try: + title = ( + get_field(hit, "title") + or get_field(hit, "name") + or get_field(hit, "file_title") + or "" + ) + except Exception: + title = "" if not title: title = self._resolve_title_for_hash(backend, file_hash, hit) - title_cache[file_hash] = title + title_cache[file_hash] = title size, ext = meta_cache.get(file_hash, (None, "")) if size is None and not ext: - size, ext = self._resolve_size_ext_for_hash(backend, file_hash, hit) - meta_cache[file_hash] = (size, ext) + try: + size = get_field(hit, "size") + if size is None: + size = get_field(hit, "size_bytes") + if size is None: + size = get_field(hit, "file_size") + if size is None: + size = get_field(hit, "filesize") + size = int(size) if isinstance(size, (int, float)) else None + except Exception: + size = None - try: - urls = backend.get_url(file_hash) - except Exception: - urls = [] + try: + ext = get_field(hit, "ext") or get_field(hit, "extension") + ext = str(ext).strip().lstrip(".") if isinstance(ext, str) else "" + except Exception: + ext = "" + + if size is None and not ext: + size, ext = self._resolve_size_ext_for_hash(backend, file_hash, hit) + meta_cache[file_hash] = (size, ext) + + urls = self._extract_urls_from_hit(hit) + if not urls: + try: + urls = backend.get_url(file_hash) + except Exception: + urls = [] for url in (urls or []): + if len(items) >= MAX_RESULTS: + break if not self._match_url_pattern(str(url), raw_pattern): continue - - # Double-check it looks like a URL to avoid data leakage from dirty DBs + from SYS.metadata import normalize_urls valid = normalize_urls([str(url)]) if not valid: @@ -306,6 +451,8 @@ class Get_Url(Cmdlet): ) ) found_stores.add(str(store_name)) + if len(items) >= MAX_RESULTS: + break except Exception as exc: debug( f"Error searching store '{store_name}': {exc}",