From 3f874af54a5212a3dec1e594203d6729428f92e5 Mon Sep 17 00:00:00 2001 From: Nose Date: Sat, 17 Jan 2026 21:32:44 -0800 Subject: [PATCH] f --- API/folder.py | 43 +++++- Store/Folder.py | 39 +++++- Store/HydrusNetwork.py | 61 +++++++-- cmdlet/_shared.py | 298 ++++++++++++++++++++++++++++------------- 4 files changed, 329 insertions(+), 112 deletions(-) diff --git a/API/folder.py b/API/folder.py index 3d6d31b..a9756e9 100644 --- a/API/folder.py +++ b/API/folder.py @@ -21,7 +21,7 @@ from contextlib import contextmanager from datetime import datetime from pathlib import Path, PurePosixPath from threading import RLock -from typing import Optional, Dict, Any, List, Tuple, Set +from typing import Optional, Dict, Any, List, Tuple, Set, Sequence from SYS.utils import sha256_file, expand_path from SYS.logger import debug as _debug @@ -3001,6 +3001,47 @@ class DatabaseAPI: ) return rows + def get_files_by_url_like_any( + self, + like_patterns: Sequence[str], + limit: Optional[int] = None, + ) -> List[tuple]: + """Get files whose URL metadata matches any of the provided LIKE patterns. + + Returns (hash, file_path, size, ext, url) tuples. + """ + patterns = [str(p or "").strip() for p in (like_patterns or [])] + patterns = [p for p in patterns if p] + if not patterns: + return [] + + mm_debug( + f"[folder-db] get_files_by_url_like_any start: patterns={len(patterns)} limit={limit or 10000}" + ) + cursor = self.get_cursor() + where_or = " OR ".join(["LOWER(m.url) LIKE ?"] * len(patterns)) + query = f""" + SELECT f.hash, f.file_path, + COALESCE((SELECT size FROM metadata WHERE hash = f.hash), 0) as size, + COALESCE((SELECT ext FROM metadata WHERE hash = f.hash), '') as ext, + COALESCE(m.url, '') as url + FROM file f + JOIN metadata m ON f.hash = m.hash + WHERE m.url IS NOT NULL + AND ({where_or}) + ORDER BY f.file_path + LIMIT ? + """ + cursor.execute( + query, + (*[p.lower() for p in patterns], limit or 10000), + ) + rows = cursor.fetchall() + mm_debug( + f"[folder-db] get_files_by_url_like_any done: {len(rows)} row(s)" + ) + return rows + def get_file_metadata(self, file_hashes: Set[str], limit: Optional[int] = None) -> List[tuple]: diff --git a/Store/Folder.py b/Store/Folder.py index d77e8bb..7290f86 100644 --- a/Store/Folder.py +++ b/Store/Folder.py @@ -1071,6 +1071,25 @@ class Folder(Store): if namespace == "url": pattern_hint = kwargs.get("pattern_hint") + def _pattern_candidates(raw: Any) -> List[str]: + if raw is None: + return [] + if isinstance(raw, (list, tuple, set)): + out: List[str] = [] + for item in raw: + text = str(item or "").strip() + if text and text not in out: + out.append(text) + return out + if isinstance(raw, str): + text = raw.strip() + return [text] if text else [] + return [] + + pattern_candidates = _pattern_candidates(pattern_hint) + if len(pattern_candidates) > 200: + pattern_candidates = pattern_candidates[:200] + def _parse_url_value(raw: Any) -> list[str]: if raw is None: return [] @@ -1094,16 +1113,26 @@ class Folder(Store): return [] def _matches_pattern(url_list: list[str]) -> bool: - if not pattern_hint: + if not pattern_candidates: return True for candidate_url in url_list: - if _match_url_pattern(candidate_url, pattern_hint): - return True + for pat in pattern_candidates: + if _match_url_pattern(candidate_url, pat): + return True return False if not pattern or pattern == "*": - debug(f"[folder:{backend_label}] url search: any-url (limit={limit})") - rows = api.get_files_with_any_url(limit) + if pattern_candidates: + debug( + f"[folder:{backend_label}] url search: any-url (limit={limit}) pattern_hint={len(pattern_candidates)}" + ) + rows = api.get_files_by_url_like_any( + [_url_like_pattern(p) for p in pattern_candidates], + limit, + ) + else: + debug(f"[folder:{backend_label}] url search: any-url (limit={limit})") + rows = api.get_files_with_any_url(limit) else: debug( f"[folder:{backend_label}] url search: like={pattern} (limit={limit})" diff --git a/Store/HydrusNetwork.py b/Store/HydrusNetwork.py index 64d7eb0..ac39ddd 100644 --- a/Store/HydrusNetwork.py +++ b/Store/HydrusNetwork.py @@ -5,7 +5,7 @@ import sys import tempfile import shutil from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Sequence, Tuple from urllib.parse import quote @@ -516,7 +516,8 @@ class HydrusNetwork(Store): url_value: str | None, want_any: bool, fetch_limit: int, - scan_limit: int | None = None + scan_limit: int | None = None, + needles: Optional[Sequence[str]] = None, ) -> list[dict[str, Any]]: """Best-effort URL search by scanning Hydrus metadata with include_file_url=True.""" @@ -572,17 +573,29 @@ class HydrusNetwork(Store): if not candidate_file_ids and not candidate_hashes: return [] - needle = (url_value or "").strip().lower() + needle_list: list[str] = [] + if isinstance(needles, (list, tuple, set)): + for item in needles: + text = str(item or "").strip().lower() + if text and text not in needle_list: + needle_list.append(text) + if not needle_list: + needle = (url_value or "").strip().lower() + if needle: + needle_list = [needle] chunk_size = 200 out: list[dict[str, Any]] = [] if scan_limit is None: try: - if not want_any and url_value: - scan_limit = max(200, min(int(fetch_limit), 400)) + if not want_any and needle_list: + if len(needle_list) > 1: + scan_limit = max(int(fetch_limit) * 20, 2000) + else: + scan_limit = max(200, min(int(fetch_limit), 400)) else: scan_limit = max(int(fetch_limit) * 5, 1000) except Exception: - scan_limit = 400 if (not want_any and url_value) else 1000 + scan_limit = 400 if (not want_any and needle_list) else 1000 if scan_limit is not None: scan_limit = min(int(scan_limit), 10000) scanned = 0 @@ -641,9 +654,9 @@ class HydrusNetwork(Store): if want_any: out.append(meta) continue - if not needle: + if not needle_list: continue - if any(needle in u.lower() for u in urls): + if any(any(n in u.lower() for n in needle_list) for u in urls): out.append(meta) continue @@ -698,18 +711,37 @@ class HydrusNetwork(Store): # Special case: url:* and url: metadata_list: list[dict[str, Any]] | None = None - pattern_hint = str(kwargs.get("pattern_hint") or "").strip().lower() + pattern_hint_raw = kwargs.get("pattern_hint") + pattern_hints: list[str] = [] + if isinstance(pattern_hint_raw, (list, tuple, set)): + for item in pattern_hint_raw: + text = str(item or "").strip().lower() + if text and text not in pattern_hints: + pattern_hints.append(text) + elif isinstance(pattern_hint_raw, str): + text = pattern_hint_raw.strip().lower() + if text: + pattern_hints.append(text) + pattern_hint = pattern_hints[0] if pattern_hints else "" if ":" in query_lower and not query_lower.startswith(":"): namespace, pattern = query_lower.split(":", 1) namespace = namespace.strip().lower() pattern = pattern.strip() if namespace == "url": if not pattern or pattern == "*": - metadata_list = _iter_url_filtered_metadata( - None, - want_any=True, - fetch_limit=int(limit) if limit else 100 - ) + if pattern_hints: + metadata_list = _iter_url_filtered_metadata( + None, + want_any=False, + fetch_limit=int(limit) if limit else 100, + needles=pattern_hints, + ) + else: + metadata_list = _iter_url_filtered_metadata( + None, + want_any=True, + fetch_limit=int(limit) if limit else 100 + ) else: def _clean_url_search_token(value: str | None) -> str: token = str(value or "").strip().lower() @@ -807,6 +839,7 @@ class HydrusNetwork(Store): want_any=False, fetch_limit=int(limit) if limit else 100, scan_limit=scan_limit_override, + needles=pattern_hints if pattern_hints else None, ) # Parse the query into tags diff --git a/cmdlet/_shared.py b/cmdlet/_shared.py index 17d31da..55f8bcf 100644 --- a/cmdlet/_shared.py +++ b/cmdlet/_shared.py @@ -3358,6 +3358,113 @@ def check_url_exists_in_storage( _mark_preflight_checked() return True + bulk_mode = len(unique_urls) >= 8 + + def _build_bulk_patterns(needles_map: Dict[str, List[str]], max_per_url: int = 3, max_total: int = 240) -> List[str]: + patterns: List[str] = [] + for _original, needles in needles_map.items(): + for needle in (needles or [])[:max_per_url]: + needle_text = str(needle or "").strip() + if not needle_text: + continue + if needle_text not in patterns: + patterns.append(needle_text) + if len(patterns) >= max_total: + return patterns + return patterns + + bulk_patterns = _build_bulk_patterns(url_needles) + + def _match_normalized_url(pattern_text: str, candidate_url: str) -> bool: + pattern_norm = _normalize_url_for_search(pattern_text) + candidate_norm = _normalize_url_for_search(candidate_url) + if not pattern_norm or not candidate_norm: + return False + if pattern_norm == candidate_norm: + return True + return pattern_norm in candidate_norm + + def _extract_urls_from_hit( + hit: Any, + backend: Any, + *, + allow_backend_lookup: bool = True, + ) -> List[str]: + url_values: List[str] = [] + try: + raw_urls = get_field(hit, "known_urls") or get_field(hit, "urls") or get_field(hit, "url") + if isinstance(raw_urls, str) and raw_urls.strip(): + url_values.append(raw_urls.strip()) + elif isinstance(raw_urls, (list, tuple, set)): + for item in raw_urls: + if isinstance(item, str) and item.strip(): + url_values.append(item.strip()) + except Exception: + url_values = [] + + if url_values or not allow_backend_lookup: + return url_values + + try: + file_hash = get_field(hit, "hash") or get_field(hit, "file_hash") or get_field(hit, "sha256") or "" + except Exception: + file_hash = "" + + if file_hash: + try: + fetched = backend.get_url(str(file_hash)) + if isinstance(fetched, str) and fetched.strip(): + url_values.append(fetched.strip()) + elif isinstance(fetched, (list, tuple, set)): + for item in fetched: + if isinstance(item, str) and item.strip(): + url_values.append(item.strip()) + except Exception: + pass + + return url_values + + def _build_display_row_for_hit( + hit: Any, + backend_name: str, + original_url: str, + ) -> Dict[str, Any]: + try: + from SYS.result_table import build_display_row + extracted = build_display_row(hit, keys=["title", "store", "hash", "ext", "size"]) + except Exception: + extracted = {} + + try: + title = extracted.get("title") or get_field(hit, "title") or get_field(hit, "name") or get_field(hit, "target") or get_field(hit, "path") or "(exists)" + except Exception: + title = "(exists)" + + try: + file_hash = extracted.get("hash") or get_field(hit, "hash") or get_field(hit, "file_hash") or get_field(hit, "sha256") or "" + except Exception: + file_hash = "" + + ext = extracted.get("ext") if isinstance(extracted, dict) else "" + size_val = extracted.get("size") if isinstance(extracted, dict) else None + + return { + "title": str(title), + "store": str(get_field(hit, "store") or backend_name), + "hash": str(file_hash or ""), + "ext": str(ext or ""), + "size": size_val, + "url": original_url, + "columns": [ + ("Title", str(title)), + ("Store", str(get_field(hit, "store") or backend_name)), + ("Hash", str(file_hash or "")), + ("Ext", str(ext or "")), + ("Size", size_val), + ("URL", original_url), + ], + } + def _search_backend_url_hits( backend: Any, backend_name: str, @@ -3379,15 +3486,6 @@ def check_url_exists_in_storage( continue if not backend_hits: - def _match_normalized_url(pattern_text: str, candidate_url: str) -> bool: - pattern_norm = _normalize_url_for_search(pattern_text) - candidate_norm = _normalize_url_for_search(candidate_url) - if not pattern_norm or not candidate_norm: - return False - if pattern_norm == candidate_norm: - return True - return pattern_norm in candidate_norm - fallback_hits: List[Dict[str, Any]] = [] try: fallback_hits = backend.search("url:*", limit=200) or [] @@ -3395,31 +3493,7 @@ def check_url_exists_in_storage( fallback_hits = [] for hit in fallback_hits: - url_values: List[str] = [] - try: - raw_urls = get_field(hit, "known_urls") or get_field(hit, "urls") or get_field(hit, "url") - if isinstance(raw_urls, str) and raw_urls.strip(): - url_values.append(raw_urls.strip()) - elif isinstance(raw_urls, (list, tuple, set)): - for item in raw_urls: - if isinstance(item, str) and item.strip(): - url_values.append(item.strip()) - except Exception: - url_values = [] - - if not url_values: - try: - file_hash = hit.get("hash") if isinstance(hit, dict) else None - if file_hash: - fetched = backend.get_url(str(file_hash)) - if isinstance(fetched, str) and fetched.strip(): - url_values.append(fetched.strip()) - elif isinstance(fetched, (list, tuple, set)): - for item in fetched: - if isinstance(item, str) and item.strip(): - url_values.append(item.strip()) - except Exception: - pass + url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=True) if not url_values: continue @@ -3436,68 +3510,12 @@ def check_url_exists_in_storage( if not matched: continue - title = "(exists)" - try: - title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)" - except Exception: - title = "(exists)" - - file_hash = "" - try: - file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or "" - except Exception: - file_hash = "" - - return { - "title": str(title), - "store": str(hit.get("store") or backend_name), - "hash": str(file_hash or ""), - "ext": "", - "size": None, - "url": original_url, - "columns": [ - ("Title", str(title)), - ("Store", str(hit.get("store") or backend_name)), - ("Hash", str(file_hash or "")), - ("URL", original_url), - ], - } + return _build_display_row_for_hit(hit, backend_name, original_url) return None hit = backend_hits[0] - title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)" - file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or "" - - try: - from SYS.result_table import build_display_row - extracted = build_display_row(hit, keys=["title", "store", "hash", "ext", "size"]) - except Exception: - extracted = {} - - extracted["title"] = str(title) - extracted["store"] = str(hit.get("store") or backend_name) - extracted["hash"] = str(file_hash or "") - - ext = extracted.get("ext") - size_val = extracted.get("size") - - return { - "title": str(title), - "store": str(hit.get("store") or backend_name), - "hash": str(file_hash or ""), - "ext": str(ext or ""), - "size": size_val, - "url": original_url, - "columns": [ - ("Title", str(title)), - ("Store", str(hit.get("store") or backend_name)), - ("Hash", str(file_hash or "")), - ("Ext", str(ext or "")), - ("Size", size_val), - ("URL", original_url), - ], - } + return _build_display_row_for_hit(hit, backend_name, original_url) backend_names: List[str] = [] try: @@ -3558,6 +3576,54 @@ def check_url_exists_in_storage( if not hydrus_available: debug("Bulk URL preflight: hydrus availability check failed; attempting best-effort lookup") + if bulk_mode and bulk_patterns: + bulk_hits: Optional[List[Any]] = None + bulk_limit = min(2000, max(200, len(unique_urls) * 8)) + try: + bulk_hits = backend.search( + "url:*", + limit=bulk_limit, + pattern_hint=bulk_patterns, + ) or [] + except Exception: + try: + bulk_hits = backend.search("url:*", limit=bulk_limit) or [] + except Exception: + bulk_hits = None + + if bulk_hits is not None: + for hit in bulk_hits: + if len(match_rows) >= max_rows: + break + url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False) + if not url_values: + continue + + for original_url, needles in url_needles.items(): + if len(match_rows) >= max_rows: + break + if (original_url, str(backend_name)) in seen_pairs: + continue + + matched = False + for url_value in url_values: + for needle in (needles or []): + if _match_normalized_url(str(needle or ""), str(url_value or "")): + matched = True + break + if matched: + break + + if not matched: + continue + + seen_pairs.add((original_url, str(backend_name))) + matched_urls.add(original_url) + match_rows.append( + _build_display_row_for_hit(hit, str(backend_name), original_url) + ) + continue + for original_url, needles in url_needles.items(): if len(match_rows) >= max_rows: break @@ -3622,6 +3688,54 @@ def check_url_exists_in_storage( match_rows.append(display_row) continue + if bulk_mode and bulk_patterns: + bulk_hits: Optional[List[Any]] = None + bulk_limit = min(2000, max(200, len(unique_urls) * 8)) + try: + bulk_hits = backend.search( + "url:*", + limit=bulk_limit, + pattern_hint=bulk_patterns, + ) or [] + except Exception: + try: + bulk_hits = backend.search("url:*", limit=bulk_limit) or [] + except Exception: + bulk_hits = None + + if bulk_hits is not None: + for hit in bulk_hits: + if len(match_rows) >= max_rows: + break + url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False) + if not url_values: + continue + + for original_url, needles in url_needles.items(): + if len(match_rows) >= max_rows: + break + if (original_url, str(backend_name)) in seen_pairs: + continue + + matched = False + for url_value in url_values: + for needle in (needles or []): + if _match_normalized_url(str(needle or ""), str(url_value or "")): + matched = True + break + if matched: + break + + if not matched: + continue + + seen_pairs.add((original_url, str(backend_name))) + matched_urls.add(original_url) + match_rows.append( + _build_display_row_for_hit(hit, str(backend_name), original_url) + ) + continue + for original_url, needles in url_needles.items(): if len(match_rows) >= max_rows: break