This commit is contained in:
2026-01-16 01:47:00 -08:00
parent 41e95d0360
commit 12436e5a6a
4 changed files with 492 additions and 130 deletions

View File

@@ -217,6 +217,7 @@ class API_folder_store:
self.connection: Optional[sqlite3.Connection] = None self.connection: Optional[sqlite3.Connection] = None
# Use the shared lock # Use the shared lock
self._db_lock = self._shared_db_lock self._db_lock = self._shared_db_lock
mm_debug(f"[folder-db] init: root={self.library_root} db={self.db_path}")
self._init_db() self._init_db()
@contextmanager @contextmanager
@@ -284,6 +285,7 @@ class API_folder_store:
"""Initialize database connection and create tables if needed.""" """Initialize database connection and create tables if needed."""
with self._with_db_lock(): with self._with_db_lock():
try: try:
mm_debug(f"[folder-db] opening sqlite db: {self.db_path}")
# Ensure the library root exists; sqlite cannot create parent dirs. # Ensure the library root exists; sqlite cannot create parent dirs.
try: try:
# User safety: Folder store must be created in a blank folder/no files in it. # User safety: Folder store must be created in a blank folder/no files in it.
@@ -326,6 +328,7 @@ class API_folder_store:
timeout=20.0 timeout=20.0
) )
self.connection.row_factory = sqlite3.Row self.connection.row_factory = sqlite3.Row
mm_debug(f"[folder-db] sqlite connection opened: {self.db_path}")
# Ensure busy_timeout is set immediately for all subsequent ops (including pragmas) # Ensure busy_timeout is set immediately for all subsequent ops (including pragmas)
try: try:
@@ -337,7 +340,14 @@ class API_folder_store:
# 1. WAL mode for better concurrency and fewer locks # 1. WAL mode for better concurrency and fewer locks
self.connection.execute("PRAGMA journal_mode=WAL") self.connection.execute("PRAGMA journal_mode=WAL")
# 2. auto_vacuum=FULL to automatically reclaim space from deleted rows/logs # 2. auto_vacuum=FULL to automatically reclaim space from deleted rows/logs
self.connection.execute("PRAGMA auto_vacuum = FULL") try:
self.connection.execute("PRAGMA auto_vacuum = FULL")
except sqlite3.OperationalError as exc:
if "locked" not in str(exc).lower():
raise
logger.warning(
"Database locked; skipping PRAGMA auto_vacuum setup for this session."
)
# 3. Increase page size for modern file systems # 3. Increase page size for modern file systems
self.connection.execute("PRAGMA page_size = 4096") self.connection.execute("PRAGMA page_size = 4096")
# 4. Memory and Sync optimizations # 4. Memory and Sync optimizations
@@ -2657,12 +2667,30 @@ class DatabaseAPI:
def __init__(self, search_dir: Path): def __init__(self, search_dir: Path):
self.search_dir = expand_path(search_dir).resolve() self.search_dir = expand_path(search_dir).resolve()
self.db = API_folder_store(self.search_dir) self.db = API_folder_store(self.search_dir)
try:
mm_debug(
f"[folder-db] DatabaseAPI init: root={self.search_dir} db={self.db.db_path}"
)
except Exception:
pass
def __enter__(self): def __enter__(self):
try:
mm_debug(
f"[folder-db] DatabaseAPI enter: root={self.search_dir} db={self.db.db_path}"
)
except Exception:
pass
self.db.__enter__() self.db.__enter__()
return self return self
def __exit__(self, *args): def __exit__(self, *args):
try:
mm_debug(
f"[folder-db] DatabaseAPI exit: root={self.search_dir} db={self.db.db_path}"
)
except Exception:
pass
return self.db.__exit__(*args) return self.db.__exit__(*args)
def get_cursor(self): def get_cursor(self):
@@ -2730,6 +2758,9 @@ class DatabaseAPI:
def get_file_hashes_with_any_url(self, limit: Optional[int] = None) -> Set[str]: def get_file_hashes_with_any_url(self, limit: Optional[int] = None) -> Set[str]:
"""Get hashes of files that have any non-empty URL metadata.""" """Get hashes of files that have any non-empty URL metadata."""
mm_debug(
f"[folder-db] get_file_hashes_with_any_url start: limit={limit or 10000}"
)
cursor = self.get_cursor() cursor = self.get_cursor()
cursor.execute( cursor.execute(
""" """
@@ -2744,8 +2775,11 @@ class DatabaseAPI:
(limit or 10000, (limit or 10000,
), ),
) )
return {row[0] rows = cursor.fetchall()
for row in cursor.fetchall()} mm_debug(
f"[folder-db] get_file_hashes_with_any_url done: {len(rows)} row(s)"
)
return {row[0] for row in rows}
def get_file_hashes_by_url_like( def get_file_hashes_by_url_like(
self, self,
@@ -2753,6 +2787,9 @@ class DatabaseAPI:
limit: Optional[int] = None limit: Optional[int] = None
) -> Set[str]: ) -> Set[str]:
"""Get hashes of files whose URL metadata contains a substring (case-insensitive).""" """Get hashes of files whose URL metadata contains a substring (case-insensitive)."""
mm_debug(
f"[folder-db] get_file_hashes_by_url_like start: pattern={like_pattern} limit={limit or 10000}"
)
cursor = self.get_cursor() cursor = self.get_cursor()
cursor.execute( cursor.execute(
""" """
@@ -2766,8 +2803,11 @@ class DatabaseAPI:
(like_pattern.lower(), (like_pattern.lower(),
limit or 10000), limit or 10000),
) )
return {row[0] rows = cursor.fetchall()
for row in cursor.fetchall()} mm_debug(
f"[folder-db] get_file_hashes_by_url_like done: {len(rows)} row(s)"
)
return {row[0] for row in rows}
def get_file_hashes_by_ext(self, def get_file_hashes_by_ext(self,
ext_value: str, ext_value: str,
@@ -2847,14 +2887,18 @@ class DatabaseAPI:
def get_files_with_any_url(self, limit: Optional[int] = None) -> List[tuple]: def get_files_with_any_url(self, limit: Optional[int] = None) -> List[tuple]:
"""Get files that have any non-empty URL metadata. """Get files that have any non-empty URL metadata.
Returns (hash, file_path, size, ext) tuples. Returns (hash, file_path, size, ext, url) tuples.
""" """
mm_debug(
f"[folder-db] get_files_with_any_url start: limit={limit or 10000}"
)
cursor = self.get_cursor() cursor = self.get_cursor()
cursor.execute( cursor.execute(
""" """
SELECT f.hash, f.file_path, SELECT f.hash, f.file_path,
COALESCE((SELECT size FROM metadata WHERE hash = f.hash), 0) as size, COALESCE((SELECT size FROM metadata WHERE hash = f.hash), 0) as size,
COALESCE((SELECT ext FROM metadata WHERE hash = f.hash), '') as ext COALESCE((SELECT ext FROM metadata WHERE hash = f.hash), '') as ext,
COALESCE(m.url, '') as url
FROM file f FROM file f
JOIN metadata m ON f.hash = m.hash JOIN metadata m ON f.hash = m.hash
WHERE m.url IS NOT NULL WHERE m.url IS NOT NULL
@@ -2866,21 +2910,29 @@ class DatabaseAPI:
(limit or 10000, (limit or 10000,
), ),
) )
return cursor.fetchall() rows = cursor.fetchall()
mm_debug(
f"[folder-db] get_files_with_any_url done: {len(rows)} row(s)"
)
return rows
def get_files_by_url_like(self, def get_files_by_url_like(self,
like_pattern: str, like_pattern: str,
limit: Optional[int] = None) -> List[tuple]: limit: Optional[int] = None) -> List[tuple]:
"""Get files whose URL metadata contains a substring (case-insensitive). """Get files whose URL metadata contains a substring (case-insensitive).
Returns (hash, file_path, size, ext) tuples. Returns (hash, file_path, size, ext, url) tuples.
""" """
mm_debug(
f"[folder-db] get_files_by_url_like start: pattern={like_pattern} limit={limit or 10000}"
)
cursor = self.get_cursor() cursor = self.get_cursor()
cursor.execute( cursor.execute(
""" """
SELECT f.hash, f.file_path, SELECT f.hash, f.file_path,
COALESCE((SELECT size FROM metadata WHERE hash = f.hash), 0) as size, COALESCE((SELECT size FROM metadata WHERE hash = f.hash), 0) as size,
COALESCE((SELECT ext FROM metadata WHERE hash = f.hash), '') as ext COALESCE((SELECT ext FROM metadata WHERE hash = f.hash), '') as ext,
COALESCE(m.url, '') as url
FROM file f FROM file f
JOIN metadata m ON f.hash = m.hash JOIN metadata m ON f.hash = m.hash
WHERE m.url IS NOT NULL WHERE m.url IS NOT NULL
@@ -2891,7 +2943,11 @@ class DatabaseAPI:
(like_pattern.lower(), (like_pattern.lower(),
limit or 10000), limit or 10000),
) )
return cursor.fetchall() rows = cursor.fetchall()
mm_debug(
f"[folder-db] get_files_by_url_like done: {len(rows)} row(s)"
)
return rows
def get_file_metadata(self, def get_file_metadata(self,
file_hashes: Set[str], file_hashes: Set[str],
@@ -2899,6 +2955,9 @@ class DatabaseAPI:
"""Get metadata for files given their hashes. Returns (hash, file_path, size, extension) tuples.""" """Get metadata for files given their hashes. Returns (hash, file_path, size, extension) tuples."""
if not file_hashes: if not file_hashes:
return [] return []
mm_debug(
f"[folder-db] get_file_metadata start: hashes={len(file_hashes)} limit={limit or len(file_hashes)}"
)
cursor = self.get_cursor() cursor = self.get_cursor()
placeholders = ",".join(["?"] * len(file_hashes)) placeholders = ",".join(["?"] * len(file_hashes))
fetch_sql = f""" fetch_sql = f"""
@@ -2911,7 +2970,11 @@ class DatabaseAPI:
LIMIT ? LIMIT ?
""" """
cursor.execute(fetch_sql, (*file_hashes, limit or len(file_hashes))) cursor.execute(fetch_sql, (*file_hashes, limit or len(file_hashes)))
return cursor.fetchall() rows = cursor.fetchall()
mm_debug(
f"[folder-db] get_file_metadata done: {len(rows)} row(s)"
)
return rows
def get_all_files(self, limit: Optional[int] = None) -> List[tuple]: def get_all_files(self, limit: Optional[int] = None) -> List[tuple]:
"""Get all files in database. Returns (hash, file_path, size, ext) tuples.""" """Get all files in database. Returns (hash, file_path, size, ext) tuples."""
@@ -2932,11 +2995,18 @@ class DatabaseAPI:
def get_tags_for_file(self, file_hash: str) -> List[str]: def get_tags_for_file(self, file_hash: str) -> List[str]:
"""Get all tags for a file given its hash.""" """Get all tags for a file given its hash."""
mm_debug(
f"[folder-db] get_tags_for_file start: hash={file_hash}"
)
cursor = self.get_cursor() cursor = self.get_cursor()
cursor.execute("SELECT tag FROM tag WHERE hash = ?", cursor.execute("SELECT tag FROM tag WHERE hash = ?",
(file_hash, (file_hash,
)) ))
return [row[0] for row in cursor.fetchall()] rows = cursor.fetchall()
mm_debug(
f"[folder-db] get_tags_for_file done: {len(rows)} row(s)"
)
return [row[0] for row in rows]
def get_tags_by_namespace_and_file(self, def get_tags_by_namespace_and_file(self,
file_hash: str, file_hash: str,

View File

@@ -4,7 +4,7 @@ import json
import re import re
import shutil import shutil
import sys import sys
from fnmatch import translate from fnmatch import fnmatch, translate
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
@@ -30,6 +30,28 @@ def _resolve_file_hash(db_hash: Optional[str], file_path: Path) -> Optional[str]
return _normalize_hash(file_path.stem) return _normalize_hash(file_path.stem)
def _normalize_url_for_search(url: str) -> str:
value = str(url or "").strip()
value = re.sub(r"^[a-z][a-z0-9+.-]*://", "", value, flags=re.IGNORECASE)
value = re.sub(r"^www\.", "", value, flags=re.IGNORECASE)
return value.lower()
def _match_url_pattern(url: str, pattern: str) -> bool:
normalized_url = _normalize_url_for_search(url)
normalized_pattern = _normalize_url_for_search(pattern)
if not normalized_pattern:
return False
has_wildcards = any(ch in normalized_pattern for ch in ("*", "?"))
if has_wildcards:
return fnmatch(normalized_url, normalized_pattern)
normalized_url_no_slash = normalized_url.rstrip("/")
normalized_pattern_no_slash = normalized_pattern.rstrip("/")
if normalized_pattern_no_slash and normalized_pattern_no_slash == normalized_url_no_slash:
return True
return normalized_pattern in normalized_url
class Folder(Store): class Folder(Store):
"""""" """"""
@@ -690,6 +712,12 @@ class Folder(Store):
match_all = query == "*" or (not query and bool(ext_filter)) match_all = query == "*" or (not query and bool(ext_filter))
results = [] results = []
search_dir = expand_path(self._location) search_dir = expand_path(self._location)
backend_label = str(
getattr(self, "_name", "") or getattr(self, "NAME", "") or "folder"
)
debug(
f"[folder:{backend_label}] search start: query={query} limit={limit} root={search_dir}"
)
def _url_like_pattern(value: str) -> str: def _url_like_pattern(value: str) -> str:
# Interpret user patterns as substring matches (with optional glob wildcards). # Interpret user patterns as substring matches (with optional glob wildcards).
@@ -1002,7 +1030,7 @@ class Folder(Store):
namespace, pattern = query.split(":", 1) namespace, pattern = query.split(":", 1)
namespace = namespace.strip().lower() namespace = namespace.strip().lower()
pattern = pattern.strip().lower() pattern = pattern.strip().lower()
debug(f"Performing namespace search: {namespace}:{pattern}") debug(f"[folder:{backend_label}] namespace search: {namespace}:{pattern}")
if namespace == "hash": if namespace == "hash":
normalized_hash = _normalize_hash(pattern) normalized_hash = _normalize_hash(pattern)
@@ -1041,14 +1069,50 @@ class Folder(Store):
return results return results
if namespace == "url": if namespace == "url":
pattern_hint = kwargs.get("pattern_hint")
def _parse_url_value(raw: Any) -> list[str]:
if raw is None:
return []
if isinstance(raw, list):
return [str(u).strip() for u in raw if str(u).strip()]
if isinstance(raw, str):
text = raw.strip()
if not text:
return []
try:
parsed = json.loads(text)
if isinstance(parsed, list):
return [
str(u).strip()
for u in parsed
if str(u).strip()
]
except Exception:
pass
return [text]
return []
def _matches_pattern(url_list: list[str]) -> bool:
    """Report whether any URL in *url_list* satisfies ``pattern_hint``.

    When no ``pattern_hint`` was supplied, every list passes (including an
    empty one); otherwise at least one URL must match it.
    """
    if pattern_hint:
        return any(
            _match_url_pattern(entry, pattern_hint) for entry in url_list
        )
    return True
if not pattern or pattern == "*": if not pattern or pattern == "*":
debug(f"[folder:{backend_label}] url search: any-url (limit={limit})")
rows = api.get_files_with_any_url(limit) rows = api.get_files_with_any_url(limit)
else: else:
debug(
f"[folder:{backend_label}] url search: like={pattern} (limit={limit})"
)
rows = api.get_files_by_url_like( rows = api.get_files_by_url_like(
_url_like_pattern(pattern), _url_like_pattern(pattern),
limit limit
) )
for file_hash, file_path_str, size_bytes, ext in rows: for file_hash, file_path_str, size_bytes, ext, url_raw in rows:
if not file_path_str: if not file_path_str:
continue continue
file_path = search_dir / str(file_path_str) file_path = search_dir / str(file_path_str)
@@ -1059,6 +1123,9 @@ class Folder(Store):
size_bytes = file_path.stat().st_size size_bytes = file_path.stat().st_size
except OSError: except OSError:
size_bytes = None size_bytes = None
urls = _parse_url_value(url_raw)
if not urls or not _matches_pattern(urls):
continue
tags = api.get_tags_for_file(file_hash) tags = api.get_tags_for_file(file_hash)
entry = _create_entry( entry = _create_entry(
file_path, file_path,
@@ -1066,6 +1133,7 @@ class Folder(Store):
size_bytes, size_bytes,
file_hash file_hash
) )
entry["urls"] = urls
results.append(entry) results.append(entry)
if limit is not None and len(results) >= limit: if limit is not None and len(results) >= limit:
return results return results

View File

@@ -466,7 +466,9 @@ class HydrusNetwork(Store):
def _extract_urls(meta_obj: Any) -> list[str]: def _extract_urls(meta_obj: Any) -> list[str]:
if not isinstance(meta_obj, dict): if not isinstance(meta_obj, dict):
return [] return []
raw = meta_obj.get("url") raw = meta_obj.get("known_urls")
if raw is None:
raw = meta_obj.get("url")
if raw is None: if raw is None:
raw = meta_obj.get("urls") raw = meta_obj.get("urls")
if isinstance(raw, str): if isinstance(raw, str):
@@ -483,100 +485,178 @@ class HydrusNetwork(Store):
return out return out
return [] return []
def _extract_search_ids(payload: Any) -> tuple[list[int], list[str]]:
if not isinstance(payload, dict):
return [], []
raw_ids = payload.get("file_ids", [])
raw_hashes = payload.get("hashes", [])
ids_out: list[int] = []
hashes_out: list[str] = []
if isinstance(raw_ids, list):
for item in raw_ids:
try:
if isinstance(item, (int, float)):
ids_out.append(int(item))
continue
if isinstance(item, str) and item.strip().isdigit():
ids_out.append(int(item.strip()))
except Exception:
continue
if isinstance(raw_hashes, list):
for item in raw_hashes:
try:
candidate = str(item or "").strip().lower()
if candidate:
hashes_out.append(candidate)
except Exception:
continue
return ids_out, hashes_out
def _iter_url_filtered_metadata( def _iter_url_filtered_metadata(
url_value: str | None, url_value: str | None,
want_any: bool, want_any: bool,
fetch_limit: int fetch_limit: int,
) -> list[dict[str, scan_limit: int | None = None
Any]]: ) -> list[dict[str, Any]]:
"""Best-effort URL search by scanning Hydrus metadata with include_file_url=True.""" """Best-effort URL search by scanning Hydrus metadata with include_file_url=True."""
# First try a fast system predicate if Hydrus supports it.
candidate_file_ids: list[int] = [] candidate_file_ids: list[int] = []
try: candidate_hashes: list[str] = []
if want_any: seen_file_ids: set[int] = set()
seen_hashes: set[str] = set()
def _add_candidates(ids: list[int], hashes: list[str]) -> None:
for fid in ids:
if fid in seen_file_ids:
continue
seen_file_ids.add(fid)
candidate_file_ids.append(fid)
for hh in hashes:
if hh in seen_hashes:
continue
seen_hashes.add(hh)
candidate_hashes.append(hh)
predicate_supported = getattr(self, "_has_url_predicate", None)
if predicate_supported is not False:
try:
predicate = "system:has url" predicate = "system:has url"
url_search = client.search_files( url_search = client.search_files(
tags=[predicate], tags=[predicate],
return_hashes=False, return_hashes=True,
return_file_ids=True, return_file_ids=False,
return_file_count=False, return_file_count=False,
) )
ids = url_search.get("file_ids", ids, hashes = _extract_search_ids(url_search)
[]) if isinstance(url_search, _add_candidates(ids, hashes)
dict) else [] self._has_url_predicate = True
if isinstance(ids, list): except Exception as exc:
candidate_file_ids = [ try:
int(x) for x in ids from API.HydrusNetwork import HydrusRequestError
if isinstance(x, (int, float,
str)) and str(x).strip().isdigit()
]
except Exception:
candidate_file_ids = []
if not candidate_file_ids: if isinstance(exc, HydrusRequestError) and getattr(exc, "status", None) == 400:
# Fallback: scan from system:everything and filter by URL substring. self._has_url_predicate = False
except Exception:
pass
if not candidate_file_ids and not candidate_hashes:
everything = client.search_files( everything = client.search_files(
tags=["system:everything"], tags=["system:everything"],
return_hashes=False, return_hashes=True,
return_file_ids=True, return_file_ids=False,
return_file_count=False, return_file_count=False,
) )
ids = everything.get("file_ids", ids, hashes = _extract_search_ids(everything)
[]) if isinstance(everything, _add_candidates(ids, hashes)
dict) else []
if isinstance(ids, list):
candidate_file_ids = [
int(x) for x in ids if isinstance(x, (int, float))
]
if not candidate_file_ids: if not candidate_file_ids and not candidate_hashes:
return [] return []
needle = (url_value or "").strip().lower() needle = (url_value or "").strip().lower()
chunk_size = 200 chunk_size = 200
out: list[dict[str, Any]] = [] out: list[dict[str, Any]] = []
if scan_limit is None:
try:
if not want_any and url_value:
scan_limit = max(200, min(int(fetch_limit), 400))
else:
scan_limit = max(int(fetch_limit) * 5, 1000)
except Exception:
scan_limit = 400 if (not want_any and url_value) else 1000
if scan_limit is not None:
scan_limit = min(int(scan_limit), 10000)
scanned = 0
for start in range(0, len(candidate_file_ids), chunk_size): def _process_source(items: list[Any], kind: str) -> None:
nonlocal scanned
for start in range(0, len(items), chunk_size):
if len(out) >= fetch_limit:
return
if scan_limit is not None and scanned >= scan_limit:
return
chunk = items[start:start + chunk_size]
if scan_limit is not None:
remaining = scan_limit - scanned
if remaining <= 0:
return
if len(chunk) > remaining:
chunk = chunk[:remaining]
scanned += len(chunk)
try:
if kind == "hashes":
payload = client.fetch_file_metadata(
hashes=chunk,
include_file_url=True,
include_service_keys_to_tags=True,
include_duration=True,
include_size=True,
include_mime=True,
)
else:
payload = client.fetch_file_metadata(
file_ids=chunk,
include_file_url=True,
include_service_keys_to_tags=True,
include_duration=True,
include_size=True,
include_mime=True,
)
except Exception:
continue
metas = payload.get("metadata",
[]) if isinstance(payload,
dict) else []
if not isinstance(metas, list):
continue
for meta in metas:
if len(out) >= fetch_limit:
break
if not isinstance(meta, dict):
continue
urls = _extract_urls(meta)
if not urls:
continue
if want_any:
out.append(meta)
continue
if not needle:
continue
if any(needle in u.lower() for u in urls):
out.append(meta)
continue
sources: list[tuple[str, list[Any]]] = []
if candidate_hashes:
sources.append(("hashes", candidate_hashes))
elif candidate_file_ids:
sources.append(("file_ids", candidate_file_ids))
for kind, items in sources:
if len(out) >= fetch_limit: if len(out) >= fetch_limit:
break break
chunk = candidate_file_ids[start:start + chunk_size] _process_source(items, kind)
try:
payload = client.fetch_file_metadata(
file_ids=chunk,
include_file_url=True,
include_service_keys_to_tags=True,
include_duration=True,
include_size=True,
include_mime=True,
)
except Exception:
continue
metas = payload.get("metadata",
[]) if isinstance(payload,
dict) else []
if not isinstance(metas, list):
continue
for meta in metas:
if not isinstance(meta, dict):
continue
urls = _extract_urls(meta)
if not urls:
continue
if want_any:
out.append(meta)
if len(out) >= fetch_limit:
break
continue
if not needle:
continue
if any(needle in u.lower() for u in urls):
out.append(meta)
if len(out) >= fetch_limit:
break
return out return out
@@ -618,6 +698,7 @@ class HydrusNetwork(Store):
# Special case: url:* and url:<value> # Special case: url:* and url:<value>
metadata_list: list[dict[str, Any]] | None = None metadata_list: list[dict[str, Any]] | None = None
pattern_hint = str(kwargs.get("pattern_hint") or "").strip().lower()
if ":" in query_lower and not query_lower.startswith(":"): if ":" in query_lower and not query_lower.startswith(":"):
namespace, pattern = query_lower.split(":", 1) namespace, pattern = query_lower.split(":", 1)
namespace = namespace.strip().lower() namespace = namespace.strip().lower()
@@ -630,6 +711,12 @@ class HydrusNetwork(Store):
fetch_limit=int(limit) if limit else 100 fetch_limit=int(limit) if limit else 100
) )
else: else:
def _clean_url_search_token(value: str | None) -> str:
token = str(value or "").strip().lower()
if not token:
return ""
return token.replace("*", "").replace("?", "")
# Fast-path: exact URL via /add_urls/get_url_files when a full URL is provided. # Fast-path: exact URL via /add_urls/get_url_files when a full URL is provided.
try: try:
if pattern.startswith("http://") or pattern.startswith( if pattern.startswith("http://") or pattern.startswith(
@@ -706,10 +793,20 @@ class HydrusNetwork(Store):
# Fallback: substring scan # Fallback: substring scan
if metadata_list is None: if metadata_list is None:
search_token = _clean_url_search_token(pattern_hint or pattern)
scan_limit_override: int | None = None
if search_token:
is_domain_only = ("://" not in search_token and "/" not in search_token)
if is_domain_only:
try:
scan_limit_override = max(int(limit or 100) * 20, 2000)
except Exception:
scan_limit_override = 2000
metadata_list = _iter_url_filtered_metadata( metadata_list = _iter_url_filtered_metadata(
pattern, search_token,
want_any=False, want_any=False,
fetch_limit=int(limit) if limit else 100 fetch_limit=int(limit) if limit else 100,
scan_limit=scan_limit_override,
) )
# Parse the query into tags # Parse the query into tags
@@ -742,26 +839,6 @@ class HydrusNetwork(Store):
# Search files with the tags (unless url: search already produced metadata) # Search files with the tags (unless url: search already produced metadata)
results = [] results = []
def _extract_search_ids(payload: Any) -> tuple[list[int], list[str]]:
if not isinstance(payload, dict):
return [], []
raw_ids = payload.get("file_ids", [])
raw_hashes = payload.get("hashes", [])
ids_out: list[int] = []
hashes_out: list[str] = []
if isinstance(raw_ids, list):
for item in raw_ids:
try:
ids_out.append(int(item))
except (TypeError, ValueError):
continue
if isinstance(raw_hashes, list):
hashes_out = [
str(h).strip() for h in raw_hashes
if isinstance(h, str) and str(h).strip()
]
return ids_out, hashes_out
if metadata_list is None: if metadata_list is None:
file_ids: list[int] = [] file_ids: list[int] = []
hashes: list[str] = [] hashes: list[str] = []

View File

@@ -1,5 +1,7 @@
from __future__ import annotations from __future__ import annotations
from queue import SimpleQueue
from threading import Thread
from dataclasses import dataclass from dataclasses import dataclass
from typing import Any, Dict, List, Sequence, Optional, Set, Tuple from typing import Any, Dict, List, Sequence, Optional, Set, Tuple
import sys import sys
@@ -34,6 +36,8 @@ class UrlItem:
class Get_Url(Cmdlet): class Get_Url(Cmdlet):
"""Get url associated with files via hash+store, or search urls by pattern.""" """Get url associated with files via hash+store, or search urls by pattern."""
STORE_SEARCH_TIMEOUT_SECONDS = 6.0
def __init__(self) -> None: def __init__(self) -> None:
super().__init__( super().__init__(
name="get-url", name="get-url",
@@ -81,8 +85,56 @@ class Get_Url(Cmdlet):
normalized_url = Get_Url._normalize_url_for_search(url) normalized_url = Get_Url._normalize_url_for_search(url)
normalized_pattern = Get_Url._normalize_url_for_search(pattern) normalized_pattern = Get_Url._normalize_url_for_search(pattern)
# Use fnmatch for wildcard matching (* and ?) has_wildcards = any(ch in normalized_pattern for ch in ("*", "?"))
return fnmatch(normalized_url, normalized_pattern) if has_wildcards:
return fnmatch(normalized_url, normalized_pattern)
normalized_url_no_slash = normalized_url.rstrip("/")
normalized_pattern_no_slash = normalized_pattern.rstrip("/")
if normalized_pattern_no_slash and normalized_pattern_no_slash == normalized_url_no_slash:
return True
return normalized_pattern in normalized_url
def _execute_search_with_timeout(
    self,
    backend: Any,
    query: str,
    limit: int,
    store_name: str,
    **kwargs: Any,
) -> Optional[List[Any]]:
    """Run ``backend.search`` on a daemon thread and wait a bounded time.

    Args:
        backend: Object exposing ``search(query, limit=..., **kwargs)``.
        query: Query string handed to the backend.
        limit: Result cap handed to the backend.
        store_name: Name used only in the debug messages.
        **kwargs: Extra keyword arguments forwarded to ``backend.search``.

    Returns:
        The backend's result (``[]`` when it is falsy); ``[]`` when the
        search raised or the worker ended without reporting anything; and
        ``None`` when the worker was still running after
        ``STORE_SEARCH_TIMEOUT_SECONDS``.
    """
    outcome: SimpleQueue[tuple[str, Any]] = SimpleQueue()

    def _run_search() -> None:
        # Hand either the hits or the raised exception back to the caller.
        try:
            hits = backend.search(query, limit=limit, **kwargs)
            outcome.put(("ok", hits))
        except Exception as exc:
            outcome.put(("err", exc))

    searcher = Thread(target=_run_search, daemon=True)
    searcher.start()
    searcher.join(timeout=self.STORE_SEARCH_TIMEOUT_SECONDS)

    if searcher.is_alive():
        # The daemon thread is left running; its eventual result is ignored.
        debug(
            f"Store '{store_name}' search timed out after {self.STORE_SEARCH_TIMEOUT_SECONDS}s",
            file=sys.stderr,
        )
        return None

    if outcome.empty():
        return []

    status, payload = outcome.get()
    if status != "err":
        return payload or []

    debug(
        f"Store '{store_name}' search failed: {payload}",
        file=sys.stderr,
    )
    return []
@staticmethod @staticmethod
def _extract_first_url(value: Any) -> Optional[str]: def _extract_first_url(value: Any) -> Optional[str]:
@@ -95,6 +147,35 @@ class Get_Url(Cmdlet):
return item.strip() return item.strip()
return None return None
@staticmethod
def _extract_urls_from_hit(hit: Any) -> List[str]:
    """Collect the URLs a search hit already carries, if any.

    The fields ``known_urls``, ``urls``, ``url``, ``source_url`` and
    ``source_urls`` are read through ``get_field`` in that order and the
    first truthy value is used.  A string value becomes a one-element list
    (empty when blank); a list or tuple contributes its non-blank string
    items, stripped.  Any other value, or an error while reading the
    fields, yields an empty list.
    """
    try:
        raw = (
            get_field(hit, "known_urls")
            or get_field(hit, "urls")
            or get_field(hit, "url")
            or get_field(hit, "source_url")
            or get_field(hit, "source_urls")
        )
    except Exception:
        raw = None

    if isinstance(raw, str):
        stripped = raw.strip()
        return [stripped] if stripped else []

    if not isinstance(raw, (list, tuple)):
        return []

    # Non-string items are ignored rather than stringified.
    candidates = (item.strip() for item in raw if isinstance(item, str))
    return [candidate for candidate in candidates if candidate]
@staticmethod @staticmethod
def _extract_title_from_result(result: Any) -> Optional[str]: def _extract_title_from_result(result: Any) -> Optional[str]:
# Prefer explicit title field. # Prefer explicit title field.
@@ -219,6 +300,7 @@ class Get_Url(Cmdlet):
""" """
items: List[UrlItem] = [] items: List[UrlItem] = []
found_stores: Set[str] = set() found_stores: Set[str] = set()
MAX_RESULTS = 256
try: try:
storage = Store(config) storage = Store(config)
@@ -230,6 +312,8 @@ class Get_Url(Cmdlet):
return items, list(found_stores) return items, list(found_stores)
for store_name in store_names: for store_name in store_names:
if len(items) >= MAX_RESULTS:
break
try: try:
backend = storage[store_name] backend = storage[store_name]
@@ -243,9 +327,12 @@ class Get_Url(Cmdlet):
has_wildcards = any(ch in raw_pattern for ch in ("*", "?")) has_wildcards = any(ch in raw_pattern for ch in ("*", "?"))
# If this is a Hydrus backend and the pattern is a single URL, # If this is a Hydrus backend and the pattern is a single URL,
# normalize it through the official API. # normalize it through the official API. Skip for bare domains.
normalized_url = None normalized_url = None
if not has_wildcards and hasattr(backend, "get_url_info"): looks_like_url = (
"://" in raw_pattern or raw_pattern.startswith("magnet:")
)
if not has_wildcards and looks_like_url and hasattr(backend, "get_url_info"):
try: try:
info = backend.get_url_info(raw_pattern) # type: ignore[attr-defined] info = backend.get_url_info(raw_pattern) # type: ignore[attr-defined]
if isinstance(info, dict): if isinstance(info, dict):
@@ -255,13 +342,39 @@ class Get_Url(Cmdlet):
except Exception: except Exception:
normalized_url = None normalized_url = None
search_query = "url:*" if has_wildcards else f"url:{normalized_url or raw_pattern}" target_pattern = normalized_url or raw_pattern
try: if has_wildcards or not target_pattern:
search_results = backend.search(search_query, limit=1000) search_query = "url:*"
except Exception: else:
search_results = [] wrapped_pattern = f"*{target_pattern}*"
search_query = f"url:{wrapped_pattern}"
search_limit = max(1, min(MAX_RESULTS, 1000))
search_results = self._execute_search_with_timeout(
backend,
search_query,
search_limit,
store_name,
pattern_hint=target_pattern,
)
if search_results is None:
continue
search_results = search_results or []
if not search_results and target_pattern and not has_wildcards:
fallback_results = self._execute_search_with_timeout(
backend,
"url:*",
search_limit,
store_name,
pattern_hint=target_pattern,
)
if fallback_results is None:
continue
search_results = fallback_results or []
for hit in (search_results or []): for hit in (search_results or []):
if len(items) >= MAX_RESULTS:
break
file_hash = None file_hash = None
if isinstance(hit, dict): if isinstance(hit, dict):
file_hash = hit.get("hash") or hit.get("file_hash") file_hash = hit.get("hash") or hit.get("file_hash")
@@ -271,25 +384,57 @@ class Get_Url(Cmdlet):
file_hash = str(file_hash) file_hash = str(file_hash)
title = title_cache.get(file_hash, "") title = title_cache.get(file_hash, "")
if not title:
try:
title = (
get_field(hit, "title")
or get_field(hit, "name")
or get_field(hit, "file_title")
or ""
)
except Exception:
title = ""
if not title: if not title:
title = self._resolve_title_for_hash(backend, file_hash, hit) title = self._resolve_title_for_hash(backend, file_hash, hit)
title_cache[file_hash] = title title_cache[file_hash] = title
size, ext = meta_cache.get(file_hash, (None, "")) size, ext = meta_cache.get(file_hash, (None, ""))
if size is None and not ext: if size is None and not ext:
size, ext = self._resolve_size_ext_for_hash(backend, file_hash, hit) try:
meta_cache[file_hash] = (size, ext) size = get_field(hit, "size")
if size is None:
size = get_field(hit, "size_bytes")
if size is None:
size = get_field(hit, "file_size")
if size is None:
size = get_field(hit, "filesize")
size = int(size) if isinstance(size, (int, float)) else None
except Exception:
size = None
try: try:
urls = backend.get_url(file_hash) ext = get_field(hit, "ext") or get_field(hit, "extension")
except Exception: ext = str(ext).strip().lstrip(".") if isinstance(ext, str) else ""
urls = [] except Exception:
ext = ""
if size is None and not ext:
size, ext = self._resolve_size_ext_for_hash(backend, file_hash, hit)
meta_cache[file_hash] = (size, ext)
urls = self._extract_urls_from_hit(hit)
if not urls:
try:
urls = backend.get_url(file_hash)
except Exception:
urls = []
for url in (urls or []): for url in (urls or []):
if len(items) >= MAX_RESULTS:
break
if not self._match_url_pattern(str(url), raw_pattern): if not self._match_url_pattern(str(url), raw_pattern):
continue continue
# Double-check it looks like a URL to avoid data leakage from dirty DBs
from SYS.metadata import normalize_urls from SYS.metadata import normalize_urls
valid = normalize_urls([str(url)]) valid = normalize_urls([str(url)])
if not valid: if not valid:
@@ -306,6 +451,8 @@ class Get_Url(Cmdlet):
) )
) )
found_stores.add(str(store_name)) found_stores.add(str(store_name))
if len(items) >= MAX_RESULTS:
break
except Exception as exc: except Exception as exc:
debug( debug(
f"Error searching store '{store_name}': {exc}", f"Error searching store '{store_name}': {exc}",