This commit is contained in:
2026-01-16 01:47:00 -08:00
parent 41e95d0360
commit 12436e5a6a
4 changed files with 492 additions and 130 deletions

View File

@@ -4,7 +4,7 @@ import json
import re
import shutil
import sys
from fnmatch import translate
from fnmatch import fnmatch, translate
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
@@ -30,6 +30,28 @@ def _resolve_file_hash(db_hash: Optional[str], file_path: Path) -> Optional[str]
return _normalize_hash(file_path.stem)
def _normalize_url_for_search(url: str) -> str:
value = str(url or "").strip()
value = re.sub(r"^[a-z][a-z0-9+.-]*://", "", value, flags=re.IGNORECASE)
value = re.sub(r"^www\.", "", value, flags=re.IGNORECASE)
return value.lower()
def _match_url_pattern(url: str, pattern: str) -> bool:
    """Return True when *url* matches *pattern*.

    Matching is tried in order: glob (if the pattern contains wildcards),
    exact match ignoring a trailing slash, then plain substring containment.
    An empty pattern never matches.
    """
    haystack = _normalize_url_for_search(url)
    needle = _normalize_url_for_search(pattern)
    if not needle:
        return False
    if "*" in needle or "?" in needle:
        return fnmatch(haystack, needle)
    trimmed_needle = needle.rstrip("/")
    if trimmed_needle and trimmed_needle == haystack.rstrip("/"):
        return True
    return needle in haystack
class Folder(Store):
""""""
@@ -690,6 +712,12 @@ class Folder(Store):
match_all = query == "*" or (not query and bool(ext_filter))
results = []
search_dir = expand_path(self._location)
backend_label = str(
getattr(self, "_name", "") or getattr(self, "NAME", "") or "folder"
)
debug(
f"[folder:{backend_label}] search start: query={query} limit={limit} root={search_dir}"
)
def _url_like_pattern(value: str) -> str:
# Interpret user patterns as substring matches (with optional glob wildcards).
@@ -1002,7 +1030,7 @@ class Folder(Store):
namespace, pattern = query.split(":", 1)
namespace = namespace.strip().lower()
pattern = pattern.strip().lower()
debug(f"Performing namespace search: {namespace}:{pattern}")
debug(f"[folder:{backend_label}] namespace search: {namespace}:{pattern}")
if namespace == "hash":
normalized_hash = _normalize_hash(pattern)
@@ -1041,14 +1069,50 @@ class Folder(Store):
return results
if namespace == "url":
pattern_hint = kwargs.get("pattern_hint")
def _parse_url_value(raw: Any) -> list[str]:
if raw is None:
return []
if isinstance(raw, list):
return [str(u).strip() for u in raw if str(u).strip()]
if isinstance(raw, str):
text = raw.strip()
if not text:
return []
try:
parsed = json.loads(text)
if isinstance(parsed, list):
return [
str(u).strip()
for u in parsed
if str(u).strip()
]
except Exception:
pass
return [text]
return []
def _matches_pattern(url_list: list[str]) -> bool:
    """True when no pattern hint is set, or any URL in *url_list* matches it."""
    # An absent hint means every file qualifies for the url: search.
    if not pattern_hint:
        return True
    return any(_match_url_pattern(candidate, pattern_hint) for candidate in url_list)
if not pattern or pattern == "*":
debug(f"[folder:{backend_label}] url search: any-url (limit={limit})")
rows = api.get_files_with_any_url(limit)
else:
debug(
f"[folder:{backend_label}] url search: like={pattern} (limit={limit})"
)
rows = api.get_files_by_url_like(
_url_like_pattern(pattern),
limit
)
for file_hash, file_path_str, size_bytes, ext in rows:
for file_hash, file_path_str, size_bytes, ext, url_raw in rows:
if not file_path_str:
continue
file_path = search_dir / str(file_path_str)
@@ -1059,6 +1123,9 @@ class Folder(Store):
size_bytes = file_path.stat().st_size
except OSError:
size_bytes = None
urls = _parse_url_value(url_raw)
if not urls or not _matches_pattern(urls):
continue
tags = api.get_tags_for_file(file_hash)
entry = _create_entry(
file_path,
@@ -1066,6 +1133,7 @@ class Folder(Store):
size_bytes,
file_hash
)
entry["urls"] = urls
results.append(entry)
if limit is not None and len(results) >= limit:
return results

View File

@@ -466,7 +466,9 @@ class HydrusNetwork(Store):
def _extract_urls(meta_obj: Any) -> list[str]:
if not isinstance(meta_obj, dict):
return []
raw = meta_obj.get("url")
raw = meta_obj.get("known_urls")
if raw is None:
raw = meta_obj.get("url")
if raw is None:
raw = meta_obj.get("urls")
if isinstance(raw, str):
@@ -483,100 +485,178 @@ class HydrusNetwork(Store):
return out
return []
def _extract_search_ids(payload: Any) -> tuple[list[int], list[str]]:
if not isinstance(payload, dict):
return [], []
raw_ids = payload.get("file_ids", [])
raw_hashes = payload.get("hashes", [])
ids_out: list[int] = []
hashes_out: list[str] = []
if isinstance(raw_ids, list):
for item in raw_ids:
try:
if isinstance(item, (int, float)):
ids_out.append(int(item))
continue
if isinstance(item, str) and item.strip().isdigit():
ids_out.append(int(item.strip()))
except Exception:
continue
if isinstance(raw_hashes, list):
for item in raw_hashes:
try:
candidate = str(item or "").strip().lower()
if candidate:
hashes_out.append(candidate)
except Exception:
continue
return ids_out, hashes_out
def _iter_url_filtered_metadata(
url_value: str | None,
want_any: bool,
fetch_limit: int
) -> list[dict[str,
Any]]:
fetch_limit: int,
scan_limit: int | None = None
) -> list[dict[str, Any]]:
"""Best-effort URL search by scanning Hydrus metadata with include_file_url=True."""
# First try a fast system predicate if Hydrus supports it.
candidate_file_ids: list[int] = []
try:
if want_any:
candidate_hashes: list[str] = []
seen_file_ids: set[int] = set()
seen_hashes: set[str] = set()
def _add_candidates(ids: list[int], hashes: list[str]) -> None:
for fid in ids:
if fid in seen_file_ids:
continue
seen_file_ids.add(fid)
candidate_file_ids.append(fid)
for hh in hashes:
if hh in seen_hashes:
continue
seen_hashes.add(hh)
candidate_hashes.append(hh)
predicate_supported = getattr(self, "_has_url_predicate", None)
if predicate_supported is not False:
try:
predicate = "system:has url"
url_search = client.search_files(
tags=[predicate],
return_hashes=False,
return_file_ids=True,
return_hashes=True,
return_file_ids=False,
return_file_count=False,
)
ids = url_search.get("file_ids",
[]) if isinstance(url_search,
dict) else []
if isinstance(ids, list):
candidate_file_ids = [
int(x) for x in ids
if isinstance(x, (int, float,
str)) and str(x).strip().isdigit()
]
except Exception:
candidate_file_ids = []
ids, hashes = _extract_search_ids(url_search)
_add_candidates(ids, hashes)
self._has_url_predicate = True
except Exception as exc:
try:
from API.HydrusNetwork import HydrusRequestError
if not candidate_file_ids:
# Fallback: scan from system:everything and filter by URL substring.
if isinstance(exc, HydrusRequestError) and getattr(exc, "status", None) == 400:
self._has_url_predicate = False
except Exception:
pass
if not candidate_file_ids and not candidate_hashes:
everything = client.search_files(
tags=["system:everything"],
return_hashes=False,
return_file_ids=True,
return_hashes=True,
return_file_ids=False,
return_file_count=False,
)
ids = everything.get("file_ids",
[]) if isinstance(everything,
dict) else []
if isinstance(ids, list):
candidate_file_ids = [
int(x) for x in ids if isinstance(x, (int, float))
]
ids, hashes = _extract_search_ids(everything)
_add_candidates(ids, hashes)
if not candidate_file_ids:
if not candidate_file_ids and not candidate_hashes:
return []
needle = (url_value or "").strip().lower()
chunk_size = 200
out: list[dict[str, Any]] = []
if scan_limit is None:
try:
if not want_any and url_value:
scan_limit = max(200, min(int(fetch_limit), 400))
else:
scan_limit = max(int(fetch_limit) * 5, 1000)
except Exception:
scan_limit = 400 if (not want_any and url_value) else 1000
if scan_limit is not None:
scan_limit = min(int(scan_limit), 10000)
scanned = 0
for start in range(0, len(candidate_file_ids), chunk_size):
def _process_source(items: list[Any], kind: str) -> None:
nonlocal scanned
for start in range(0, len(items), chunk_size):
if len(out) >= fetch_limit:
return
if scan_limit is not None and scanned >= scan_limit:
return
chunk = items[start:start + chunk_size]
if scan_limit is not None:
remaining = scan_limit - scanned
if remaining <= 0:
return
if len(chunk) > remaining:
chunk = chunk[:remaining]
scanned += len(chunk)
try:
if kind == "hashes":
payload = client.fetch_file_metadata(
hashes=chunk,
include_file_url=True,
include_service_keys_to_tags=True,
include_duration=True,
include_size=True,
include_mime=True,
)
else:
payload = client.fetch_file_metadata(
file_ids=chunk,
include_file_url=True,
include_service_keys_to_tags=True,
include_duration=True,
include_size=True,
include_mime=True,
)
except Exception:
continue
metas = payload.get("metadata",
[]) if isinstance(payload,
dict) else []
if not isinstance(metas, list):
continue
for meta in metas:
if len(out) >= fetch_limit:
break
if not isinstance(meta, dict):
continue
urls = _extract_urls(meta)
if not urls:
continue
if want_any:
out.append(meta)
continue
if not needle:
continue
if any(needle in u.lower() for u in urls):
out.append(meta)
continue
sources: list[tuple[str, list[Any]]] = []
if candidate_hashes:
sources.append(("hashes", candidate_hashes))
elif candidate_file_ids:
sources.append(("file_ids", candidate_file_ids))
for kind, items in sources:
if len(out) >= fetch_limit:
break
chunk = candidate_file_ids[start:start + chunk_size]
try:
payload = client.fetch_file_metadata(
file_ids=chunk,
include_file_url=True,
include_service_keys_to_tags=True,
include_duration=True,
include_size=True,
include_mime=True,
)
except Exception:
continue
metas = payload.get("metadata",
[]) if isinstance(payload,
dict) else []
if not isinstance(metas, list):
continue
for meta in metas:
if not isinstance(meta, dict):
continue
urls = _extract_urls(meta)
if not urls:
continue
if want_any:
out.append(meta)
if len(out) >= fetch_limit:
break
continue
if not needle:
continue
if any(needle in u.lower() for u in urls):
out.append(meta)
if len(out) >= fetch_limit:
break
_process_source(items, kind)
return out
@@ -618,6 +698,7 @@ class HydrusNetwork(Store):
# Special case: url:* and url:<value>
metadata_list: list[dict[str, Any]] | None = None
pattern_hint = str(kwargs.get("pattern_hint") or "").strip().lower()
if ":" in query_lower and not query_lower.startswith(":"):
namespace, pattern = query_lower.split(":", 1)
namespace = namespace.strip().lower()
@@ -630,6 +711,12 @@ class HydrusNetwork(Store):
fetch_limit=int(limit) if limit else 100
)
else:
def _clean_url_search_token(value: str | None) -> str:
token = str(value or "").strip().lower()
if not token:
return ""
return token.replace("*", "").replace("?", "")
# Fast-path: exact URL via /add_urls/get_url_files when a full URL is provided.
try:
if pattern.startswith("http://") or pattern.startswith(
@@ -706,10 +793,20 @@ class HydrusNetwork(Store):
# Fallback: substring scan
if metadata_list is None:
search_token = _clean_url_search_token(pattern_hint or pattern)
scan_limit_override: int | None = None
if search_token:
is_domain_only = ("://" not in search_token and "/" not in search_token)
if is_domain_only:
try:
scan_limit_override = max(int(limit or 100) * 20, 2000)
except Exception:
scan_limit_override = 2000
metadata_list = _iter_url_filtered_metadata(
pattern,
search_token,
want_any=False,
fetch_limit=int(limit) if limit else 100
fetch_limit=int(limit) if limit else 100,
scan_limit=scan_limit_override,
)
# Parse the query into tags
@@ -742,26 +839,6 @@ class HydrusNetwork(Store):
# Search files with the tags (unless url: search already produced metadata)
results = []
def _extract_search_ids(payload: Any) -> tuple[list[int], list[str]]:
if not isinstance(payload, dict):
return [], []
raw_ids = payload.get("file_ids", [])
raw_hashes = payload.get("hashes", [])
ids_out: list[int] = []
hashes_out: list[str] = []
if isinstance(raw_ids, list):
for item in raw_ids:
try:
ids_out.append(int(item))
except (TypeError, ValueError):
continue
if isinstance(raw_hashes, list):
hashes_out = [
str(h).strip() for h in raw_hashes
if isinstance(h, str) and str(h).strip()
]
return ids_out, hashes_out
if metadata_list is None:
file_ids: list[int] = []
hashes: list[str] = []