This commit is contained in:
2026-01-17 21:32:44 -08:00
parent 193fa5aec3
commit 3f874af54a
4 changed files with 329 additions and 112 deletions

View File

@@ -21,7 +21,7 @@ from contextlib import contextmanager
from datetime import datetime from datetime import datetime
from pathlib import Path, PurePosixPath from pathlib import Path, PurePosixPath
from threading import RLock from threading import RLock
from typing import Optional, Dict, Any, List, Tuple, Set from typing import Optional, Dict, Any, List, Tuple, Set, Sequence
from SYS.utils import sha256_file, expand_path from SYS.utils import sha256_file, expand_path
from SYS.logger import debug as _debug from SYS.logger import debug as _debug
@@ -3001,6 +3001,47 @@ class DatabaseAPI:
) )
return rows return rows
def get_files_by_url_like_any(
    self,
    like_patterns: Sequence[str],
    limit: Optional[int] = None,
) -> List[tuple]:
    """Return files whose URL metadata matches at least one LIKE pattern.

    Args:
        like_patterns: SQL LIKE patterns. Each entry is stringified and
            stripped; ``None`` and blank entries are dropped. Patterns are
            lower-cased and compared against ``LOWER(m.url)``.
        limit: Maximum number of rows. Any falsy value (``None`` or ``0``)
            falls back to 10000.

    Returns:
        ``(hash, file_path, size, ext, url)`` rows ordered by ``file_path``,
        or an empty list when no usable pattern remains.
    """
    usable: List[str] = []
    for raw in like_patterns or []:
        text = str(raw or "").strip()
        if text:
            usable.append(text)
    if not usable:
        return []

    row_cap = limit or 10000
    mm_debug(
        f"[folder-db] get_files_by_url_like_any start: patterns={len(usable)} limit={row_cap}"
    )

    # One placeholder per pattern; the values themselves are always bound,
    # never interpolated into the SQL text.
    where_or = " OR ".join("LOWER(m.url) LIKE ?" for _ in usable)
    query = f"""
        SELECT f.hash, f.file_path,
        COALESCE((SELECT size FROM metadata WHERE hash = f.hash), 0) as size,
        COALESCE((SELECT ext FROM metadata WHERE hash = f.hash), '') as ext,
        COALESCE(m.url, '') as url
        FROM file f
        JOIN metadata m ON f.hash = m.hash
        WHERE m.url IS NOT NULL
        AND ({where_or})
        ORDER BY f.file_path
        LIMIT ?
    """
    bound_values = tuple(text.lower() for text in usable) + (row_cap,)

    cursor = self.get_cursor()
    cursor.execute(query, bound_values)
    rows = cursor.fetchall()

    mm_debug(
        f"[folder-db] get_files_by_url_like_any done: {len(rows)} row(s)"
    )
    return rows
def get_file_metadata(self, def get_file_metadata(self,
file_hashes: Set[str], file_hashes: Set[str],
limit: Optional[int] = None) -> List[tuple]: limit: Optional[int] = None) -> List[tuple]:

View File

@@ -1071,6 +1071,25 @@ class Folder(Store):
if namespace == "url": if namespace == "url":
pattern_hint = kwargs.get("pattern_hint") pattern_hint = kwargs.get("pattern_hint")
def _pattern_candidates(raw: Any) -> List[str]:
if raw is None:
return []
if isinstance(raw, (list, tuple, set)):
out: List[str] = []
for item in raw:
text = str(item or "").strip()
if text and text not in out:
out.append(text)
return out
if isinstance(raw, str):
text = raw.strip()
return [text] if text else []
return []
pattern_candidates = _pattern_candidates(pattern_hint)
if len(pattern_candidates) > 200:
pattern_candidates = pattern_candidates[:200]
def _parse_url_value(raw: Any) -> list[str]: def _parse_url_value(raw: Any) -> list[str]:
if raw is None: if raw is None:
return [] return []
@@ -1094,14 +1113,24 @@ class Folder(Store):
return [] return []
def _matches_pattern(url_list: list[str]) -> bool: def _matches_pattern(url_list: list[str]) -> bool:
if not pattern_hint: if not pattern_candidates:
return True return True
for candidate_url in url_list: for candidate_url in url_list:
if _match_url_pattern(candidate_url, pattern_hint): for pat in pattern_candidates:
if _match_url_pattern(candidate_url, pat):
return True return True
return False return False
if not pattern or pattern == "*": if not pattern or pattern == "*":
if pattern_candidates:
debug(
f"[folder:{backend_label}] url search: any-url (limit={limit}) pattern_hint={len(pattern_candidates)}"
)
rows = api.get_files_by_url_like_any(
[_url_like_pattern(p) for p in pattern_candidates],
limit,
)
else:
debug(f"[folder:{backend_label}] url search: any-url (limit={limit})") debug(f"[folder:{backend_label}] url search: any-url (limit={limit})")
rows = api.get_files_with_any_url(limit) rows = api.get_files_with_any_url(limit)
else: else:

View File

@@ -5,7 +5,7 @@ import sys
import tempfile import tempfile
import shutil import shutil
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Sequence, Tuple
from urllib.parse import quote from urllib.parse import quote
@@ -516,7 +516,8 @@ class HydrusNetwork(Store):
url_value: str | None, url_value: str | None,
want_any: bool, want_any: bool,
fetch_limit: int, fetch_limit: int,
scan_limit: int | None = None scan_limit: int | None = None,
needles: Optional[Sequence[str]] = None,
) -> list[dict[str, Any]]: ) -> list[dict[str, Any]]:
"""Best-effort URL search by scanning Hydrus metadata with include_file_url=True.""" """Best-effort URL search by scanning Hydrus metadata with include_file_url=True."""
@@ -572,17 +573,29 @@ class HydrusNetwork(Store):
if not candidate_file_ids and not candidate_hashes: if not candidate_file_ids and not candidate_hashes:
return [] return []
needle_list: list[str] = []
if isinstance(needles, (list, tuple, set)):
for item in needles:
text = str(item or "").strip().lower()
if text and text not in needle_list:
needle_list.append(text)
if not needle_list:
needle = (url_value or "").strip().lower() needle = (url_value or "").strip().lower()
if needle:
needle_list = [needle]
chunk_size = 200 chunk_size = 200
out: list[dict[str, Any]] = [] out: list[dict[str, Any]] = []
if scan_limit is None: if scan_limit is None:
try: try:
if not want_any and url_value: if not want_any and needle_list:
if len(needle_list) > 1:
scan_limit = max(int(fetch_limit) * 20, 2000)
else:
scan_limit = max(200, min(int(fetch_limit), 400)) scan_limit = max(200, min(int(fetch_limit), 400))
else: else:
scan_limit = max(int(fetch_limit) * 5, 1000) scan_limit = max(int(fetch_limit) * 5, 1000)
except Exception: except Exception:
scan_limit = 400 if (not want_any and url_value) else 1000 scan_limit = 400 if (not want_any and needle_list) else 1000
if scan_limit is not None: if scan_limit is not None:
scan_limit = min(int(scan_limit), 10000) scan_limit = min(int(scan_limit), 10000)
scanned = 0 scanned = 0
@@ -641,9 +654,9 @@ class HydrusNetwork(Store):
if want_any: if want_any:
out.append(meta) out.append(meta)
continue continue
if not needle: if not needle_list:
continue continue
if any(needle in u.lower() for u in urls): if any(any(n in u.lower() for n in needle_list) for u in urls):
out.append(meta) out.append(meta)
continue continue
@@ -698,13 +711,32 @@ class HydrusNetwork(Store):
# Special case: url:* and url:<value> # Special case: url:* and url:<value>
metadata_list: list[dict[str, Any]] | None = None metadata_list: list[dict[str, Any]] | None = None
pattern_hint = str(kwargs.get("pattern_hint") or "").strip().lower() pattern_hint_raw = kwargs.get("pattern_hint")
pattern_hints: list[str] = []
if isinstance(pattern_hint_raw, (list, tuple, set)):
for item in pattern_hint_raw:
text = str(item or "").strip().lower()
if text and text not in pattern_hints:
pattern_hints.append(text)
elif isinstance(pattern_hint_raw, str):
text = pattern_hint_raw.strip().lower()
if text:
pattern_hints.append(text)
pattern_hint = pattern_hints[0] if pattern_hints else ""
if ":" in query_lower and not query_lower.startswith(":"): if ":" in query_lower and not query_lower.startswith(":"):
namespace, pattern = query_lower.split(":", 1) namespace, pattern = query_lower.split(":", 1)
namespace = namespace.strip().lower() namespace = namespace.strip().lower()
pattern = pattern.strip() pattern = pattern.strip()
if namespace == "url": if namespace == "url":
if not pattern or pattern == "*": if not pattern or pattern == "*":
if pattern_hints:
metadata_list = _iter_url_filtered_metadata(
None,
want_any=False,
fetch_limit=int(limit) if limit else 100,
needles=pattern_hints,
)
else:
metadata_list = _iter_url_filtered_metadata( metadata_list = _iter_url_filtered_metadata(
None, None,
want_any=True, want_any=True,
@@ -807,6 +839,7 @@ class HydrusNetwork(Store):
want_any=False, want_any=False,
fetch_limit=int(limit) if limit else 100, fetch_limit=int(limit) if limit else 100,
scan_limit=scan_limit_override, scan_limit=scan_limit_override,
needles=pattern_hints if pattern_hints else None,
) )
# Parse the query into tags # Parse the query into tags

View File

@@ -3358,6 +3358,113 @@ def check_url_exists_in_storage(
_mark_preflight_checked() _mark_preflight_checked()
return True return True
bulk_mode = len(unique_urls) >= 8
def _build_bulk_patterns(needles_map: Dict[str, List[str]], max_per_url: int = 3, max_total: int = 240) -> List[str]:
patterns: List[str] = []
for _original, needles in needles_map.items():
for needle in (needles or [])[:max_per_url]:
needle_text = str(needle or "").strip()
if not needle_text:
continue
if needle_text not in patterns:
patterns.append(needle_text)
if len(patterns) >= max_total:
return patterns
return patterns
bulk_patterns = _build_bulk_patterns(url_needles)
def _match_normalized_url(pattern_text: str, candidate_url: str) -> bool:
    """Report whether a needle matches a stored URL after normalization.

    Both inputs go through ``_normalize_url_for_search`` (defined elsewhere;
    its exact rules are not visible here). The match succeeds when the
    normalized needle equals the normalized candidate or is contained in it.
    If either normalized value is empty, the result is ``False``.
    """
    normalized_pattern = _normalize_url_for_search(pattern_text)
    normalized_candidate = _normalize_url_for_search(candidate_url)
    if normalized_pattern and normalized_candidate:
        return (
            normalized_pattern == normalized_candidate
            or normalized_pattern in normalized_candidate
        )
    return False
def _extract_urls_from_hit(
    hit: Any,
    backend: Any,
    *,
    allow_backend_lookup: bool = True,
) -> List[str]:
    """Gather the URLs associated with a search hit.

    The hit's ``known_urls``, ``urls`` and ``url`` fields are tried in that
    order and the first truthy one is used. When that yields nothing and
    ``allow_backend_lookup`` is true, the hit's hash (``hash``, ``file_hash``
    or ``sha256``) is passed to ``backend.get_url``.

    Args:
        hit: Search result object read through ``get_field``.
        backend: Store backend exposing ``get_url(hash)``.
        allow_backend_lookup: Set to ``False`` to skip the per-hit backend
            call and rely on the hit's own fields only.

    Returns:
        Stripped, non-empty URL strings; empty when nothing was found or a
        lookup failed.
    """

    def _append_clean(target: List[str], raw: Any) -> None:
        # Accepts one string or a list/tuple/set; blanks and non-strings are skipped.
        if isinstance(raw, str):
            if raw.strip():
                target.append(raw.strip())
        elif isinstance(raw, (list, tuple, set)):
            for entry in raw:
                if isinstance(entry, str) and entry.strip():
                    target.append(entry.strip())

    collected: List[str] = []
    try:
        raw_urls = get_field(hit, "known_urls") or get_field(hit, "urls") or get_field(hit, "url")
        _append_clean(collected, raw_urls)
    except Exception:
        # A failed field read discards anything gathered so far.
        collected = []

    if collected or not allow_backend_lookup:
        return collected

    try:
        file_hash = get_field(hit, "hash") or get_field(hit, "file_hash") or get_field(hit, "sha256") or ""
    except Exception:
        file_hash = ""

    if not file_hash:
        return collected

    try:
        _append_clean(collected, backend.get_url(str(file_hash)))
    except Exception:
        # Best-effort lookup: a backend failure leaves the list as it is.
        pass
    return collected
def _build_display_row_for_hit(
    hit: Any,
    backend_name: str,
    original_url: str,
) -> Dict[str, Any]:
    """Build the result-table row describing an existing match for a URL.

    Field values come from ``SYS.result_table.build_display_row`` when it is
    importable and succeeds, and otherwise from the hit itself via
    ``get_field``.

    Args:
        hit: Search result object for the matching file.
        backend_name: Used as the store name when the hit carries none.
        original_url: The URL being checked; echoed back in the row.

    Returns:
        A dict with ``title``, ``store``, ``hash``, ``ext``, ``size``,
        ``url`` and a ``columns`` list of ``(label, value)`` pairs.
    """
    try:
        from SYS.result_table import build_display_row

        extracted = build_display_row(hit, keys=["title", "store", "hash", "ext", "size"])
    except Exception:
        extracted = {}
    # Normalize once. A non-dict result used to make extracted.get() raise
    # inside the lookups below, discarding a title/hash present on the hit.
    if not isinstance(extracted, dict):
        extracted = {}

    try:
        title = extracted.get("title") or get_field(hit, "title") or get_field(hit, "name") or get_field(hit, "target") or get_field(hit, "path") or "(exists)"
    except Exception:
        title = "(exists)"

    try:
        file_hash = extracted.get("hash") or get_field(hit, "hash") or get_field(hit, "file_hash") or get_field(hit, "sha256") or ""
    except Exception:
        file_hash = ""

    # Looked up once and guarded like the other fields, so a failing field
    # read falls back to the backend name instead of propagating.
    try:
        store = get_field(hit, "store") or backend_name
    except Exception:
        store = backend_name

    title_text = str(title)
    store_text = str(store)
    hash_text = str(file_hash or "")
    ext_text = str(extracted.get("ext") or "")
    size_val = extracted.get("size")

    return {
        "title": title_text,
        "store": store_text,
        "hash": hash_text,
        "ext": ext_text,
        "size": size_val,
        "url": original_url,
        "columns": [
            ("Title", title_text),
            ("Store", store_text),
            ("Hash", hash_text),
            ("Ext", ext_text),
            ("Size", size_val),
            ("URL", original_url),
        ],
    }
def _search_backend_url_hits( def _search_backend_url_hits(
backend: Any, backend: Any,
backend_name: str, backend_name: str,
@@ -3379,15 +3486,6 @@ def check_url_exists_in_storage(
continue continue
if not backend_hits: if not backend_hits:
def _match_normalized_url(pattern_text: str, candidate_url: str) -> bool:
pattern_norm = _normalize_url_for_search(pattern_text)
candidate_norm = _normalize_url_for_search(candidate_url)
if not pattern_norm or not candidate_norm:
return False
if pattern_norm == candidate_norm:
return True
return pattern_norm in candidate_norm
fallback_hits: List[Dict[str, Any]] = [] fallback_hits: List[Dict[str, Any]] = []
try: try:
fallback_hits = backend.search("url:*", limit=200) or [] fallback_hits = backend.search("url:*", limit=200) or []
@@ -3395,31 +3493,7 @@ def check_url_exists_in_storage(
fallback_hits = [] fallback_hits = []
for hit in fallback_hits: for hit in fallback_hits:
url_values: List[str] = [] url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=True)
try:
raw_urls = get_field(hit, "known_urls") or get_field(hit, "urls") or get_field(hit, "url")
if isinstance(raw_urls, str) and raw_urls.strip():
url_values.append(raw_urls.strip())
elif isinstance(raw_urls, (list, tuple, set)):
for item in raw_urls:
if isinstance(item, str) and item.strip():
url_values.append(item.strip())
except Exception:
url_values = []
if not url_values:
try:
file_hash = hit.get("hash") if isinstance(hit, dict) else None
if file_hash:
fetched = backend.get_url(str(file_hash))
if isinstance(fetched, str) and fetched.strip():
url_values.append(fetched.strip())
elif isinstance(fetched, (list, tuple, set)):
for item in fetched:
if isinstance(item, str) and item.strip():
url_values.append(item.strip())
except Exception:
pass
if not url_values: if not url_values:
continue continue
@@ -3436,68 +3510,12 @@ def check_url_exists_in_storage(
if not matched: if not matched:
continue continue
title = "(exists)" return _build_display_row_for_hit(hit, backend_name, original_url)
try:
title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)"
except Exception:
title = "(exists)"
file_hash = ""
try:
file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or ""
except Exception:
file_hash = ""
return {
"title": str(title),
"store": str(hit.get("store") or backend_name),
"hash": str(file_hash or ""),
"ext": "",
"size": None,
"url": original_url,
"columns": [
("Title", str(title)),
("Store", str(hit.get("store") or backend_name)),
("Hash", str(file_hash or "")),
("URL", original_url),
],
}
return None return None
hit = backend_hits[0] hit = backend_hits[0]
title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)" return _build_display_row_for_hit(hit, backend_name, original_url)
file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or ""
try:
from SYS.result_table import build_display_row
extracted = build_display_row(hit, keys=["title", "store", "hash", "ext", "size"])
except Exception:
extracted = {}
extracted["title"] = str(title)
extracted["store"] = str(hit.get("store") or backend_name)
extracted["hash"] = str(file_hash or "")
ext = extracted.get("ext")
size_val = extracted.get("size")
return {
"title": str(title),
"store": str(hit.get("store") or backend_name),
"hash": str(file_hash or ""),
"ext": str(ext or ""),
"size": size_val,
"url": original_url,
"columns": [
("Title", str(title)),
("Store", str(hit.get("store") or backend_name)),
("Hash", str(file_hash or "")),
("Ext", str(ext or "")),
("Size", size_val),
("URL", original_url),
],
}
backend_names: List[str] = [] backend_names: List[str] = []
try: try:
@@ -3558,6 +3576,54 @@ def check_url_exists_in_storage(
if not hydrus_available: if not hydrus_available:
debug("Bulk URL preflight: hydrus availability check failed; attempting best-effort lookup") debug("Bulk URL preflight: hydrus availability check failed; attempting best-effort lookup")
if bulk_mode and bulk_patterns:
bulk_hits: Optional[List[Any]] = None
bulk_limit = min(2000, max(200, len(unique_urls) * 8))
try:
bulk_hits = backend.search(
"url:*",
limit=bulk_limit,
pattern_hint=bulk_patterns,
) or []
except Exception:
try:
bulk_hits = backend.search("url:*", limit=bulk_limit) or []
except Exception:
bulk_hits = None
if bulk_hits is not None:
for hit in bulk_hits:
if len(match_rows) >= max_rows:
break
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False)
if not url_values:
continue
for original_url, needles in url_needles.items():
if len(match_rows) >= max_rows:
break
if (original_url, str(backend_name)) in seen_pairs:
continue
matched = False
for url_value in url_values:
for needle in (needles or []):
if _match_normalized_url(str(needle or ""), str(url_value or "")):
matched = True
break
if matched:
break
if not matched:
continue
seen_pairs.add((original_url, str(backend_name)))
matched_urls.add(original_url)
match_rows.append(
_build_display_row_for_hit(hit, str(backend_name), original_url)
)
continue
for original_url, needles in url_needles.items(): for original_url, needles in url_needles.items():
if len(match_rows) >= max_rows: if len(match_rows) >= max_rows:
break break
@@ -3622,6 +3688,54 @@ def check_url_exists_in_storage(
match_rows.append(display_row) match_rows.append(display_row)
continue continue
if bulk_mode and bulk_patterns:
bulk_hits: Optional[List[Any]] = None
bulk_limit = min(2000, max(200, len(unique_urls) * 8))
try:
bulk_hits = backend.search(
"url:*",
limit=bulk_limit,
pattern_hint=bulk_patterns,
) or []
except Exception:
try:
bulk_hits = backend.search("url:*", limit=bulk_limit) or []
except Exception:
bulk_hits = None
if bulk_hits is not None:
for hit in bulk_hits:
if len(match_rows) >= max_rows:
break
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False)
if not url_values:
continue
for original_url, needles in url_needles.items():
if len(match_rows) >= max_rows:
break
if (original_url, str(backend_name)) in seen_pairs:
continue
matched = False
for url_value in url_values:
for needle in (needles or []):
if _match_normalized_url(str(needle or ""), str(url_value or "")):
matched = True
break
if matched:
break
if not matched:
continue
seen_pairs.add((original_url, str(backend_name)))
matched_urls.add(original_url)
match_rows.append(
_build_display_row_for_hit(hit, str(backend_name), original_url)
)
continue
for original_url, needles in url_needles.items(): for original_url, needles in url_needles.items():
if len(match_rows) >= max_rows: if len(match_rows) >= max_rows:
break break