This commit is contained in:
2026-01-17 21:32:44 -08:00
parent 193fa5aec3
commit 3f874af54a
4 changed files with 329 additions and 112 deletions

View File

@@ -21,7 +21,7 @@ from contextlib import contextmanager
from datetime import datetime
from pathlib import Path, PurePosixPath
from threading import RLock
from typing import Optional, Dict, Any, List, Tuple, Set
from typing import Optional, Dict, Any, List, Tuple, Set, Sequence
from SYS.utils import sha256_file, expand_path
from SYS.logger import debug as _debug
@@ -3001,6 +3001,47 @@ class DatabaseAPI:
)
return rows
def get_files_by_url_like_any(
    self,
    like_patterns: Sequence[str],
    limit: Optional[int] = None,
) -> List[tuple]:
    """Get files whose URL metadata matches any of the provided LIKE patterns.
    Returns (hash, file_path, size, ext, url) tuples.
    """
    # Drop blank/None entries; duplicates are intentionally kept so the
    # SQL placeholder count always matches the parameter count.
    cleaned: List[str] = []
    for raw in (like_patterns or []):
        text = str(raw or "").strip()
        if text:
            cleaned.append(text)
    if not cleaned:
        return []
    mm_debug(
        f"[folder-db] get_files_by_url_like_any start: patterns={len(cleaned)} limit={limit or 10000}"
    )
    cur = self.get_cursor()
    # One case-insensitive LIKE test per pattern, OR-ed together.
    like_clause = " OR ".join("LOWER(m.url) LIKE ?" for _ in cleaned)
    sql = f"""
        SELECT f.hash, f.file_path,
               COALESCE((SELECT size FROM metadata WHERE hash = f.hash), 0) as size,
               COALESCE((SELECT ext FROM metadata WHERE hash = f.hash), '') as ext,
               COALESCE(m.url, '') as url
        FROM file f
        JOIN metadata m ON f.hash = m.hash
        WHERE m.url IS NOT NULL
          AND ({like_clause})
        ORDER BY f.file_path
        LIMIT ?
    """
    # Patterns are lower-cased to pair with LOWER(m.url); the final
    # parameter is the row cap (default 10000 when no limit is given).
    params = [text.lower() for text in cleaned]
    params.append(limit or 10000)
    cur.execute(sql, tuple(params))
    results = cur.fetchall()
    mm_debug(
        f"[folder-db] get_files_by_url_like_any done: {len(results)} row(s)"
    )
    return results
def get_file_metadata(self,
file_hashes: Set[str],
limit: Optional[int] = None) -> List[tuple]:

View File

@@ -1071,6 +1071,25 @@ class Folder(Store):
if namespace == "url":
pattern_hint = kwargs.get("pattern_hint")
def _pattern_candidates(raw: Any) -> List[str]:
if raw is None:
return []
if isinstance(raw, (list, tuple, set)):
out: List[str] = []
for item in raw:
text = str(item or "").strip()
if text and text not in out:
out.append(text)
return out
if isinstance(raw, str):
text = raw.strip()
return [text] if text else []
return []
pattern_candidates = _pattern_candidates(pattern_hint)
if len(pattern_candidates) > 200:
pattern_candidates = pattern_candidates[:200]
def _parse_url_value(raw: Any) -> list[str]:
if raw is None:
return []
@@ -1094,16 +1113,26 @@ class Folder(Store):
return []
def _matches_pattern(url_list: list[str]) -> bool:
if not pattern_hint:
if not pattern_candidates:
return True
for candidate_url in url_list:
if _match_url_pattern(candidate_url, pattern_hint):
return True
for pat in pattern_candidates:
if _match_url_pattern(candidate_url, pat):
return True
return False
if not pattern or pattern == "*":
debug(f"[folder:{backend_label}] url search: any-url (limit={limit})")
rows = api.get_files_with_any_url(limit)
if pattern_candidates:
debug(
f"[folder:{backend_label}] url search: any-url (limit={limit}) pattern_hint={len(pattern_candidates)}"
)
rows = api.get_files_by_url_like_any(
[_url_like_pattern(p) for p in pattern_candidates],
limit,
)
else:
debug(f"[folder:{backend_label}] url search: any-url (limit={limit})")
rows = api.get_files_with_any_url(limit)
else:
debug(
f"[folder:{backend_label}] url search: like={pattern} (limit={limit})"

View File

@@ -5,7 +5,7 @@ import sys
import tempfile
import shutil
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, Dict, List, Optional, Sequence, Tuple
from urllib.parse import quote
@@ -516,7 +516,8 @@ class HydrusNetwork(Store):
url_value: str | None,
want_any: bool,
fetch_limit: int,
scan_limit: int | None = None
scan_limit: int | None = None,
needles: Optional[Sequence[str]] = None,
) -> list[dict[str, Any]]:
"""Best-effort URL search by scanning Hydrus metadata with include_file_url=True."""
@@ -572,17 +573,29 @@ class HydrusNetwork(Store):
if not candidate_file_ids and not candidate_hashes:
return []
needle = (url_value or "").strip().lower()
needle_list: list[str] = []
if isinstance(needles, (list, tuple, set)):
for item in needles:
text = str(item or "").strip().lower()
if text and text not in needle_list:
needle_list.append(text)
if not needle_list:
needle = (url_value or "").strip().lower()
if needle:
needle_list = [needle]
chunk_size = 200
out: list[dict[str, Any]] = []
if scan_limit is None:
try:
if not want_any and url_value:
scan_limit = max(200, min(int(fetch_limit), 400))
if not want_any and needle_list:
if len(needle_list) > 1:
scan_limit = max(int(fetch_limit) * 20, 2000)
else:
scan_limit = max(200, min(int(fetch_limit), 400))
else:
scan_limit = max(int(fetch_limit) * 5, 1000)
except Exception:
scan_limit = 400 if (not want_any and url_value) else 1000
scan_limit = 400 if (not want_any and needle_list) else 1000
if scan_limit is not None:
scan_limit = min(int(scan_limit), 10000)
scanned = 0
@@ -641,9 +654,9 @@ class HydrusNetwork(Store):
if want_any:
out.append(meta)
continue
if not needle:
if not needle_list:
continue
if any(needle in u.lower() for u in urls):
if any(any(n in u.lower() for n in needle_list) for u in urls):
out.append(meta)
continue
@@ -698,18 +711,37 @@ class HydrusNetwork(Store):
# Special case: url:* and url:<value>
metadata_list: list[dict[str, Any]] | None = None
pattern_hint = str(kwargs.get("pattern_hint") or "").strip().lower()
pattern_hint_raw = kwargs.get("pattern_hint")
pattern_hints: list[str] = []
if isinstance(pattern_hint_raw, (list, tuple, set)):
for item in pattern_hint_raw:
text = str(item or "").strip().lower()
if text and text not in pattern_hints:
pattern_hints.append(text)
elif isinstance(pattern_hint_raw, str):
text = pattern_hint_raw.strip().lower()
if text:
pattern_hints.append(text)
pattern_hint = pattern_hints[0] if pattern_hints else ""
if ":" in query_lower and not query_lower.startswith(":"):
namespace, pattern = query_lower.split(":", 1)
namespace = namespace.strip().lower()
pattern = pattern.strip()
if namespace == "url":
if not pattern or pattern == "*":
metadata_list = _iter_url_filtered_metadata(
None,
want_any=True,
fetch_limit=int(limit) if limit else 100
)
if pattern_hints:
metadata_list = _iter_url_filtered_metadata(
None,
want_any=False,
fetch_limit=int(limit) if limit else 100,
needles=pattern_hints,
)
else:
metadata_list = _iter_url_filtered_metadata(
None,
want_any=True,
fetch_limit=int(limit) if limit else 100
)
else:
def _clean_url_search_token(value: str | None) -> str:
token = str(value or "").strip().lower()
@@ -807,6 +839,7 @@ class HydrusNetwork(Store):
want_any=False,
fetch_limit=int(limit) if limit else 100,
scan_limit=scan_limit_override,
needles=pattern_hints if pattern_hints else None,
)
# Parse the query into tags

View File

@@ -3358,6 +3358,113 @@ def check_url_exists_in_storage(
_mark_preflight_checked()
return True
bulk_mode = len(unique_urls) >= 8
def _build_bulk_patterns(needles_map: Dict[str, List[str]], max_per_url: int = 3, max_total: int = 240) -> List[str]:
patterns: List[str] = []
for _original, needles in needles_map.items():
for needle in (needles or [])[:max_per_url]:
needle_text = str(needle or "").strip()
if not needle_text:
continue
if needle_text not in patterns:
patterns.append(needle_text)
if len(patterns) >= max_total:
return patterns
return patterns
bulk_patterns = _build_bulk_patterns(url_needles)
def _match_normalized_url(pattern_text: str, candidate_url: str) -> bool:
    """True when the normalized pattern equals, or is contained in, the
    normalized candidate URL. Blank normalizations never match."""
    normalized_pattern = _normalize_url_for_search(pattern_text)
    normalized_candidate = _normalize_url_for_search(candidate_url)
    if not normalized_pattern or not normalized_candidate:
        return False
    if normalized_pattern == normalized_candidate:
        return True
    return normalized_pattern in normalized_candidate
def _extract_urls_from_hit(
    hit: Any,
    backend: Any,
    *,
    allow_backend_lookup: bool = True,
) -> List[str]:
    """Collect URL strings attached to a search hit.

    First reads the hit's own URL fields (known_urls / urls / url). If
    none are present and allow_backend_lookup is True, falls back to
    asking the backend for URLs by the hit's hash. All failures are
    swallowed; the result may be empty.
    """
    def _collect(value: Any, into: List[str]) -> None:
        # Accept either a single string or a container of strings;
        # everything else is ignored.
        if isinstance(value, str):
            if value.strip():
                into.append(value.strip())
        elif isinstance(value, (list, tuple, set)):
            for entry in value:
                if isinstance(entry, str) and entry.strip():
                    into.append(entry.strip())

    found: List[str] = []
    try:
        _collect(
            get_field(hit, "known_urls") or get_field(hit, "urls") or get_field(hit, "url"),
            found,
        )
    except Exception:
        found = []
    if found or not allow_backend_lookup:
        return found
    # Fallback: resolve URLs through the backend using the file hash.
    try:
        hit_hash = get_field(hit, "hash") or get_field(hit, "file_hash") or get_field(hit, "sha256") or ""
    except Exception:
        hit_hash = ""
    if hit_hash:
        try:
            _collect(backend.get_url(str(hit_hash)), found)
        except Exception:
            pass
    return found
def _build_display_row_for_hit(
    hit: Any,
    backend_name: str,
    original_url: str,
) -> Dict[str, Any]:
    """Build a uniform display row (flat keys plus ordered columns) for a
    URL-match hit.

    Falls back to "(exists)" for the title and empty strings for missing
    fields so the row is always renderable. Each display string is
    computed exactly once so the flat keys and the "columns" list can
    never disagree (the original computed the store lookup and the str
    conversions twice, once per representation).
    """
    try:
        from SYS.result_table import build_display_row
        extracted = build_display_row(hit, keys=["title", "store", "hash", "ext", "size"])
    except Exception:
        extracted = {}
    try:
        title = extracted.get("title") or get_field(hit, "title") or get_field(hit, "name") or get_field(hit, "target") or get_field(hit, "path") or "(exists)"
    except Exception:
        title = "(exists)"
    try:
        file_hash = extracted.get("hash") or get_field(hit, "hash") or get_field(hit, "file_hash") or get_field(hit, "sha256") or ""
    except Exception:
        file_hash = ""
    ext = extracted.get("ext") if isinstance(extracted, dict) else ""
    size_val = extracted.get("size") if isinstance(extracted, dict) else None
    # Single source of truth for every rendered string.
    title_text = str(title)
    store_text = str(get_field(hit, "store") or backend_name)
    hash_text = str(file_hash or "")
    ext_text = str(ext or "")
    return {
        "title": title_text,
        "store": store_text,
        "hash": hash_text,
        "ext": ext_text,
        "size": size_val,
        "url": original_url,
        "columns": [
            ("Title", title_text),
            ("Store", store_text),
            ("Hash", hash_text),
            ("Ext", ext_text),
            ("Size", size_val),
            ("URL", original_url),
        ],
    }
def _search_backend_url_hits(
backend: Any,
backend_name: str,
@@ -3379,15 +3486,6 @@ def check_url_exists_in_storage(
continue
if not backend_hits:
def _match_normalized_url(pattern_text: str, candidate_url: str) -> bool:
pattern_norm = _normalize_url_for_search(pattern_text)
candidate_norm = _normalize_url_for_search(candidate_url)
if not pattern_norm or not candidate_norm:
return False
if pattern_norm == candidate_norm:
return True
return pattern_norm in candidate_norm
fallback_hits: List[Dict[str, Any]] = []
try:
fallback_hits = backend.search("url:*", limit=200) or []
@@ -3395,31 +3493,7 @@ def check_url_exists_in_storage(
fallback_hits = []
for hit in fallback_hits:
url_values: List[str] = []
try:
raw_urls = get_field(hit, "known_urls") or get_field(hit, "urls") or get_field(hit, "url")
if isinstance(raw_urls, str) and raw_urls.strip():
url_values.append(raw_urls.strip())
elif isinstance(raw_urls, (list, tuple, set)):
for item in raw_urls:
if isinstance(item, str) and item.strip():
url_values.append(item.strip())
except Exception:
url_values = []
if not url_values:
try:
file_hash = hit.get("hash") if isinstance(hit, dict) else None
if file_hash:
fetched = backend.get_url(str(file_hash))
if isinstance(fetched, str) and fetched.strip():
url_values.append(fetched.strip())
elif isinstance(fetched, (list, tuple, set)):
for item in fetched:
if isinstance(item, str) and item.strip():
url_values.append(item.strip())
except Exception:
pass
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=True)
if not url_values:
continue
@@ -3436,68 +3510,12 @@ def check_url_exists_in_storage(
if not matched:
continue
title = "(exists)"
try:
title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)"
except Exception:
title = "(exists)"
file_hash = ""
try:
file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or ""
except Exception:
file_hash = ""
return {
"title": str(title),
"store": str(hit.get("store") or backend_name),
"hash": str(file_hash or ""),
"ext": "",
"size": None,
"url": original_url,
"columns": [
("Title", str(title)),
("Store", str(hit.get("store") or backend_name)),
("Hash", str(file_hash or "")),
("URL", original_url),
],
}
return _build_display_row_for_hit(hit, backend_name, original_url)
return None
hit = backend_hits[0]
title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)"
file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or ""
try:
from SYS.result_table import build_display_row
extracted = build_display_row(hit, keys=["title", "store", "hash", "ext", "size"])
except Exception:
extracted = {}
extracted["title"] = str(title)
extracted["store"] = str(hit.get("store") or backend_name)
extracted["hash"] = str(file_hash or "")
ext = extracted.get("ext")
size_val = extracted.get("size")
return {
"title": str(title),
"store": str(hit.get("store") or backend_name),
"hash": str(file_hash or ""),
"ext": str(ext or ""),
"size": size_val,
"url": original_url,
"columns": [
("Title", str(title)),
("Store", str(hit.get("store") or backend_name)),
("Hash", str(file_hash or "")),
("Ext", str(ext or "")),
("Size", size_val),
("URL", original_url),
],
}
return _build_display_row_for_hit(hit, backend_name, original_url)
backend_names: List[str] = []
try:
@@ -3558,6 +3576,54 @@ def check_url_exists_in_storage(
if not hydrus_available:
debug("Bulk URL preflight: hydrus availability check failed; attempting best-effort lookup")
if bulk_mode and bulk_patterns:
bulk_hits: Optional[List[Any]] = None
bulk_limit = min(2000, max(200, len(unique_urls) * 8))
try:
bulk_hits = backend.search(
"url:*",
limit=bulk_limit,
pattern_hint=bulk_patterns,
) or []
except Exception:
try:
bulk_hits = backend.search("url:*", limit=bulk_limit) or []
except Exception:
bulk_hits = None
if bulk_hits is not None:
for hit in bulk_hits:
if len(match_rows) >= max_rows:
break
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False)
if not url_values:
continue
for original_url, needles in url_needles.items():
if len(match_rows) >= max_rows:
break
if (original_url, str(backend_name)) in seen_pairs:
continue
matched = False
for url_value in url_values:
for needle in (needles or []):
if _match_normalized_url(str(needle or ""), str(url_value or "")):
matched = True
break
if matched:
break
if not matched:
continue
seen_pairs.add((original_url, str(backend_name)))
matched_urls.add(original_url)
match_rows.append(
_build_display_row_for_hit(hit, str(backend_name), original_url)
)
continue
for original_url, needles in url_needles.items():
if len(match_rows) >= max_rows:
break
@@ -3622,6 +3688,54 @@ def check_url_exists_in_storage(
match_rows.append(display_row)
continue
if bulk_mode and bulk_patterns:
bulk_hits: Optional[List[Any]] = None
bulk_limit = min(2000, max(200, len(unique_urls) * 8))
try:
bulk_hits = backend.search(
"url:*",
limit=bulk_limit,
pattern_hint=bulk_patterns,
) or []
except Exception:
try:
bulk_hits = backend.search("url:*", limit=bulk_limit) or []
except Exception:
bulk_hits = None
if bulk_hits is not None:
for hit in bulk_hits:
if len(match_rows) >= max_rows:
break
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False)
if not url_values:
continue
for original_url, needles in url_needles.items():
if len(match_rows) >= max_rows:
break
if (original_url, str(backend_name)) in seen_pairs:
continue
matched = False
for url_value in url_values:
for needle in (needles or []):
if _match_normalized_url(str(needle or ""), str(url_value or "")):
matched = True
break
if matched:
break
if not matched:
continue
seen_pairs.add((original_url, str(backend_name)))
matched_urls.add(original_url)
match_rows.append(
_build_display_row_for_hit(hit, str(backend_name), original_url)
)
continue
for original_url, needles in url_needles.items():
if len(match_rows) >= max_rows:
break