f
This commit is contained in:
@@ -21,7 +21,7 @@ from contextlib import contextmanager
|
||||
from datetime import datetime
|
||||
from pathlib import Path, PurePosixPath
|
||||
from threading import RLock
|
||||
from typing import Optional, Dict, Any, List, Tuple, Set
|
||||
from typing import Optional, Dict, Any, List, Tuple, Set, Sequence
|
||||
|
||||
from SYS.utils import sha256_file, expand_path
|
||||
from SYS.logger import debug as _debug
|
||||
@@ -3001,6 +3001,47 @@ class DatabaseAPI:
|
||||
)
|
||||
return rows
|
||||
|
||||
def get_files_by_url_like_any(
    self,
    like_patterns: Sequence[str],
    limit: Optional[int] = None,
) -> List[tuple]:
    """Get files whose URL metadata matches any of the provided LIKE patterns.

    Patterns are stripped, lower-cased and de-duplicated (first occurrence
    wins) before being bound, so repeated or case-variant patterns do not
    inflate the OR chain or the number of bound SQL parameters.

    Args:
        like_patterns: SQL LIKE patterns compared against ``LOWER(m.url)``.
            Blank and ``None`` entries are ignored.
        limit: Maximum number of rows; a falsy value falls back to 10000.

    Returns:
        (hash, file_path, size, ext, url) tuples ordered by file_path, or an
        empty list when no usable pattern was supplied.
    """
    lowered = (str(p or "").strip().lower() for p in (like_patterns or []))
    # dict.fromkeys keeps insertion order while dropping duplicates.
    patterns = list(dict.fromkeys(p for p in lowered if p))
    if not patterns:
        return []

    row_limit = limit or 10000
    mm_debug(
        f"[folder-db] get_files_by_url_like_any start: patterns={len(patterns)} limit={row_limit}"
    )
    cursor = self.get_cursor()
    # One placeholder per pattern; the values themselves are always bound,
    # never interpolated into the SQL text.
    where_or = " OR ".join(["LOWER(m.url) LIKE ?"] * len(patterns))
    query = f"""
        SELECT f.hash, f.file_path,
               COALESCE((SELECT size FROM metadata WHERE hash = f.hash), 0) as size,
               COALESCE((SELECT ext FROM metadata WHERE hash = f.hash), '') as ext,
               COALESCE(m.url, '') as url
        FROM file f
        JOIN metadata m ON f.hash = m.hash
        WHERE m.url IS NOT NULL
          AND ({where_or})
        ORDER BY f.file_path
        LIMIT ?
    """
    cursor.execute(
        query,
        (*patterns, row_limit),
    )
    rows = cursor.fetchall()
    mm_debug(
        f"[folder-db] get_files_by_url_like_any done: {len(rows)} row(s)"
    )
    return rows
|
||||
|
||||
def get_file_metadata(self,
|
||||
file_hashes: Set[str],
|
||||
limit: Optional[int] = None) -> List[tuple]:
|
||||
|
||||
@@ -1071,6 +1071,25 @@ class Folder(Store):
|
||||
if namespace == "url":
|
||||
pattern_hint = kwargs.get("pattern_hint")
|
||||
|
||||
def _pattern_candidates(raw: Any) -> List[str]:
|
||||
if raw is None:
|
||||
return []
|
||||
if isinstance(raw, (list, tuple, set)):
|
||||
out: List[str] = []
|
||||
for item in raw:
|
||||
text = str(item or "").strip()
|
||||
if text and text not in out:
|
||||
out.append(text)
|
||||
return out
|
||||
if isinstance(raw, str):
|
||||
text = raw.strip()
|
||||
return [text] if text else []
|
||||
return []
|
||||
|
||||
pattern_candidates = _pattern_candidates(pattern_hint)
|
||||
if len(pattern_candidates) > 200:
|
||||
pattern_candidates = pattern_candidates[:200]
|
||||
|
||||
def _parse_url_value(raw: Any) -> list[str]:
|
||||
if raw is None:
|
||||
return []
|
||||
@@ -1094,16 +1113,26 @@ class Folder(Store):
|
||||
return []
|
||||
|
||||
def _matches_pattern(url_list: list[str]) -> bool:
|
||||
if not pattern_hint:
|
||||
if not pattern_candidates:
|
||||
return True
|
||||
for candidate_url in url_list:
|
||||
if _match_url_pattern(candidate_url, pattern_hint):
|
||||
return True
|
||||
for pat in pattern_candidates:
|
||||
if _match_url_pattern(candidate_url, pat):
|
||||
return True
|
||||
return False
|
||||
|
||||
if not pattern or pattern == "*":
|
||||
debug(f"[folder:{backend_label}] url search: any-url (limit={limit})")
|
||||
rows = api.get_files_with_any_url(limit)
|
||||
if pattern_candidates:
|
||||
debug(
|
||||
f"[folder:{backend_label}] url search: any-url (limit={limit}) pattern_hint={len(pattern_candidates)}"
|
||||
)
|
||||
rows = api.get_files_by_url_like_any(
|
||||
[_url_like_pattern(p) for p in pattern_candidates],
|
||||
limit,
|
||||
)
|
||||
else:
|
||||
debug(f"[folder:{backend_label}] url search: any-url (limit={limit})")
|
||||
rows = api.get_files_with_any_url(limit)
|
||||
else:
|
||||
debug(
|
||||
f"[folder:{backend_label}] url search: like={pattern} (limit={limit})"
|
||||
|
||||
@@ -5,7 +5,7 @@ import sys
|
||||
import tempfile
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from typing import Any, Dict, List, Optional, Sequence, Tuple
|
||||
|
||||
from urllib.parse import quote
|
||||
|
||||
@@ -516,7 +516,8 @@ class HydrusNetwork(Store):
|
||||
url_value: str | None,
|
||||
want_any: bool,
|
||||
fetch_limit: int,
|
||||
scan_limit: int | None = None
|
||||
scan_limit: int | None = None,
|
||||
needles: Optional[Sequence[str]] = None,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Best-effort URL search by scanning Hydrus metadata with include_file_url=True."""
|
||||
|
||||
@@ -572,17 +573,29 @@ class HydrusNetwork(Store):
|
||||
if not candidate_file_ids and not candidate_hashes:
|
||||
return []
|
||||
|
||||
needle = (url_value or "").strip().lower()
|
||||
needle_list: list[str] = []
|
||||
if isinstance(needles, (list, tuple, set)):
|
||||
for item in needles:
|
||||
text = str(item or "").strip().lower()
|
||||
if text and text not in needle_list:
|
||||
needle_list.append(text)
|
||||
if not needle_list:
|
||||
needle = (url_value or "").strip().lower()
|
||||
if needle:
|
||||
needle_list = [needle]
|
||||
chunk_size = 200
|
||||
out: list[dict[str, Any]] = []
|
||||
if scan_limit is None:
|
||||
try:
|
||||
if not want_any and url_value:
|
||||
scan_limit = max(200, min(int(fetch_limit), 400))
|
||||
if not want_any and needle_list:
|
||||
if len(needle_list) > 1:
|
||||
scan_limit = max(int(fetch_limit) * 20, 2000)
|
||||
else:
|
||||
scan_limit = max(200, min(int(fetch_limit), 400))
|
||||
else:
|
||||
scan_limit = max(int(fetch_limit) * 5, 1000)
|
||||
except Exception:
|
||||
scan_limit = 400 if (not want_any and url_value) else 1000
|
||||
scan_limit = 400 if (not want_any and needle_list) else 1000
|
||||
if scan_limit is not None:
|
||||
scan_limit = min(int(scan_limit), 10000)
|
||||
scanned = 0
|
||||
@@ -641,9 +654,9 @@ class HydrusNetwork(Store):
|
||||
if want_any:
|
||||
out.append(meta)
|
||||
continue
|
||||
if not needle:
|
||||
if not needle_list:
|
||||
continue
|
||||
if any(needle in u.lower() for u in urls):
|
||||
if any(any(n in u.lower() for n in needle_list) for u in urls):
|
||||
out.append(meta)
|
||||
continue
|
||||
|
||||
@@ -698,18 +711,37 @@ class HydrusNetwork(Store):
|
||||
|
||||
# Special case: url:* and url:<value>
|
||||
metadata_list: list[dict[str, Any]] | None = None
|
||||
pattern_hint = str(kwargs.get("pattern_hint") or "").strip().lower()
|
||||
pattern_hint_raw = kwargs.get("pattern_hint")
|
||||
pattern_hints: list[str] = []
|
||||
if isinstance(pattern_hint_raw, (list, tuple, set)):
|
||||
for item in pattern_hint_raw:
|
||||
text = str(item or "").strip().lower()
|
||||
if text and text not in pattern_hints:
|
||||
pattern_hints.append(text)
|
||||
elif isinstance(pattern_hint_raw, str):
|
||||
text = pattern_hint_raw.strip().lower()
|
||||
if text:
|
||||
pattern_hints.append(text)
|
||||
pattern_hint = pattern_hints[0] if pattern_hints else ""
|
||||
if ":" in query_lower and not query_lower.startswith(":"):
|
||||
namespace, pattern = query_lower.split(":", 1)
|
||||
namespace = namespace.strip().lower()
|
||||
pattern = pattern.strip()
|
||||
if namespace == "url":
|
||||
if not pattern or pattern == "*":
|
||||
metadata_list = _iter_url_filtered_metadata(
|
||||
None,
|
||||
want_any=True,
|
||||
fetch_limit=int(limit) if limit else 100
|
||||
)
|
||||
if pattern_hints:
|
||||
metadata_list = _iter_url_filtered_metadata(
|
||||
None,
|
||||
want_any=False,
|
||||
fetch_limit=int(limit) if limit else 100,
|
||||
needles=pattern_hints,
|
||||
)
|
||||
else:
|
||||
metadata_list = _iter_url_filtered_metadata(
|
||||
None,
|
||||
want_any=True,
|
||||
fetch_limit=int(limit) if limit else 100
|
||||
)
|
||||
else:
|
||||
def _clean_url_search_token(value: str | None) -> str:
|
||||
token = str(value or "").strip().lower()
|
||||
@@ -807,6 +839,7 @@ class HydrusNetwork(Store):
|
||||
want_any=False,
|
||||
fetch_limit=int(limit) if limit else 100,
|
||||
scan_limit=scan_limit_override,
|
||||
needles=pattern_hints if pattern_hints else None,
|
||||
)
|
||||
|
||||
# Parse the query into tags
|
||||
|
||||
@@ -3358,6 +3358,113 @@ def check_url_exists_in_storage(
|
||||
_mark_preflight_checked()
|
||||
return True
|
||||
|
||||
bulk_mode = len(unique_urls) >= 8
|
||||
|
||||
def _build_bulk_patterns(needles_map: Dict[str, List[str]], max_per_url: int = 3, max_total: int = 240) -> List[str]:
|
||||
patterns: List[str] = []
|
||||
for _original, needles in needles_map.items():
|
||||
for needle in (needles or [])[:max_per_url]:
|
||||
needle_text = str(needle or "").strip()
|
||||
if not needle_text:
|
||||
continue
|
||||
if needle_text not in patterns:
|
||||
patterns.append(needle_text)
|
||||
if len(patterns) >= max_total:
|
||||
return patterns
|
||||
return patterns
|
||||
|
||||
bulk_patterns = _build_bulk_patterns(url_needles)
|
||||
|
||||
def _match_normalized_url(pattern_text: str, candidate_url: str) -> bool:
    """Report whether the normalised pattern equals or occurs in the normalised candidate.

    Both arguments are passed through ``_normalize_url_for_search``; if
    either normalises to a falsy value the result is ``False``.
    """
    needle = _normalize_url_for_search(pattern_text)
    haystack = _normalize_url_for_search(candidate_url)
    if needle and haystack:
        # Exact match first, then containment.
        return needle == haystack or needle in haystack
    return False
|
||||
|
||||
def _extract_urls_from_hit(
    hit: Any,
    backend: Any,
    *,
    allow_backend_lookup: bool = True,
) -> List[str]:
    """Collect the URL strings associated with a search hit.

    The hit's own ``known_urls`` / ``urls`` / ``url`` field (first truthy
    one) is used when it yields anything. Otherwise, and only when
    ``allow_backend_lookup`` is true, the hit's hash (``hash`` /
    ``file_hash`` / ``sha256``) is passed to ``backend.get_url``.

    Returns:
        Stripped, non-blank URL strings; possibly empty. Errors while
        reading the hit or querying the backend are swallowed.
    """

    def _absorb(raw: Any, sink: List[str]) -> None:
        # Append one non-blank string, or every non-blank string member of a
        # list/tuple/set, to `sink` (stripped). Other shapes are ignored.
        if isinstance(raw, str) and raw.strip():
            sink.append(raw.strip())
        elif isinstance(raw, (list, tuple, set)):
            for entry in raw:
                if isinstance(entry, str) and entry.strip():
                    sink.append(entry.strip())

    collected: List[str] = []
    try:
        _absorb(
            get_field(hit, "known_urls") or get_field(hit, "urls") or get_field(hit, "url"),
            collected,
        )
    except Exception:
        # A failure while reading the hit discards anything gathered so far.
        collected = []

    if collected or not allow_backend_lookup:
        return collected

    try:
        digest = get_field(hit, "hash") or get_field(hit, "file_hash") or get_field(hit, "sha256") or ""
    except Exception:
        digest = ""

    if not digest:
        return collected

    try:
        _absorb(backend.get_url(str(digest)), collected)
    except Exception:
        # Best effort: keep whatever was appended before the failure.
        pass

    return collected
|
||||
|
||||
def _build_display_row_for_hit(
    hit: Any,
    backend_name: str,
    original_url: str,
) -> Dict[str, Any]:
    """Build the display row describing a stored file that matched a URL.

    Args:
        hit: Search hit as returned by a backend; read via ``get_field``.
        backend_name: Used as the store name when the hit has no ``store``.
        original_url: The URL being checked; echoed in the row.

    Returns:
        A dict with ``title``, ``store``, ``hash``, ``ext``, ``size``,
        ``url`` and a ``columns`` list of (label, value) pairs holding the
        same values.
    """
    try:
        from SYS.result_table import build_display_row

        extracted = build_display_row(hit, keys=["title", "store", "hash", "ext", "size"])
    except Exception:
        extracted = {}
    # Normalise once: a non-dict result would otherwise raise on `.get`
    # below and throw away the hit's own title/hash fallbacks.
    if not isinstance(extracted, dict):
        extracted = {}

    try:
        title = (
            extracted.get("title")
            or get_field(hit, "title")
            or get_field(hit, "name")
            or get_field(hit, "target")
            or get_field(hit, "path")
            or "(exists)"
        )
    except Exception:
        title = "(exists)"

    try:
        file_hash = (
            extracted.get("hash")
            or get_field(hit, "hash")
            or get_field(hit, "file_hash")
            or get_field(hit, "sha256")
            or ""
        )
    except Exception:
        file_hash = ""

    # Compute each display value once so the flat keys and the `columns`
    # list cannot drift apart.
    title_text = str(title)
    store_text = str(get_field(hit, "store") or backend_name)
    hash_text = str(file_hash or "")
    ext_text = str(extracted.get("ext") or "")
    size_val = extracted.get("size")

    return {
        "title": title_text,
        "store": store_text,
        "hash": hash_text,
        "ext": ext_text,
        "size": size_val,
        "url": original_url,
        "columns": [
            ("Title", title_text),
            ("Store", store_text),
            ("Hash", hash_text),
            ("Ext", ext_text),
            ("Size", size_val),
            ("URL", original_url),
        ],
    }
|
||||
|
||||
def _search_backend_url_hits(
|
||||
backend: Any,
|
||||
backend_name: str,
|
||||
@@ -3379,15 +3486,6 @@ def check_url_exists_in_storage(
|
||||
continue
|
||||
|
||||
if not backend_hits:
|
||||
def _match_normalized_url(pattern_text: str, candidate_url: str) -> bool:
|
||||
pattern_norm = _normalize_url_for_search(pattern_text)
|
||||
candidate_norm = _normalize_url_for_search(candidate_url)
|
||||
if not pattern_norm or not candidate_norm:
|
||||
return False
|
||||
if pattern_norm == candidate_norm:
|
||||
return True
|
||||
return pattern_norm in candidate_norm
|
||||
|
||||
fallback_hits: List[Dict[str, Any]] = []
|
||||
try:
|
||||
fallback_hits = backend.search("url:*", limit=200) or []
|
||||
@@ -3395,31 +3493,7 @@ def check_url_exists_in_storage(
|
||||
fallback_hits = []
|
||||
|
||||
for hit in fallback_hits:
|
||||
url_values: List[str] = []
|
||||
try:
|
||||
raw_urls = get_field(hit, "known_urls") or get_field(hit, "urls") or get_field(hit, "url")
|
||||
if isinstance(raw_urls, str) and raw_urls.strip():
|
||||
url_values.append(raw_urls.strip())
|
||||
elif isinstance(raw_urls, (list, tuple, set)):
|
||||
for item in raw_urls:
|
||||
if isinstance(item, str) and item.strip():
|
||||
url_values.append(item.strip())
|
||||
except Exception:
|
||||
url_values = []
|
||||
|
||||
if not url_values:
|
||||
try:
|
||||
file_hash = hit.get("hash") if isinstance(hit, dict) else None
|
||||
if file_hash:
|
||||
fetched = backend.get_url(str(file_hash))
|
||||
if isinstance(fetched, str) and fetched.strip():
|
||||
url_values.append(fetched.strip())
|
||||
elif isinstance(fetched, (list, tuple, set)):
|
||||
for item in fetched:
|
||||
if isinstance(item, str) and item.strip():
|
||||
url_values.append(item.strip())
|
||||
except Exception:
|
||||
pass
|
||||
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=True)
|
||||
|
||||
if not url_values:
|
||||
continue
|
||||
@@ -3436,68 +3510,12 @@ def check_url_exists_in_storage(
|
||||
if not matched:
|
||||
continue
|
||||
|
||||
title = "(exists)"
|
||||
try:
|
||||
title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)"
|
||||
except Exception:
|
||||
title = "(exists)"
|
||||
|
||||
file_hash = ""
|
||||
try:
|
||||
file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or ""
|
||||
except Exception:
|
||||
file_hash = ""
|
||||
|
||||
return {
|
||||
"title": str(title),
|
||||
"store": str(hit.get("store") or backend_name),
|
||||
"hash": str(file_hash or ""),
|
||||
"ext": "",
|
||||
"size": None,
|
||||
"url": original_url,
|
||||
"columns": [
|
||||
("Title", str(title)),
|
||||
("Store", str(hit.get("store") or backend_name)),
|
||||
("Hash", str(file_hash or "")),
|
||||
("URL", original_url),
|
||||
],
|
||||
}
|
||||
return _build_display_row_for_hit(hit, backend_name, original_url)
|
||||
|
||||
return None
|
||||
|
||||
hit = backend_hits[0]
|
||||
title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)"
|
||||
file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or ""
|
||||
|
||||
try:
|
||||
from SYS.result_table import build_display_row
|
||||
extracted = build_display_row(hit, keys=["title", "store", "hash", "ext", "size"])
|
||||
except Exception:
|
||||
extracted = {}
|
||||
|
||||
extracted["title"] = str(title)
|
||||
extracted["store"] = str(hit.get("store") or backend_name)
|
||||
extracted["hash"] = str(file_hash or "")
|
||||
|
||||
ext = extracted.get("ext")
|
||||
size_val = extracted.get("size")
|
||||
|
||||
return {
|
||||
"title": str(title),
|
||||
"store": str(hit.get("store") or backend_name),
|
||||
"hash": str(file_hash or ""),
|
||||
"ext": str(ext or ""),
|
||||
"size": size_val,
|
||||
"url": original_url,
|
||||
"columns": [
|
||||
("Title", str(title)),
|
||||
("Store", str(hit.get("store") or backend_name)),
|
||||
("Hash", str(file_hash or "")),
|
||||
("Ext", str(ext or "")),
|
||||
("Size", size_val),
|
||||
("URL", original_url),
|
||||
],
|
||||
}
|
||||
return _build_display_row_for_hit(hit, backend_name, original_url)
|
||||
|
||||
backend_names: List[str] = []
|
||||
try:
|
||||
@@ -3558,6 +3576,54 @@ def check_url_exists_in_storage(
|
||||
if not hydrus_available:
|
||||
debug("Bulk URL preflight: hydrus availability check failed; attempting best-effort lookup")
|
||||
|
||||
if bulk_mode and bulk_patterns:
|
||||
bulk_hits: Optional[List[Any]] = None
|
||||
bulk_limit = min(2000, max(200, len(unique_urls) * 8))
|
||||
try:
|
||||
bulk_hits = backend.search(
|
||||
"url:*",
|
||||
limit=bulk_limit,
|
||||
pattern_hint=bulk_patterns,
|
||||
) or []
|
||||
except Exception:
|
||||
try:
|
||||
bulk_hits = backend.search("url:*", limit=bulk_limit) or []
|
||||
except Exception:
|
||||
bulk_hits = None
|
||||
|
||||
if bulk_hits is not None:
|
||||
for hit in bulk_hits:
|
||||
if len(match_rows) >= max_rows:
|
||||
break
|
||||
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False)
|
||||
if not url_values:
|
||||
continue
|
||||
|
||||
for original_url, needles in url_needles.items():
|
||||
if len(match_rows) >= max_rows:
|
||||
break
|
||||
if (original_url, str(backend_name)) in seen_pairs:
|
||||
continue
|
||||
|
||||
matched = False
|
||||
for url_value in url_values:
|
||||
for needle in (needles or []):
|
||||
if _match_normalized_url(str(needle or ""), str(url_value or "")):
|
||||
matched = True
|
||||
break
|
||||
if matched:
|
||||
break
|
||||
|
||||
if not matched:
|
||||
continue
|
||||
|
||||
seen_pairs.add((original_url, str(backend_name)))
|
||||
matched_urls.add(original_url)
|
||||
match_rows.append(
|
||||
_build_display_row_for_hit(hit, str(backend_name), original_url)
|
||||
)
|
||||
continue
|
||||
|
||||
for original_url, needles in url_needles.items():
|
||||
if len(match_rows) >= max_rows:
|
||||
break
|
||||
@@ -3622,6 +3688,54 @@ def check_url_exists_in_storage(
|
||||
match_rows.append(display_row)
|
||||
continue
|
||||
|
||||
if bulk_mode and bulk_patterns:
|
||||
bulk_hits: Optional[List[Any]] = None
|
||||
bulk_limit = min(2000, max(200, len(unique_urls) * 8))
|
||||
try:
|
||||
bulk_hits = backend.search(
|
||||
"url:*",
|
||||
limit=bulk_limit,
|
||||
pattern_hint=bulk_patterns,
|
||||
) or []
|
||||
except Exception:
|
||||
try:
|
||||
bulk_hits = backend.search("url:*", limit=bulk_limit) or []
|
||||
except Exception:
|
||||
bulk_hits = None
|
||||
|
||||
if bulk_hits is not None:
|
||||
for hit in bulk_hits:
|
||||
if len(match_rows) >= max_rows:
|
||||
break
|
||||
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False)
|
||||
if not url_values:
|
||||
continue
|
||||
|
||||
for original_url, needles in url_needles.items():
|
||||
if len(match_rows) >= max_rows:
|
||||
break
|
||||
if (original_url, str(backend_name)) in seen_pairs:
|
||||
continue
|
||||
|
||||
matched = False
|
||||
for url_value in url_values:
|
||||
for needle in (needles or []):
|
||||
if _match_normalized_url(str(needle or ""), str(url_value or "")):
|
||||
matched = True
|
||||
break
|
||||
if matched:
|
||||
break
|
||||
|
||||
if not matched:
|
||||
continue
|
||||
|
||||
seen_pairs.add((original_url, str(backend_name)))
|
||||
matched_urls.add(original_url)
|
||||
match_rows.append(
|
||||
_build_display_row_for_hit(hit, str(backend_name), original_url)
|
||||
)
|
||||
continue
|
||||
|
||||
for original_url, needles in url_needles.items():
|
||||
if len(match_rows) >= max_rows:
|
||||
break
|
||||
|
||||
Reference in New Issue
Block a user