f
This commit is contained in:
@@ -21,7 +21,7 @@ from contextlib import contextmanager
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path, PurePosixPath
|
from pathlib import Path, PurePosixPath
|
||||||
from threading import RLock
|
from threading import RLock
|
||||||
from typing import Optional, Dict, Any, List, Tuple, Set
|
from typing import Optional, Dict, Any, List, Tuple, Set, Sequence
|
||||||
|
|
||||||
from SYS.utils import sha256_file, expand_path
|
from SYS.utils import sha256_file, expand_path
|
||||||
from SYS.logger import debug as _debug
|
from SYS.logger import debug as _debug
|
||||||
@@ -3001,6 +3001,47 @@ class DatabaseAPI:
|
|||||||
)
|
)
|
||||||
return rows
|
return rows
|
||||||
|
|
||||||
|
def get_files_by_url_like_any(
    self,
    like_patterns: Sequence[str],
    limit: Optional[int] = None,
) -> List[tuple]:
    """Get files whose URL metadata matches any of the provided LIKE patterns.

    Args:
        like_patterns: SQL ``LIKE`` patterns. Each one is stringified,
            stripped and lower-cased; blank entries and repeats are dropped
            so every distinct pattern is bound exactly once.
        limit: Maximum number of rows to return. A falsy value (``None`` or
            ``0``) falls back to 10000.

    Returns:
        (hash, file_path, size, ext, url) tuples as produced by
        ``cursor.fetchall()``, ordered by file path. An empty list is
        returned without touching the database when no usable pattern
        remains.
    """
    # Lower-case up front: the query compares against LOWER(m.url), so
    # patterns differing only by case are the same predicate. dict.fromkeys
    # removes the repeats while keeping first-seen order.
    lowered = list(
        dict.fromkeys(
            text
            for text in (str(p or "").strip().lower() for p in (like_patterns or []))
            if text
        )
    )
    if not lowered:
        return []

    # Resolve the fallback once so the log line and the bound value agree.
    effective_limit = limit or 10000

    mm_debug(
        f"[folder-db] get_files_by_url_like_any start: patterns={len(lowered)} limit={effective_limit}"
    )
    cursor = self.get_cursor()
    # Only "?" placeholders are interpolated into the SQL text; the pattern
    # values themselves are always passed as bound parameters.
    where_or = " OR ".join(["LOWER(m.url) LIKE ?"] * len(lowered))
    query = f"""
        SELECT f.hash, f.file_path,
        COALESCE((SELECT size FROM metadata WHERE hash = f.hash), 0) as size,
        COALESCE((SELECT ext FROM metadata WHERE hash = f.hash), '') as ext,
        COALESCE(m.url, '') as url
        FROM file f
        JOIN metadata m ON f.hash = m.hash
        WHERE m.url IS NOT NULL
        AND ({where_or})
        ORDER BY f.file_path
        LIMIT ?
    """
    cursor.execute(
        query,
        (*lowered, effective_limit),
    )
    rows = cursor.fetchall()
    mm_debug(
        f"[folder-db] get_files_by_url_like_any done: {len(rows)} row(s)"
    )
    return rows
|
||||||
|
|
||||||
def get_file_metadata(self,
|
def get_file_metadata(self,
|
||||||
file_hashes: Set[str],
|
file_hashes: Set[str],
|
||||||
limit: Optional[int] = None) -> List[tuple]:
|
limit: Optional[int] = None) -> List[tuple]:
|
||||||
|
|||||||
@@ -1071,6 +1071,25 @@ class Folder(Store):
|
|||||||
if namespace == "url":
|
if namespace == "url":
|
||||||
pattern_hint = kwargs.get("pattern_hint")
|
pattern_hint = kwargs.get("pattern_hint")
|
||||||
|
|
||||||
|
def _pattern_candidates(raw: Any) -> List[str]:
|
||||||
|
if raw is None:
|
||||||
|
return []
|
||||||
|
if isinstance(raw, (list, tuple, set)):
|
||||||
|
out: List[str] = []
|
||||||
|
for item in raw:
|
||||||
|
text = str(item or "").strip()
|
||||||
|
if text and text not in out:
|
||||||
|
out.append(text)
|
||||||
|
return out
|
||||||
|
if isinstance(raw, str):
|
||||||
|
text = raw.strip()
|
||||||
|
return [text] if text else []
|
||||||
|
return []
|
||||||
|
|
||||||
|
pattern_candidates = _pattern_candidates(pattern_hint)
|
||||||
|
if len(pattern_candidates) > 200:
|
||||||
|
pattern_candidates = pattern_candidates[:200]
|
||||||
|
|
||||||
def _parse_url_value(raw: Any) -> list[str]:
|
def _parse_url_value(raw: Any) -> list[str]:
|
||||||
if raw is None:
|
if raw is None:
|
||||||
return []
|
return []
|
||||||
@@ -1094,14 +1113,24 @@ class Folder(Store):
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
def _matches_pattern(url_list: list[str]) -> bool:
|
def _matches_pattern(url_list: list[str]) -> bool:
|
||||||
if not pattern_hint:
|
if not pattern_candidates:
|
||||||
return True
|
return True
|
||||||
for candidate_url in url_list:
|
for candidate_url in url_list:
|
||||||
if _match_url_pattern(candidate_url, pattern_hint):
|
for pat in pattern_candidates:
|
||||||
|
if _match_url_pattern(candidate_url, pat):
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if not pattern or pattern == "*":
|
if not pattern or pattern == "*":
|
||||||
|
if pattern_candidates:
|
||||||
|
debug(
|
||||||
|
f"[folder:{backend_label}] url search: any-url (limit={limit}) pattern_hint={len(pattern_candidates)}"
|
||||||
|
)
|
||||||
|
rows = api.get_files_by_url_like_any(
|
||||||
|
[_url_like_pattern(p) for p in pattern_candidates],
|
||||||
|
limit,
|
||||||
|
)
|
||||||
|
else:
|
||||||
debug(f"[folder:{backend_label}] url search: any-url (limit={limit})")
|
debug(f"[folder:{backend_label}] url search: any-url (limit={limit})")
|
||||||
rows = api.get_files_with_any_url(limit)
|
rows = api.get_files_with_any_url(limit)
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ import sys
|
|||||||
import tempfile
|
import tempfile
|
||||||
import shutil
|
import shutil
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict, List, Optional, Tuple
|
from typing import Any, Dict, List, Optional, Sequence, Tuple
|
||||||
|
|
||||||
from urllib.parse import quote
|
from urllib.parse import quote
|
||||||
|
|
||||||
@@ -516,7 +516,8 @@ class HydrusNetwork(Store):
|
|||||||
url_value: str | None,
|
url_value: str | None,
|
||||||
want_any: bool,
|
want_any: bool,
|
||||||
fetch_limit: int,
|
fetch_limit: int,
|
||||||
scan_limit: int | None = None
|
scan_limit: int | None = None,
|
||||||
|
needles: Optional[Sequence[str]] = None,
|
||||||
) -> list[dict[str, Any]]:
|
) -> list[dict[str, Any]]:
|
||||||
"""Best-effort URL search by scanning Hydrus metadata with include_file_url=True."""
|
"""Best-effort URL search by scanning Hydrus metadata with include_file_url=True."""
|
||||||
|
|
||||||
@@ -572,17 +573,29 @@ class HydrusNetwork(Store):
|
|||||||
if not candidate_file_ids and not candidate_hashes:
|
if not candidate_file_ids and not candidate_hashes:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
needle_list: list[str] = []
|
||||||
|
if isinstance(needles, (list, tuple, set)):
|
||||||
|
for item in needles:
|
||||||
|
text = str(item or "").strip().lower()
|
||||||
|
if text and text not in needle_list:
|
||||||
|
needle_list.append(text)
|
||||||
|
if not needle_list:
|
||||||
needle = (url_value or "").strip().lower()
|
needle = (url_value or "").strip().lower()
|
||||||
|
if needle:
|
||||||
|
needle_list = [needle]
|
||||||
chunk_size = 200
|
chunk_size = 200
|
||||||
out: list[dict[str, Any]] = []
|
out: list[dict[str, Any]] = []
|
||||||
if scan_limit is None:
|
if scan_limit is None:
|
||||||
try:
|
try:
|
||||||
if not want_any and url_value:
|
if not want_any and needle_list:
|
||||||
|
if len(needle_list) > 1:
|
||||||
|
scan_limit = max(int(fetch_limit) * 20, 2000)
|
||||||
|
else:
|
||||||
scan_limit = max(200, min(int(fetch_limit), 400))
|
scan_limit = max(200, min(int(fetch_limit), 400))
|
||||||
else:
|
else:
|
||||||
scan_limit = max(int(fetch_limit) * 5, 1000)
|
scan_limit = max(int(fetch_limit) * 5, 1000)
|
||||||
except Exception:
|
except Exception:
|
||||||
scan_limit = 400 if (not want_any and url_value) else 1000
|
scan_limit = 400 if (not want_any and needle_list) else 1000
|
||||||
if scan_limit is not None:
|
if scan_limit is not None:
|
||||||
scan_limit = min(int(scan_limit), 10000)
|
scan_limit = min(int(scan_limit), 10000)
|
||||||
scanned = 0
|
scanned = 0
|
||||||
@@ -641,9 +654,9 @@ class HydrusNetwork(Store):
|
|||||||
if want_any:
|
if want_any:
|
||||||
out.append(meta)
|
out.append(meta)
|
||||||
continue
|
continue
|
||||||
if not needle:
|
if not needle_list:
|
||||||
continue
|
continue
|
||||||
if any(needle in u.lower() for u in urls):
|
if any(any(n in u.lower() for n in needle_list) for u in urls):
|
||||||
out.append(meta)
|
out.append(meta)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -698,13 +711,32 @@ class HydrusNetwork(Store):
|
|||||||
|
|
||||||
# Special case: url:* and url:<value>
|
# Special case: url:* and url:<value>
|
||||||
metadata_list: list[dict[str, Any]] | None = None
|
metadata_list: list[dict[str, Any]] | None = None
|
||||||
pattern_hint = str(kwargs.get("pattern_hint") or "").strip().lower()
|
pattern_hint_raw = kwargs.get("pattern_hint")
|
||||||
|
pattern_hints: list[str] = []
|
||||||
|
if isinstance(pattern_hint_raw, (list, tuple, set)):
|
||||||
|
for item in pattern_hint_raw:
|
||||||
|
text = str(item or "").strip().lower()
|
||||||
|
if text and text not in pattern_hints:
|
||||||
|
pattern_hints.append(text)
|
||||||
|
elif isinstance(pattern_hint_raw, str):
|
||||||
|
text = pattern_hint_raw.strip().lower()
|
||||||
|
if text:
|
||||||
|
pattern_hints.append(text)
|
||||||
|
pattern_hint = pattern_hints[0] if pattern_hints else ""
|
||||||
if ":" in query_lower and not query_lower.startswith(":"):
|
if ":" in query_lower and not query_lower.startswith(":"):
|
||||||
namespace, pattern = query_lower.split(":", 1)
|
namespace, pattern = query_lower.split(":", 1)
|
||||||
namespace = namespace.strip().lower()
|
namespace = namespace.strip().lower()
|
||||||
pattern = pattern.strip()
|
pattern = pattern.strip()
|
||||||
if namespace == "url":
|
if namespace == "url":
|
||||||
if not pattern or pattern == "*":
|
if not pattern or pattern == "*":
|
||||||
|
if pattern_hints:
|
||||||
|
metadata_list = _iter_url_filtered_metadata(
|
||||||
|
None,
|
||||||
|
want_any=False,
|
||||||
|
fetch_limit=int(limit) if limit else 100,
|
||||||
|
needles=pattern_hints,
|
||||||
|
)
|
||||||
|
else:
|
||||||
metadata_list = _iter_url_filtered_metadata(
|
metadata_list = _iter_url_filtered_metadata(
|
||||||
None,
|
None,
|
||||||
want_any=True,
|
want_any=True,
|
||||||
@@ -807,6 +839,7 @@ class HydrusNetwork(Store):
|
|||||||
want_any=False,
|
want_any=False,
|
||||||
fetch_limit=int(limit) if limit else 100,
|
fetch_limit=int(limit) if limit else 100,
|
||||||
scan_limit=scan_limit_override,
|
scan_limit=scan_limit_override,
|
||||||
|
needles=pattern_hints if pattern_hints else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Parse the query into tags
|
# Parse the query into tags
|
||||||
|
|||||||
@@ -3358,6 +3358,113 @@ def check_url_exists_in_storage(
|
|||||||
_mark_preflight_checked()
|
_mark_preflight_checked()
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
bulk_mode = len(unique_urls) >= 8
|
||||||
|
|
||||||
|
def _build_bulk_patterns(needles_map: Dict[str, List[str]], max_per_url: int = 3, max_total: int = 240) -> List[str]:
|
||||||
|
patterns: List[str] = []
|
||||||
|
for _original, needles in needles_map.items():
|
||||||
|
for needle in (needles or [])[:max_per_url]:
|
||||||
|
needle_text = str(needle or "").strip()
|
||||||
|
if not needle_text:
|
||||||
|
continue
|
||||||
|
if needle_text not in patterns:
|
||||||
|
patterns.append(needle_text)
|
||||||
|
if len(patterns) >= max_total:
|
||||||
|
return patterns
|
||||||
|
return patterns
|
||||||
|
|
||||||
|
bulk_patterns = _build_bulk_patterns(url_needles)
|
||||||
|
|
||||||
|
def _match_normalized_url(pattern_text: str, candidate_url: str) -> bool:
    """Report whether the normalised pattern equals or occurs in the normalised candidate.

    Both arguments are passed through ``_normalize_url_for_search`` first.
    If either normalised value is empty the result is ``False``.
    """
    wanted = _normalize_url_for_search(pattern_text)
    found = _normalize_url_for_search(candidate_url)
    if wanted and found:
        # Exact equality first, then substring containment.
        return True if wanted == found else wanted in found
    return False
|
||||||
|
|
||||||
|
def _collect_url_strings(raw: Any) -> List[str]:
    """Return the stripped, non-blank strings contained in ``raw``.

    ``raw`` may be a single string or a list/tuple/set; non-string members
    are ignored and any other type yields an empty list.
    """
    if isinstance(raw, str):
        text = raw.strip()
        return [text] if text else []
    if isinstance(raw, (list, tuple, set)):
        return [item.strip() for item in raw if isinstance(item, str) and item.strip()]
    return []


def _extract_urls_from_hit(
    hit: Any,
    backend: Any,
    *,
    allow_backend_lookup: bool = True,
) -> List[str]:
    """Collect the URLs associated with a search hit.

    The hit's ``known_urls``, ``urls`` and ``url`` fields are tried in that
    order and the first truthy one is used. If that yields nothing and
    ``allow_backend_lookup`` is true, the hit's hash (``hash``,
    ``file_hash`` or ``sha256``) is passed to ``backend.get_url``.

    Args:
        hit: Search result object, read through ``get_field``.
        backend: Object exposing ``get_url(file_hash)``; only used for the
            fallback lookup.
        allow_backend_lookup: When false, never call the backend.

    Returns:
        Stripped, non-blank URL strings; empty when nothing could be found.
        Failures while reading the hit or querying the backend are treated
        as "no URLs" rather than raised.
    """
    try:
        url_values = _collect_url_strings(
            get_field(hit, "known_urls") or get_field(hit, "urls") or get_field(hit, "url")
        )
    except Exception:
        url_values = []

    if url_values or not allow_backend_lookup:
        return url_values

    try:
        file_hash = get_field(hit, "hash") or get_field(hit, "file_hash") or get_field(hit, "sha256") or ""
    except Exception:
        file_hash = ""
    if not file_hash:
        return url_values

    try:
        url_values = _collect_url_strings(backend.get_url(str(file_hash)))
    except Exception:
        # Best-effort lookup: a failing backend means no URLs for this hit.
        url_values = []

    return url_values
|
||||||
|
|
||||||
|
def _build_display_row_for_hit(
    hit: Any,
    backend_name: str,
    original_url: str,
) -> Dict[str, Any]:
    """Build the display-row dict for a hit that matched ``original_url``.

    Values come from ``SYS.result_table.build_display_row`` when it is
    importable and succeeds, with fallbacks read from the hit through
    ``get_field``.

    Args:
        hit: Search result object.
        backend_name: Store name used when the hit carries no ``store``.
        original_url: The URL that was being checked; echoed into the row.

    Returns:
        A dict with ``title``, ``store``, ``hash``, ``ext``, ``size`` and
        ``url`` keys plus a ``columns`` list of (label, value) pairs in
        display order.
    """
    try:
        from SYS.result_table import build_display_row
        extracted = build_display_row(hit, keys=["title", "store", "hash", "ext", "size"])
    except Exception:
        extracted = {}
    # A non-dict result would make every .get() below raise, which would
    # discard the hit's own title/hash in favour of the defaults.
    if not isinstance(extracted, dict):
        extracted = {}

    try:
        title = extracted.get("title") or get_field(hit, "title") or get_field(hit, "name") or get_field(hit, "target") or get_field(hit, "path") or "(exists)"
    except Exception:
        title = "(exists)"

    try:
        file_hash = extracted.get("hash") or get_field(hit, "hash") or get_field(hit, "file_hash") or get_field(hit, "sha256") or ""
    except Exception:
        file_hash = ""

    # Guarded like the lookups above so an unreadable hit degrades to the
    # backend name instead of aborting the row.
    try:
        store = get_field(hit, "store") or backend_name
    except Exception:
        store = backend_name

    title_text = str(title)
    store_text = str(store)
    hash_text = str(file_hash or "")
    ext_text = str(extracted.get("ext") or "")
    size_val = extracted.get("size")

    return {
        "title": title_text,
        "store": store_text,
        "hash": hash_text,
        "ext": ext_text,
        "size": size_val,
        "url": original_url,
        "columns": [
            ("Title", title_text),
            ("Store", store_text),
            ("Hash", hash_text),
            ("Ext", ext_text),
            ("Size", size_val),
            ("URL", original_url),
        ],
    }
|
||||||
|
|
||||||
def _search_backend_url_hits(
|
def _search_backend_url_hits(
|
||||||
backend: Any,
|
backend: Any,
|
||||||
backend_name: str,
|
backend_name: str,
|
||||||
@@ -3379,15 +3486,6 @@ def check_url_exists_in_storage(
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if not backend_hits:
|
if not backend_hits:
|
||||||
def _match_normalized_url(pattern_text: str, candidate_url: str) -> bool:
|
|
||||||
pattern_norm = _normalize_url_for_search(pattern_text)
|
|
||||||
candidate_norm = _normalize_url_for_search(candidate_url)
|
|
||||||
if not pattern_norm or not candidate_norm:
|
|
||||||
return False
|
|
||||||
if pattern_norm == candidate_norm:
|
|
||||||
return True
|
|
||||||
return pattern_norm in candidate_norm
|
|
||||||
|
|
||||||
fallback_hits: List[Dict[str, Any]] = []
|
fallback_hits: List[Dict[str, Any]] = []
|
||||||
try:
|
try:
|
||||||
fallback_hits = backend.search("url:*", limit=200) or []
|
fallback_hits = backend.search("url:*", limit=200) or []
|
||||||
@@ -3395,31 +3493,7 @@ def check_url_exists_in_storage(
|
|||||||
fallback_hits = []
|
fallback_hits = []
|
||||||
|
|
||||||
for hit in fallback_hits:
|
for hit in fallback_hits:
|
||||||
url_values: List[str] = []
|
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=True)
|
||||||
try:
|
|
||||||
raw_urls = get_field(hit, "known_urls") or get_field(hit, "urls") or get_field(hit, "url")
|
|
||||||
if isinstance(raw_urls, str) and raw_urls.strip():
|
|
||||||
url_values.append(raw_urls.strip())
|
|
||||||
elif isinstance(raw_urls, (list, tuple, set)):
|
|
||||||
for item in raw_urls:
|
|
||||||
if isinstance(item, str) and item.strip():
|
|
||||||
url_values.append(item.strip())
|
|
||||||
except Exception:
|
|
||||||
url_values = []
|
|
||||||
|
|
||||||
if not url_values:
|
|
||||||
try:
|
|
||||||
file_hash = hit.get("hash") if isinstance(hit, dict) else None
|
|
||||||
if file_hash:
|
|
||||||
fetched = backend.get_url(str(file_hash))
|
|
||||||
if isinstance(fetched, str) and fetched.strip():
|
|
||||||
url_values.append(fetched.strip())
|
|
||||||
elif isinstance(fetched, (list, tuple, set)):
|
|
||||||
for item in fetched:
|
|
||||||
if isinstance(item, str) and item.strip():
|
|
||||||
url_values.append(item.strip())
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
if not url_values:
|
if not url_values:
|
||||||
continue
|
continue
|
||||||
@@ -3436,68 +3510,12 @@ def check_url_exists_in_storage(
|
|||||||
if not matched:
|
if not matched:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
title = "(exists)"
|
return _build_display_row_for_hit(hit, backend_name, original_url)
|
||||||
try:
|
|
||||||
title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)"
|
|
||||||
except Exception:
|
|
||||||
title = "(exists)"
|
|
||||||
|
|
||||||
file_hash = ""
|
|
||||||
try:
|
|
||||||
file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or ""
|
|
||||||
except Exception:
|
|
||||||
file_hash = ""
|
|
||||||
|
|
||||||
return {
|
|
||||||
"title": str(title),
|
|
||||||
"store": str(hit.get("store") or backend_name),
|
|
||||||
"hash": str(file_hash or ""),
|
|
||||||
"ext": "",
|
|
||||||
"size": None,
|
|
||||||
"url": original_url,
|
|
||||||
"columns": [
|
|
||||||
("Title", str(title)),
|
|
||||||
("Store", str(hit.get("store") or backend_name)),
|
|
||||||
("Hash", str(file_hash or "")),
|
|
||||||
("URL", original_url),
|
|
||||||
],
|
|
||||||
}
|
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
hit = backend_hits[0]
|
hit = backend_hits[0]
|
||||||
title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)"
|
return _build_display_row_for_hit(hit, backend_name, original_url)
|
||||||
file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or ""
|
|
||||||
|
|
||||||
try:
|
|
||||||
from SYS.result_table import build_display_row
|
|
||||||
extracted = build_display_row(hit, keys=["title", "store", "hash", "ext", "size"])
|
|
||||||
except Exception:
|
|
||||||
extracted = {}
|
|
||||||
|
|
||||||
extracted["title"] = str(title)
|
|
||||||
extracted["store"] = str(hit.get("store") or backend_name)
|
|
||||||
extracted["hash"] = str(file_hash or "")
|
|
||||||
|
|
||||||
ext = extracted.get("ext")
|
|
||||||
size_val = extracted.get("size")
|
|
||||||
|
|
||||||
return {
|
|
||||||
"title": str(title),
|
|
||||||
"store": str(hit.get("store") or backend_name),
|
|
||||||
"hash": str(file_hash or ""),
|
|
||||||
"ext": str(ext or ""),
|
|
||||||
"size": size_val,
|
|
||||||
"url": original_url,
|
|
||||||
"columns": [
|
|
||||||
("Title", str(title)),
|
|
||||||
("Store", str(hit.get("store") or backend_name)),
|
|
||||||
("Hash", str(file_hash or "")),
|
|
||||||
("Ext", str(ext or "")),
|
|
||||||
("Size", size_val),
|
|
||||||
("URL", original_url),
|
|
||||||
],
|
|
||||||
}
|
|
||||||
|
|
||||||
backend_names: List[str] = []
|
backend_names: List[str] = []
|
||||||
try:
|
try:
|
||||||
@@ -3558,6 +3576,54 @@ def check_url_exists_in_storage(
|
|||||||
if not hydrus_available:
|
if not hydrus_available:
|
||||||
debug("Bulk URL preflight: hydrus availability check failed; attempting best-effort lookup")
|
debug("Bulk URL preflight: hydrus availability check failed; attempting best-effort lookup")
|
||||||
|
|
||||||
|
if bulk_mode and bulk_patterns:
|
||||||
|
bulk_hits: Optional[List[Any]] = None
|
||||||
|
bulk_limit = min(2000, max(200, len(unique_urls) * 8))
|
||||||
|
try:
|
||||||
|
bulk_hits = backend.search(
|
||||||
|
"url:*",
|
||||||
|
limit=bulk_limit,
|
||||||
|
pattern_hint=bulk_patterns,
|
||||||
|
) or []
|
||||||
|
except Exception:
|
||||||
|
try:
|
||||||
|
bulk_hits = backend.search("url:*", limit=bulk_limit) or []
|
||||||
|
except Exception:
|
||||||
|
bulk_hits = None
|
||||||
|
|
||||||
|
if bulk_hits is not None:
|
||||||
|
for hit in bulk_hits:
|
||||||
|
if len(match_rows) >= max_rows:
|
||||||
|
break
|
||||||
|
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False)
|
||||||
|
if not url_values:
|
||||||
|
continue
|
||||||
|
|
||||||
|
for original_url, needles in url_needles.items():
|
||||||
|
if len(match_rows) >= max_rows:
|
||||||
|
break
|
||||||
|
if (original_url, str(backend_name)) in seen_pairs:
|
||||||
|
continue
|
||||||
|
|
||||||
|
matched = False
|
||||||
|
for url_value in url_values:
|
||||||
|
for needle in (needles or []):
|
||||||
|
if _match_normalized_url(str(needle or ""), str(url_value or "")):
|
||||||
|
matched = True
|
||||||
|
break
|
||||||
|
if matched:
|
||||||
|
break
|
||||||
|
|
||||||
|
if not matched:
|
||||||
|
continue
|
||||||
|
|
||||||
|
seen_pairs.add((original_url, str(backend_name)))
|
||||||
|
matched_urls.add(original_url)
|
||||||
|
match_rows.append(
|
||||||
|
_build_display_row_for_hit(hit, str(backend_name), original_url)
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
for original_url, needles in url_needles.items():
|
for original_url, needles in url_needles.items():
|
||||||
if len(match_rows) >= max_rows:
|
if len(match_rows) >= max_rows:
|
||||||
break
|
break
|
||||||
@@ -3622,6 +3688,54 @@ def check_url_exists_in_storage(
|
|||||||
match_rows.append(display_row)
|
match_rows.append(display_row)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
if bulk_mode and bulk_patterns:
|
||||||
|
bulk_hits: Optional[List[Any]] = None
|
||||||
|
bulk_limit = min(2000, max(200, len(unique_urls) * 8))
|
||||||
|
try:
|
||||||
|
bulk_hits = backend.search(
|
||||||
|
"url:*",
|
||||||
|
limit=bulk_limit,
|
||||||
|
pattern_hint=bulk_patterns,
|
||||||
|
) or []
|
||||||
|
except Exception:
|
||||||
|
try:
|
||||||
|
bulk_hits = backend.search("url:*", limit=bulk_limit) or []
|
||||||
|
except Exception:
|
||||||
|
bulk_hits = None
|
||||||
|
|
||||||
|
if bulk_hits is not None:
|
||||||
|
for hit in bulk_hits:
|
||||||
|
if len(match_rows) >= max_rows:
|
||||||
|
break
|
||||||
|
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False)
|
||||||
|
if not url_values:
|
||||||
|
continue
|
||||||
|
|
||||||
|
for original_url, needles in url_needles.items():
|
||||||
|
if len(match_rows) >= max_rows:
|
||||||
|
break
|
||||||
|
if (original_url, str(backend_name)) in seen_pairs:
|
||||||
|
continue
|
||||||
|
|
||||||
|
matched = False
|
||||||
|
for url_value in url_values:
|
||||||
|
for needle in (needles or []):
|
||||||
|
if _match_normalized_url(str(needle or ""), str(url_value or "")):
|
||||||
|
matched = True
|
||||||
|
break
|
||||||
|
if matched:
|
||||||
|
break
|
||||||
|
|
||||||
|
if not matched:
|
||||||
|
continue
|
||||||
|
|
||||||
|
seen_pairs.add((original_url, str(backend_name)))
|
||||||
|
matched_urls.add(original_url)
|
||||||
|
match_rows.append(
|
||||||
|
_build_display_row_for_hit(hit, str(backend_name), original_url)
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
for original_url, needles in url_needles.items():
|
for original_url, needles in url_needles.items():
|
||||||
if len(match_rows) >= max_rows:
|
if len(match_rows) >= max_rows:
|
||||||
break
|
break
|
||||||
|
|||||||
Reference in New Issue
Block a user