This commit is contained in:
2026-01-16 01:47:00 -08:00
parent 41e95d0360
commit 12436e5a6a
4 changed files with 492 additions and 130 deletions

View File

@@ -1,5 +1,7 @@
from __future__ import annotations
from queue import SimpleQueue
from threading import Thread
from dataclasses import dataclass
from typing import Any, Dict, List, Sequence, Optional, Set, Tuple
import sys
@@ -34,6 +36,8 @@ class UrlItem:
class Get_Url(Cmdlet):
"""Get url associated with files via hash+store, or search urls by pattern."""
STORE_SEARCH_TIMEOUT_SECONDS = 6.0
def __init__(self) -> None:
super().__init__(
name="get-url",
@@ -81,8 +85,56 @@ class Get_Url(Cmdlet):
normalized_url = Get_Url._normalize_url_for_search(url)
normalized_pattern = Get_Url._normalize_url_for_search(pattern)
# Use fnmatch for wildcard matching (* and ?)
return fnmatch(normalized_url, normalized_pattern)
has_wildcards = any(ch in normalized_pattern for ch in ("*", "?"))
if has_wildcards:
return fnmatch(normalized_url, normalized_pattern)
normalized_url_no_slash = normalized_url.rstrip("/")
normalized_pattern_no_slash = normalized_pattern.rstrip("/")
if normalized_pattern_no_slash and normalized_pattern_no_slash == normalized_url_no_slash:
return True
return normalized_pattern in normalized_url
def _execute_search_with_timeout(
    self,
    backend: Any,
    query: str,
    limit: int,
    store_name: str,
    **kwargs: Any,
) -> Optional[List[Any]]:
    """Run ``backend.search(query, limit=..., **kwargs)`` on a worker thread,
    bounded by ``STORE_SEARCH_TIMEOUT_SECONDS`` of wall-clock time.

    Returns:
        None  -> the search timed out (caller is expected to skip this store);
        []    -> the search raised or produced nothing;
        list  -> the raw search results from the backend.

    NOTE(review): on timeout the daemon worker thread is abandoned, not
    cancelled — ``backend.search`` keeps running in the background until it
    finishes on its own. Presumably acceptable for these backends; confirm.
    """
    # Single-use channel: the worker pushes exactly one ("ok"|"err", payload) pair.
    queue: SimpleQueue[tuple[str, Any]] = SimpleQueue()

    def _worker() -> None:
        try:
            queue.put(("ok", backend.search(query, limit=limit, **kwargs)))
        except Exception as exc:
            # Ship the exception object back to the caller's thread instead of
            # letting it die unobserved inside the worker.
            queue.put(("err", exc))

    worker = Thread(target=_worker, daemon=True)
    worker.start()
    worker.join(timeout=self.STORE_SEARCH_TIMEOUT_SECONDS)
    if worker.is_alive():
        # join() returned because of the timeout, not completion: signal
        # "timed out" distinctly from "failed" by returning None.
        debug(
            f"Store '{store_name}' search timed out after {self.STORE_SEARCH_TIMEOUT_SECONDS}s",
            file=sys.stderr,
        )
        return None
    # Defensive: worker exited without putting a result. Should be unreachable
    # (_worker always puts before returning), but avoids blocking on get().
    if queue.empty():
        return []
    status, payload = queue.get()
    if status == "err":
        debug(
            f"Store '{store_name}' search failed: {payload}",
            file=sys.stderr,
        )
        return []
    # Normalize a falsy payload (e.g. None from the backend) to an empty list.
    return payload or []
@staticmethod
def _extract_first_url(value: Any) -> Optional[str]:
@@ -95,6 +147,35 @@ class Get_Url(Cmdlet):
return item.strip()
return None
@staticmethod
def _extract_urls_from_hit(hit: Any) -> List[str]:
    """Extract candidate URLs directly from a search hit, if present.

    Probes the hit's URL-bearing fields in priority order and normalizes
    whatever it finds into a list of non-empty, whitespace-trimmed strings.
    Returns an empty list when nothing usable is found.
    """
    raw = None
    try:
        # First truthy field wins; later fields are only consulted as fallbacks.
        for field_name in ("known_urls", "urls", "url", "source_url", "source_urls"):
            raw = get_field(hit, field_name)
            if raw:
                break
    except Exception:
        raw = None
    if isinstance(raw, str):
        stripped = raw.strip()
        return [stripped] if stripped else []
    if isinstance(raw, (list, tuple)):
        # Keep only non-empty string entries, trimmed of surrounding whitespace.
        return [entry.strip() for entry in raw if isinstance(entry, str) and entry.strip()]
    return []
@staticmethod
def _extract_title_from_result(result: Any) -> Optional[str]:
# Prefer explicit title field.
@@ -219,6 +300,7 @@ class Get_Url(Cmdlet):
"""
items: List[UrlItem] = []
found_stores: Set[str] = set()
MAX_RESULTS = 256
try:
storage = Store(config)
@@ -230,6 +312,8 @@ class Get_Url(Cmdlet):
return items, list(found_stores)
for store_name in store_names:
if len(items) >= MAX_RESULTS:
break
try:
backend = storage[store_name]
@@ -243,9 +327,12 @@ class Get_Url(Cmdlet):
has_wildcards = any(ch in raw_pattern for ch in ("*", "?"))
# If this is a Hydrus backend and the pattern is a single URL,
# normalize it through the official API.
# normalize it through the official API. Skip for bare domains.
normalized_url = None
if not has_wildcards and hasattr(backend, "get_url_info"):
looks_like_url = (
"://" in raw_pattern or raw_pattern.startswith("magnet:")
)
if not has_wildcards and looks_like_url and hasattr(backend, "get_url_info"):
try:
info = backend.get_url_info(raw_pattern) # type: ignore[attr-defined]
if isinstance(info, dict):
@@ -255,13 +342,39 @@ class Get_Url(Cmdlet):
except Exception:
normalized_url = None
search_query = "url:*" if has_wildcards else f"url:{normalized_url or raw_pattern}"
try:
search_results = backend.search(search_query, limit=1000)
except Exception:
search_results = []
target_pattern = normalized_url or raw_pattern
if has_wildcards or not target_pattern:
search_query = "url:*"
else:
wrapped_pattern = f"*{target_pattern}*"
search_query = f"url:{wrapped_pattern}"
search_limit = max(1, min(MAX_RESULTS, 1000))
search_results = self._execute_search_with_timeout(
backend,
search_query,
search_limit,
store_name,
pattern_hint=target_pattern,
)
if search_results is None:
continue
search_results = search_results or []
if not search_results and target_pattern and not has_wildcards:
fallback_results = self._execute_search_with_timeout(
backend,
"url:*",
search_limit,
store_name,
pattern_hint=target_pattern,
)
if fallback_results is None:
continue
search_results = fallback_results or []
for hit in (search_results or []):
if len(items) >= MAX_RESULTS:
break
file_hash = None
if isinstance(hit, dict):
file_hash = hit.get("hash") or hit.get("file_hash")
@@ -271,25 +384,57 @@ class Get_Url(Cmdlet):
file_hash = str(file_hash)
title = title_cache.get(file_hash, "")
if not title:
try:
title = (
get_field(hit, "title")
or get_field(hit, "name")
or get_field(hit, "file_title")
or ""
)
except Exception:
title = ""
if not title:
title = self._resolve_title_for_hash(backend, file_hash, hit)
title_cache[file_hash] = title
title_cache[file_hash] = title
size, ext = meta_cache.get(file_hash, (None, ""))
if size is None and not ext:
size, ext = self._resolve_size_ext_for_hash(backend, file_hash, hit)
meta_cache[file_hash] = (size, ext)
try:
size = get_field(hit, "size")
if size is None:
size = get_field(hit, "size_bytes")
if size is None:
size = get_field(hit, "file_size")
if size is None:
size = get_field(hit, "filesize")
size = int(size) if isinstance(size, (int, float)) else None
except Exception:
size = None
try:
urls = backend.get_url(file_hash)
except Exception:
urls = []
try:
ext = get_field(hit, "ext") or get_field(hit, "extension")
ext = str(ext).strip().lstrip(".") if isinstance(ext, str) else ""
except Exception:
ext = ""
if size is None and not ext:
size, ext = self._resolve_size_ext_for_hash(backend, file_hash, hit)
meta_cache[file_hash] = (size, ext)
urls = self._extract_urls_from_hit(hit)
if not urls:
try:
urls = backend.get_url(file_hash)
except Exception:
urls = []
for url in (urls or []):
if len(items) >= MAX_RESULTS:
break
if not self._match_url_pattern(str(url), raw_pattern):
continue
# Double-check it looks like a URL to avoid data leakage from dirty DBs
from SYS.metadata import normalize_urls
valid = normalize_urls([str(url)])
if not valid:
@@ -306,6 +451,8 @@ class Get_Url(Cmdlet):
)
)
found_stores.add(str(store_name))
if len(items) >= MAX_RESULTS:
break
except Exception as exc:
debug(
f"Error searching store '{store_name}': {exc}",