f
This commit is contained in:
@@ -1,5 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from queue import SimpleQueue
|
||||
from threading import Thread
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Sequence, Optional, Set, Tuple
|
||||
import sys
|
||||
@@ -34,6 +36,8 @@ class UrlItem:
|
||||
class Get_Url(Cmdlet):
|
||||
"""Get url associated with files via hash+store, or search urls by pattern."""
|
||||
|
||||
STORE_SEARCH_TIMEOUT_SECONDS = 6.0
|
||||
|
||||
def __init__(self) -> None:
|
||||
super().__init__(
|
||||
name="get-url",
|
||||
@@ -81,8 +85,56 @@ class Get_Url(Cmdlet):
|
||||
normalized_url = Get_Url._normalize_url_for_search(url)
|
||||
normalized_pattern = Get_Url._normalize_url_for_search(pattern)
|
||||
|
||||
# Use fnmatch for wildcard matching (* and ?)
|
||||
return fnmatch(normalized_url, normalized_pattern)
|
||||
has_wildcards = any(ch in normalized_pattern for ch in ("*", "?"))
|
||||
if has_wildcards:
|
||||
return fnmatch(normalized_url, normalized_pattern)
|
||||
|
||||
normalized_url_no_slash = normalized_url.rstrip("/")
|
||||
normalized_pattern_no_slash = normalized_pattern.rstrip("/")
|
||||
if normalized_pattern_no_slash and normalized_pattern_no_slash == normalized_url_no_slash:
|
||||
return True
|
||||
|
||||
return normalized_pattern in normalized_url
|
||||
|
||||
def _execute_search_with_timeout(
    self,
    backend: Any,
    query: str,
    limit: int,
    store_name: str,
    **kwargs: Any,
) -> Optional[List[Any]]:
    """Run ``backend.search`` on a daemon thread and wait a bounded time.

    Args:
        backend: Object exposing ``search(query, limit=..., **kwargs)``.
        query: Search query string handed to the backend unchanged.
        limit: Maximum number of hits requested from the backend.
        store_name: Store label, used only in the debug messages.
        **kwargs: Extra keyword arguments forwarded to ``backend.search``.

    Returns:
        ``None`` when the search is still running after
        ``STORE_SEARCH_TIMEOUT_SECONDS``; an empty list when the search
        raised, produced nothing, or returned a falsy value; otherwise
        whatever ``backend.search`` returned.
    """
    outcome: SimpleQueue[tuple[str, Any]] = SimpleQueue()

    def _run_search() -> None:
        # Hand either the hits or the raised exception back to the caller.
        try:
            message = ("ok", backend.search(query, limit=limit, **kwargs))
        except Exception as exc:
            message = ("err", exc)
        outcome.put(message)

    # Daemon thread: a search that never returns must not block interpreter exit.
    runner = Thread(target=_run_search, daemon=True)
    runner.start()
    runner.join(timeout=self.STORE_SEARCH_TIMEOUT_SECONDS)

    if runner.is_alive():
        # Timed out; the thread is left running and its result is discarded.
        debug(
            f"Store '{store_name}' search timed out after {self.STORE_SEARCH_TIMEOUT_SECONDS}s",
            file=sys.stderr,
        )
        return None

    # The worker ended without reporting anything (e.g. a non-Exception error).
    if outcome.empty():
        return []

    status, payload = outcome.get()
    if status != "err":
        return payload or []

    debug(
        f"Store '{store_name}' search failed: {payload}",
        file=sys.stderr,
    )
    return []
|
||||
|
||||
@staticmethod
|
||||
def _extract_first_url(value: Any) -> Optional[str]:
|
||||
@@ -95,6 +147,35 @@ class Get_Url(Cmdlet):
|
||||
return item.strip()
|
||||
return None
|
||||
|
||||
@staticmethod
def _extract_urls_from_hit(hit: Any) -> List[str]:
    """Collect URL strings carried directly on a search hit.

    The fields ``known_urls``, ``urls``, ``url``, ``source_url`` and
    ``source_urls`` are read in that order via ``get_field``; the first
    truthy value wins. A string value yields a one-element list, a list
    or tuple yields its non-blank string members, and anything else
    (including a lookup that raises) yields an empty list. Every returned
    string is whitespace-stripped.
    """
    candidate_fields = ("known_urls", "urls", "url", "source_url", "source_urls")

    found = None
    try:
        for field_name in candidate_fields:
            found = get_field(hit, field_name)
            if found:
                break
    except Exception:
        # Best-effort: a hit that cannot be inspected simply has no URLs.
        found = None

    if isinstance(found, str):
        stripped = found.strip()
        if stripped:
            return [stripped]
        return []

    if not isinstance(found, (list, tuple)):
        return []

    # Keep only string entries, dropping those that are blank after stripping.
    cleaned = (entry.strip() for entry in found if isinstance(entry, str))
    return [text for text in cleaned if text]
|
||||
|
||||
@staticmethod
|
||||
def _extract_title_from_result(result: Any) -> Optional[str]:
|
||||
# Prefer explicit title field.
|
||||
@@ -219,6 +300,7 @@ class Get_Url(Cmdlet):
|
||||
"""
|
||||
items: List[UrlItem] = []
|
||||
found_stores: Set[str] = set()
|
||||
MAX_RESULTS = 256
|
||||
|
||||
try:
|
||||
storage = Store(config)
|
||||
@@ -230,6 +312,8 @@ class Get_Url(Cmdlet):
|
||||
return items, list(found_stores)
|
||||
|
||||
for store_name in store_names:
|
||||
if len(items) >= MAX_RESULTS:
|
||||
break
|
||||
try:
|
||||
backend = storage[store_name]
|
||||
|
||||
@@ -243,9 +327,12 @@ class Get_Url(Cmdlet):
|
||||
has_wildcards = any(ch in raw_pattern for ch in ("*", "?"))
|
||||
|
||||
# If this is a Hydrus backend and the pattern is a single URL,
|
||||
# normalize it through the official API.
|
||||
# normalize it through the official API. Skip for bare domains.
|
||||
normalized_url = None
|
||||
if not has_wildcards and hasattr(backend, "get_url_info"):
|
||||
looks_like_url = (
|
||||
"://" in raw_pattern or raw_pattern.startswith("magnet:")
|
||||
)
|
||||
if not has_wildcards and looks_like_url and hasattr(backend, "get_url_info"):
|
||||
try:
|
||||
info = backend.get_url_info(raw_pattern) # type: ignore[attr-defined]
|
||||
if isinstance(info, dict):
|
||||
@@ -255,13 +342,39 @@ class Get_Url(Cmdlet):
|
||||
except Exception:
|
||||
normalized_url = None
|
||||
|
||||
search_query = "url:*" if has_wildcards else f"url:{normalized_url or raw_pattern}"
|
||||
try:
|
||||
search_results = backend.search(search_query, limit=1000)
|
||||
except Exception:
|
||||
search_results = []
|
||||
target_pattern = normalized_url or raw_pattern
|
||||
if has_wildcards or not target_pattern:
|
||||
search_query = "url:*"
|
||||
else:
|
||||
wrapped_pattern = f"*{target_pattern}*"
|
||||
search_query = f"url:{wrapped_pattern}"
|
||||
search_limit = max(1, min(MAX_RESULTS, 1000))
|
||||
search_results = self._execute_search_with_timeout(
|
||||
backend,
|
||||
search_query,
|
||||
search_limit,
|
||||
store_name,
|
||||
pattern_hint=target_pattern,
|
||||
)
|
||||
if search_results is None:
|
||||
continue
|
||||
|
||||
search_results = search_results or []
|
||||
if not search_results and target_pattern and not has_wildcards:
|
||||
fallback_results = self._execute_search_with_timeout(
|
||||
backend,
|
||||
"url:*",
|
||||
search_limit,
|
||||
store_name,
|
||||
pattern_hint=target_pattern,
|
||||
)
|
||||
if fallback_results is None:
|
||||
continue
|
||||
search_results = fallback_results or []
|
||||
|
||||
for hit in (search_results or []):
|
||||
if len(items) >= MAX_RESULTS:
|
||||
break
|
||||
file_hash = None
|
||||
if isinstance(hit, dict):
|
||||
file_hash = hit.get("hash") or hit.get("file_hash")
|
||||
@@ -271,25 +384,57 @@ class Get_Url(Cmdlet):
|
||||
file_hash = str(file_hash)
|
||||
|
||||
title = title_cache.get(file_hash, "")
|
||||
if not title:
|
||||
try:
|
||||
title = (
|
||||
get_field(hit, "title")
|
||||
or get_field(hit, "name")
|
||||
or get_field(hit, "file_title")
|
||||
or ""
|
||||
)
|
||||
except Exception:
|
||||
title = ""
|
||||
if not title:
|
||||
title = self._resolve_title_for_hash(backend, file_hash, hit)
|
||||
title_cache[file_hash] = title
|
||||
title_cache[file_hash] = title
|
||||
|
||||
size, ext = meta_cache.get(file_hash, (None, ""))
|
||||
if size is None and not ext:
|
||||
size, ext = self._resolve_size_ext_for_hash(backend, file_hash, hit)
|
||||
meta_cache[file_hash] = (size, ext)
|
||||
try:
|
||||
size = get_field(hit, "size")
|
||||
if size is None:
|
||||
size = get_field(hit, "size_bytes")
|
||||
if size is None:
|
||||
size = get_field(hit, "file_size")
|
||||
if size is None:
|
||||
size = get_field(hit, "filesize")
|
||||
size = int(size) if isinstance(size, (int, float)) else None
|
||||
except Exception:
|
||||
size = None
|
||||
|
||||
try:
|
||||
urls = backend.get_url(file_hash)
|
||||
except Exception:
|
||||
urls = []
|
||||
try:
|
||||
ext = get_field(hit, "ext") or get_field(hit, "extension")
|
||||
ext = str(ext).strip().lstrip(".") if isinstance(ext, str) else ""
|
||||
except Exception:
|
||||
ext = ""
|
||||
|
||||
if size is None and not ext:
|
||||
size, ext = self._resolve_size_ext_for_hash(backend, file_hash, hit)
|
||||
meta_cache[file_hash] = (size, ext)
|
||||
|
||||
urls = self._extract_urls_from_hit(hit)
|
||||
if not urls:
|
||||
try:
|
||||
urls = backend.get_url(file_hash)
|
||||
except Exception:
|
||||
urls = []
|
||||
|
||||
for url in (urls or []):
|
||||
if len(items) >= MAX_RESULTS:
|
||||
break
|
||||
if not self._match_url_pattern(str(url), raw_pattern):
|
||||
continue
|
||||
|
||||
# Double-check it looks like a URL to avoid data leakage from dirty DBs
|
||||
|
||||
from SYS.metadata import normalize_urls
|
||||
valid = normalize_urls([str(url)])
|
||||
if not valid:
|
||||
@@ -306,6 +451,8 @@ class Get_Url(Cmdlet):
|
||||
)
|
||||
)
|
||||
found_stores.add(str(store_name))
|
||||
if len(items) >= MAX_RESULTS:
|
||||
break
|
||||
except Exception as exc:
|
||||
debug(
|
||||
f"Error searching store '{store_name}': {exc}",
|
||||
|
||||
Reference in New Issue
Block a user