This commit is contained in:
2026-01-24 09:11:05 -08:00
parent 3a4d3f029d
commit 4eb055bb48
2 changed files with 206 additions and 113 deletions

View File

@@ -5,7 +5,7 @@ import sys
import tempfile
import shutil
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple
from typing import Any, Dict, List, Literal, Optional, Sequence, Tuple
from urllib.parse import quote
@@ -455,6 +455,7 @@ class HydrusNetwork(Store):
"""
limit = kwargs.get("limit", 100)
minimal = bool(kwargs.get("minimal", False))
url_only = bool(kwargs.get("url_only", False))
try:
client = self._client
@@ -676,6 +677,86 @@ class HydrusNetwork(Store):
return out
def _search_url_query_metadata(
    url_query: str,
    fetch_limit: int,
    *,
    minimal: bool = False,
) -> list[dict[str, Any]]:
    """Run a strict url:<pattern> search without falling back to system predicates.

    Args:
        url_query: The raw ``url:<pattern>`` tag to search with; an empty
            string short-circuits to an empty result.
        fetch_limit: Maximum number of metadata dicts to return.
        minimal: When True, omit size information from fetched metadata.

    Returns:
        Up to ``fetch_limit`` metadata dicts; empty on any search failure
        (this helper is deliberately best-effort and never raises).
    """
    if not url_query:
        return []
    try:
        payload = client.search_files(
            tags=[url_query],
            return_hashes=True,
            return_file_ids=True,
        )
    except Exception:
        # Best-effort: a failed search yields no candidates rather than raising.
        return []
    candidate_ids, candidate_hashes = _extract_search_ids(payload)
    if not candidate_ids and not candidate_hashes:
        return []
    metas_out: list[dict[str, Any]] = []
    chunk_size = 200

    def _fetch_chunk(kind: Literal["file_ids", "hashes"], values: list[Any]) -> None:
        # Fetch metadata in fixed-size windows, stopping once fetch_limit is
        # satisfied. We always slice full chunk_size windows: truncating a
        # window to the remaining quota while still striding by chunk_size
        # would silently skip candidates whenever a fetch returned fewer
        # usable dicts than values requested.
        if not values:
            return
        for start in range(0, len(values), chunk_size):
            if len(metas_out) >= fetch_limit:
                break
            chunk = values[start:start + chunk_size]
            if not chunk:
                continue
            try:
                if kind == "file_ids":
                    metadata = client.fetch_file_metadata(
                        file_ids=chunk,
                        include_file_url=True,
                        include_service_keys_to_tags=False,
                        include_duration=False,
                        include_size=not minimal,
                        include_mime=False,
                    )
                else:
                    metadata = client.fetch_file_metadata(
                        hashes=chunk,
                        include_file_url=True,
                        include_service_keys_to_tags=False,
                        include_duration=False,
                        include_size=not minimal,
                        include_mime=False,
                    )
            except Exception:
                # Skip chunks that fail to fetch; continue with the rest.
                continue
            fetched = metadata.get("metadata", []) if isinstance(metadata, dict) else []
            if not isinstance(fetched, list):
                continue
            for meta in fetched:
                if len(metas_out) >= fetch_limit:
                    break
                if isinstance(meta, dict):
                    metas_out.append(meta)

    if candidate_ids:
        _fetch_chunk("file_ids", candidate_ids)
    # Fall back to hash-keyed fetches only if the id pass did not fill the quota.
    if len(metas_out) < fetch_limit and candidate_hashes:
        _fetch_chunk("hashes", candidate_hashes)
    return metas_out[:fetch_limit]
query_lower = query.lower().strip()
# Support `ext:<value>` anywhere in the query. We filter results by the
@@ -735,122 +816,133 @@ class HydrusNetwork(Store):
namespace = namespace.strip().lower()
pattern = pattern.strip()
if namespace == "url":
if not pattern or pattern == "*":
if pattern_hints:
metadata_list = _iter_url_filtered_metadata(
None,
want_any=False,
fetch_limit=int(limit) if limit else 100,
needles=pattern_hints,
minimal=minimal,
)
else:
metadata_list = _iter_url_filtered_metadata(
None,
want_any=True,
fetch_limit=int(limit) if limit else 100,
minimal=minimal,
)
try:
fetch_limit_raw = int(limit) if limit else 100
except Exception:
fetch_limit_raw = 100
if url_only:
metadata_list = _search_url_query_metadata(
query_lower,
fetch_limit_raw,
minimal=minimal,
)
else:
def _clean_url_search_token(value: str | None) -> str:
token = str(value or "").strip().lower()
if not token:
return ""
return token.replace("*", "").replace("?", "")
# Fast-path: exact URL via /add_urls/get_url_files when a full URL is provided.
try:
if pattern.startswith("http://") or pattern.startswith(
"https://"):
from API.HydrusNetwork import HydrusRequestSpec
spec = HydrusRequestSpec(
method="GET",
endpoint="/add_urls/get_url_files",
query={
"url": pattern
},
if not pattern or pattern == "*":
if pattern_hints:
metadata_list = _iter_url_filtered_metadata(
None,
want_any=False,
fetch_limit=fetch_limit_raw,
needles=pattern_hints,
minimal=minimal,
)
response = client._perform_request(
spec
) # type: ignore[attr-defined]
hashes = []
file_ids = []
if isinstance(response, dict):
raw_hashes = response.get("hashes") or response.get(
"file_hashes"
)
if isinstance(raw_hashes, list):
hashes = [
str(h).strip() for h in raw_hashes
if isinstance(h, str) and str(h).strip()
]
raw_ids = response.get("file_ids")
if isinstance(raw_ids, list):
for item in raw_ids:
try:
file_ids.append(int(item))
except (TypeError, ValueError):
continue
else:
metadata_list = _iter_url_filtered_metadata(
None,
want_any=True,
fetch_limit=fetch_limit_raw,
minimal=minimal,
)
else:
def _clean_url_search_token(value: str | None) -> str:
token = str(value or "").strip().lower()
if not token:
return ""
return token.replace("*", "").replace("?", "")
if file_ids:
payload = client.fetch_file_metadata(
file_ids=file_ids,
include_file_url=True,
include_service_keys_to_tags=not minimal,
include_duration=not minimal,
include_size=not minimal,
include_mime=not minimal,
)
metas = (
payload.get("metadata",
[]) if isinstance(payload,
dict) else []
)
if isinstance(metas, list):
metadata_list = [
m for m in metas if isinstance(m, dict)
]
elif hashes:
payload = client.fetch_file_metadata(
hashes=hashes,
include_file_url=True,
include_service_keys_to_tags=not minimal,
include_duration=not minimal,
include_size=not minimal,
include_mime=not minimal,
)
metas = (
payload.get("metadata",
[]) if isinstance(payload,
dict) else []
)
if isinstance(metas, list):
metadata_list = [
m for m in metas if isinstance(m, dict)
]
except Exception:
metadata_list = None
# Fast-path: exact URL via /add_urls/get_url_files when a full URL is provided.
try:
if pattern.startswith("http://") or pattern.startswith(
"https://"):
from API.HydrusNetwork import HydrusRequestSpec
# Fallback: substring scan
if metadata_list is None:
search_token = _clean_url_search_token(pattern_hint or pattern)
scan_limit_override: int | None = None
if search_token:
is_domain_only = ("://" not in search_token and "/" not in search_token)
if is_domain_only:
try:
scan_limit_override = max(int(limit or 100) * 20, 2000)
except Exception:
scan_limit_override = 2000
metadata_list = _iter_url_filtered_metadata(
search_token,
want_any=False,
fetch_limit=int(limit) if limit else 100,
scan_limit=scan_limit_override,
needles=pattern_hints if pattern_hints else None,
minimal=minimal,
)
spec = HydrusRequestSpec(
method="GET",
endpoint="/add_urls/get_url_files",
query={
"url": pattern
},
)
response = client._perform_request(
spec
) # type: ignore[attr-defined]
hashes = []
file_ids = []
if isinstance(response, dict):
raw_hashes = response.get("hashes") or response.get(
"file_hashes"
)
if isinstance(raw_hashes, list):
hashes = [
str(h).strip() for h in raw_hashes
if isinstance(h, str) and str(h).strip()
]
raw_ids = response.get("file_ids")
if isinstance(raw_ids, list):
for item in raw_ids:
try:
file_ids.append(int(item))
except (TypeError, ValueError):
continue
if file_ids:
payload = client.fetch_file_metadata(
file_ids=file_ids,
include_file_url=True,
include_service_keys_to_tags=not minimal,
include_duration=not minimal,
include_size=not minimal,
include_mime=not minimal,
)
metas = (
payload.get("metadata",
[]) if isinstance(payload,
dict) else []
)
if isinstance(metas, list):
metadata_list = [
m for m in metas if isinstance(m, dict)
]
elif hashes:
payload = client.fetch_file_metadata(
hashes=hashes,
include_file_url=True,
include_service_keys_to_tags=not minimal,
include_duration=not minimal,
include_size=not minimal,
include_mime=not minimal,
)
metas = (
payload.get("metadata",
[]) if isinstance(payload,
dict) else []
)
if isinstance(metas, list):
metadata_list = [
m for m in metas if isinstance(m, dict)
]
except Exception:
metadata_list = None
# Fallback: substring scan
if metadata_list is None:
search_token = _clean_url_search_token(pattern_hint or pattern)
scan_limit_override: int | None = None
if search_token:
is_domain_only = ("://" not in search_token and "/" not in search_token)
if is_domain_only:
try:
scan_limit_override = max(fetch_limit_raw * 20, 2000)
except Exception:
scan_limit_override = 2000
metadata_list = _iter_url_filtered_metadata(
search_token,
want_any=False,
fetch_limit=fetch_limit_raw,
scan_limit=scan_limit_override,
needles=pattern_hints if pattern_hints else None,
minimal=minimal,
)
elif namespace == "system":
normalized_system_predicate = pattern.strip()
if normalized_system_predicate == "has url":