This commit is contained in:
2026-01-24 09:11:05 -08:00
parent 3a4d3f029d
commit 4eb055bb48
2 changed files with 206 additions and 113 deletions

View File

@@ -5,7 +5,7 @@ import sys
import tempfile import tempfile
import shutil import shutil
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple from typing import Any, Dict, List, Literal, Optional, Sequence, Tuple
from urllib.parse import quote from urllib.parse import quote
@@ -455,6 +455,7 @@ class HydrusNetwork(Store):
""" """
limit = kwargs.get("limit", 100) limit = kwargs.get("limit", 100)
minimal = bool(kwargs.get("minimal", False)) minimal = bool(kwargs.get("minimal", False))
url_only = bool(kwargs.get("url_only", False))
try: try:
client = self._client client = self._client
@@ -676,6 +677,86 @@ class HydrusNetwork(Store):
return out return out
def _search_url_query_metadata(
url_query: str,
fetch_limit: int,
*,
minimal: bool = False,
) -> list[dict[str, Any]]:
"""Run a strict ``url:<pattern>`` search without falling back to system predicates.

Closure: relies on ``client`` and ``_extract_search_ids`` from the enclosing
method scope (not visible in this chunk).

Args:
    url_query: The raw search tag, e.g. ``"url:*example.com*"``. Empty → ``[]``.
    fetch_limit: Hard cap on how many metadata dicts are returned.
    minimal: When True, skip per-file size info to keep the payload small.

Returns:
    Up to ``fetch_limit`` metadata dicts; ``[]`` on any search failure
    (errors are deliberately swallowed — this is a best-effort lookup).
"""
if not url_query:
return []
try:
# Ask Hydrus for both ids and hashes so we can fetch metadata by
# whichever identifier set the server actually returned.
payload = client.search_files(
tags=[url_query],
return_hashes=True,
return_file_ids=True,
)
except Exception:
# Best-effort: a failed search is treated as "no matches".
return []
candidate_ids, candidate_hashes = _extract_search_ids(payload)
if not candidate_ids and not candidate_hashes:
return []
metas_out: list[dict[str, Any]] = []
# Fetch metadata in bounded chunks to avoid oversized API requests.
chunk_size = 200
def _fetch_chunk(kind: Literal["file_ids", "hashes"], values: list[Any]) -> None:
# Fetch metadata for `values` (by id or by hash) and append valid
# dicts to metas_out, stopping once fetch_limit is reached.
# NOTE: `nonlocal` is not strictly required for .append(), but it
# documents that metas_out is shared mutable state.
nonlocal metas_out
if not values or len(metas_out) >= fetch_limit:
return
for start in range(0, len(values), chunk_size):
if len(metas_out) >= fetch_limit:
break
remaining = fetch_limit - len(metas_out)
if remaining <= 0:
break
# Shrink the final chunk so we never over-fetch past the limit.
end = start + min(chunk_size, remaining)
chunk = values[start:end]
if not chunk:
continue
try:
if kind == "file_ids":
metadata = client.fetch_file_metadata(
file_ids=chunk,
include_file_url=True,
include_service_keys_to_tags=False,
include_duration=False,
include_size=not minimal,
include_mime=False,
)
else:
metadata = client.fetch_file_metadata(
hashes=chunk,
include_file_url=True,
include_service_keys_to_tags=False,
include_duration=False,
include_size=not minimal,
include_mime=False,
)
except Exception:
# Skip a failed chunk; later chunks may still succeed.
continue
# Defensive parsing: only accept the documented dict-with-list shape.
fetched = metadata.get("metadata", []) if isinstance(metadata, dict) else []
if not isinstance(fetched, list):
continue
for meta in fetched:
if len(metas_out) >= fetch_limit:
break
if not isinstance(meta, dict):
continue
metas_out.append(meta)
# Prefer file ids; fall back to hashes only if the limit is not yet met.
if candidate_ids:
_fetch_chunk("file_ids", candidate_ids)
if len(metas_out) < fetch_limit and candidate_hashes:
_fetch_chunk("hashes", candidate_hashes)
return metas_out[:fetch_limit]
query_lower = query.lower().strip() query_lower = query.lower().strip()
# Support `ext:<value>` anywhere in the query. We filter results by the # Support `ext:<value>` anywhere in the query. We filter results by the
@@ -735,122 +816,133 @@ class HydrusNetwork(Store):
namespace = namespace.strip().lower() namespace = namespace.strip().lower()
pattern = pattern.strip() pattern = pattern.strip()
if namespace == "url": if namespace == "url":
if not pattern or pattern == "*": try:
if pattern_hints: fetch_limit_raw = int(limit) if limit else 100
metadata_list = _iter_url_filtered_metadata( except Exception:
None, fetch_limit_raw = 100
want_any=False, if url_only:
fetch_limit=int(limit) if limit else 100, metadata_list = _search_url_query_metadata(
needles=pattern_hints, query_lower,
minimal=minimal, fetch_limit_raw,
) minimal=minimal,
else: )
metadata_list = _iter_url_filtered_metadata(
None,
want_any=True,
fetch_limit=int(limit) if limit else 100,
minimal=minimal,
)
else: else:
def _clean_url_search_token(value: str | None) -> str: if not pattern or pattern == "*":
token = str(value or "").strip().lower() if pattern_hints:
if not token: metadata_list = _iter_url_filtered_metadata(
return "" None,
return token.replace("*", "").replace("?", "") want_any=False,
fetch_limit=fetch_limit_raw,
# Fast-path: exact URL via /add_urls/get_url_files when a full URL is provided. needles=pattern_hints,
try: minimal=minimal,
if pattern.startswith("http://") or pattern.startswith(
"https://"):
from API.HydrusNetwork import HydrusRequestSpec
spec = HydrusRequestSpec(
method="GET",
endpoint="/add_urls/get_url_files",
query={
"url": pattern
},
) )
response = client._perform_request( else:
spec metadata_list = _iter_url_filtered_metadata(
) # type: ignore[attr-defined] None,
hashes = [] want_any=True,
file_ids = [] fetch_limit=fetch_limit_raw,
if isinstance(response, dict): minimal=minimal,
raw_hashes = response.get("hashes") or response.get( )
"file_hashes" else:
) def _clean_url_search_token(value: str | None) -> str:
if isinstance(raw_hashes, list): token = str(value or "").strip().lower()
hashes = [ if not token:
str(h).strip() for h in raw_hashes return ""
if isinstance(h, str) and str(h).strip() return token.replace("*", "").replace("?", "")
]
raw_ids = response.get("file_ids")
if isinstance(raw_ids, list):
for item in raw_ids:
try:
file_ids.append(int(item))
except (TypeError, ValueError):
continue
if file_ids: # Fast-path: exact URL via /add_urls/get_url_files when a full URL is provided.
payload = client.fetch_file_metadata( try:
file_ids=file_ids, if pattern.startswith("http://") or pattern.startswith(
include_file_url=True, "https://"):
include_service_keys_to_tags=not minimal, from API.HydrusNetwork import HydrusRequestSpec
include_duration=not minimal,
include_size=not minimal,
include_mime=not minimal,
)
metas = (
payload.get("metadata",
[]) if isinstance(payload,
dict) else []
)
if isinstance(metas, list):
metadata_list = [
m for m in metas if isinstance(m, dict)
]
elif hashes:
payload = client.fetch_file_metadata(
hashes=hashes,
include_file_url=True,
include_service_keys_to_tags=not minimal,
include_duration=not minimal,
include_size=not minimal,
include_mime=not minimal,
)
metas = (
payload.get("metadata",
[]) if isinstance(payload,
dict) else []
)
if isinstance(metas, list):
metadata_list = [
m for m in metas if isinstance(m, dict)
]
except Exception:
metadata_list = None
# Fallback: substring scan spec = HydrusRequestSpec(
if metadata_list is None: method="GET",
search_token = _clean_url_search_token(pattern_hint or pattern) endpoint="/add_urls/get_url_files",
scan_limit_override: int | None = None query={
if search_token: "url": pattern
is_domain_only = ("://" not in search_token and "/" not in search_token) },
if is_domain_only: )
try: response = client._perform_request(
scan_limit_override = max(int(limit or 100) * 20, 2000) spec
except Exception: ) # type: ignore[attr-defined]
scan_limit_override = 2000 hashes = []
metadata_list = _iter_url_filtered_metadata( file_ids = []
search_token, if isinstance(response, dict):
want_any=False, raw_hashes = response.get("hashes") or response.get(
fetch_limit=int(limit) if limit else 100, "file_hashes"
scan_limit=scan_limit_override, )
needles=pattern_hints if pattern_hints else None, if isinstance(raw_hashes, list):
minimal=minimal, hashes = [
) str(h).strip() for h in raw_hashes
if isinstance(h, str) and str(h).strip()
]
raw_ids = response.get("file_ids")
if isinstance(raw_ids, list):
for item in raw_ids:
try:
file_ids.append(int(item))
except (TypeError, ValueError):
continue
if file_ids:
payload = client.fetch_file_metadata(
file_ids=file_ids,
include_file_url=True,
include_service_keys_to_tags=not minimal,
include_duration=not minimal,
include_size=not minimal,
include_mime=not minimal,
)
metas = (
payload.get("metadata",
[]) if isinstance(payload,
dict) else []
)
if isinstance(metas, list):
metadata_list = [
m for m in metas if isinstance(m, dict)
]
elif hashes:
payload = client.fetch_file_metadata(
hashes=hashes,
include_file_url=True,
include_service_keys_to_tags=not minimal,
include_duration=not minimal,
include_size=not minimal,
include_mime=not minimal,
)
metas = (
payload.get("metadata",
[]) if isinstance(payload,
dict) else []
)
if isinstance(metas, list):
metadata_list = [
m for m in metas if isinstance(m, dict)
]
except Exception:
metadata_list = None
# Fallback: substring scan
if metadata_list is None:
search_token = _clean_url_search_token(pattern_hint or pattern)
scan_limit_override: int | None = None
if search_token:
is_domain_only = ("://" not in search_token and "/" not in search_token)
if is_domain_only:
try:
scan_limit_override = max(fetch_limit_raw * 20, 2000)
except Exception:
scan_limit_override = 2000
metadata_list = _iter_url_filtered_metadata(
search_token,
want_any=False,
fetch_limit=fetch_limit_raw,
scan_limit=scan_limit_override,
needles=pattern_hints if pattern_hints else None,
minimal=minimal,
)
elif namespace == "system": elif namespace == "system":
normalized_system_predicate = pattern.strip() normalized_system_predicate = pattern.strip()
if normalized_system_predicate == "has url": if normalized_system_predicate == "has url":

View File

@@ -366,6 +366,7 @@ class Get_Url(Cmdlet):
store_name, store_name,
pattern_hint=target_pattern, pattern_hint=target_pattern,
minimal=True, minimal=True,
url_only=True,
) )
if search_results is None: if search_results is None:
continue continue