f
This commit is contained in:
@@ -466,7 +466,9 @@ class HydrusNetwork(Store):
|
||||
def _extract_urls(meta_obj: Any) -> list[str]:
|
||||
if not isinstance(meta_obj, dict):
|
||||
return []
|
||||
raw = meta_obj.get("url")
|
||||
raw = meta_obj.get("known_urls")
|
||||
if raw is None:
|
||||
raw = meta_obj.get("url")
|
||||
if raw is None:
|
||||
raw = meta_obj.get("urls")
|
||||
if isinstance(raw, str):
|
||||
@@ -483,100 +485,178 @@ class HydrusNetwork(Store):
|
||||
return out
|
||||
return []
|
||||
|
||||
def _extract_search_ids(payload: Any) -> tuple[list[int], list[str]]:
    """Pull file IDs and hashes out of a search response payload.

    Args:
        payload: Decoded search response. Anything that is not a ``dict``
            yields two empty lists.

    Returns:
        A ``(file_ids, hashes)`` tuple. ``file_ids`` holds the integer
        entries of ``payload["file_ids"]``; ``hashes`` holds the non-empty
        string entries of ``payload["hashes"]``, stripped and lower-cased.
        Entries that cannot be interpreted are skipped, never raised on.
    """
    if not isinstance(payload, dict):
        return [], []

    ids_out: list[int] = []
    raw_ids = payload.get("file_ids", [])
    if isinstance(raw_ids, list):
        for item in raw_ids:
            # bool is a subclass of int; True/False must not become IDs 1/0.
            if isinstance(item, bool):
                continue
            if isinstance(item, int):
                ids_out.append(item)
            elif isinstance(item, float):
                # Only whole-number floats (e.g. 12.0) are accepted;
                # is_integer() is False for nan/inf, and truncating 1.9 to 1
                # would point at a different file.
                if item.is_integer():
                    ids_out.append(int(item))
            elif isinstance(item, str):
                text = item.strip()
                # isdecimal() matches exactly the digit strings int() parses,
                # so no exception handling is needed here.
                if text.isdecimal():
                    ids_out.append(int(text))

    hashes_out: list[str] = []
    raw_hashes = payload.get("hashes", [])
    if isinstance(raw_hashes, list):
        for item in raw_hashes:
            # Stringifying arbitrary objects would fabricate hashes; keep
            # only genuine strings.
            if not isinstance(item, str):
                continue
            candidate = item.strip().lower()
            if candidate:
                hashes_out.append(candidate)

    return ids_out, hashes_out
|
||||
|
||||
def _iter_url_filtered_metadata(
|
||||
url_value: str | None,
|
||||
want_any: bool,
|
||||
fetch_limit: int
|
||||
) -> list[dict[str,
|
||||
Any]]:
|
||||
fetch_limit: int,
|
||||
scan_limit: int | None = None
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Best-effort URL search by scanning Hydrus metadata with include_file_url=True."""
|
||||
|
||||
# First try a fast system predicate if Hydrus supports it.
|
||||
candidate_file_ids: list[int] = []
|
||||
try:
|
||||
if want_any:
|
||||
candidate_hashes: list[str] = []
|
||||
seen_file_ids: set[int] = set()
|
||||
seen_hashes: set[str] = set()
|
||||
|
||||
def _add_candidates(ids: list[int], hashes: list[str]) -> None:
    """Record IDs and hashes that have not been seen yet.

    Each new value is added to its ``seen_*`` set and appended to the
    matching candidate list, so the lists stay duplicate-free and keep
    first-seen order.
    """
    for new_id in ids:
        if new_id not in seen_file_ids:
            seen_file_ids.add(new_id)
            candidate_file_ids.append(new_id)

    for new_hash in hashes:
        if new_hash not in seen_hashes:
            seen_hashes.add(new_hash)
            candidate_hashes.append(new_hash)
|
||||
|
||||
predicate_supported = getattr(self, "_has_url_predicate", None)
|
||||
if predicate_supported is not False:
|
||||
try:
|
||||
predicate = "system:has url"
|
||||
url_search = client.search_files(
|
||||
tags=[predicate],
|
||||
return_hashes=False,
|
||||
return_file_ids=True,
|
||||
return_hashes=True,
|
||||
return_file_ids=False,
|
||||
return_file_count=False,
|
||||
)
|
||||
ids = url_search.get("file_ids",
|
||||
[]) if isinstance(url_search,
|
||||
dict) else []
|
||||
if isinstance(ids, list):
|
||||
candidate_file_ids = [
|
||||
int(x) for x in ids
|
||||
if isinstance(x, (int, float,
|
||||
str)) and str(x).strip().isdigit()
|
||||
]
|
||||
except Exception:
|
||||
candidate_file_ids = []
|
||||
ids, hashes = _extract_search_ids(url_search)
|
||||
_add_candidates(ids, hashes)
|
||||
self._has_url_predicate = True
|
||||
except Exception as exc:
|
||||
try:
|
||||
from API.HydrusNetwork import HydrusRequestError
|
||||
|
||||
if not candidate_file_ids:
|
||||
# Fallback: scan from system:everything and filter by URL substring.
|
||||
if isinstance(exc, HydrusRequestError) and getattr(exc, "status", None) == 400:
|
||||
self._has_url_predicate = False
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if not candidate_file_ids and not candidate_hashes:
|
||||
everything = client.search_files(
|
||||
tags=["system:everything"],
|
||||
return_hashes=False,
|
||||
return_file_ids=True,
|
||||
return_hashes=True,
|
||||
return_file_ids=False,
|
||||
return_file_count=False,
|
||||
)
|
||||
ids = everything.get("file_ids",
|
||||
[]) if isinstance(everything,
|
||||
dict) else []
|
||||
if isinstance(ids, list):
|
||||
candidate_file_ids = [
|
||||
int(x) for x in ids if isinstance(x, (int, float))
|
||||
]
|
||||
ids, hashes = _extract_search_ids(everything)
|
||||
_add_candidates(ids, hashes)
|
||||
|
||||
if not candidate_file_ids:
|
||||
if not candidate_file_ids and not candidate_hashes:
|
||||
return []
|
||||
|
||||
needle = (url_value or "").strip().lower()
|
||||
chunk_size = 200
|
||||
out: list[dict[str, Any]] = []
|
||||
if scan_limit is None:
|
||||
try:
|
||||
if not want_any and url_value:
|
||||
scan_limit = max(200, min(int(fetch_limit), 400))
|
||||
else:
|
||||
scan_limit = max(int(fetch_limit) * 5, 1000)
|
||||
except Exception:
|
||||
scan_limit = 400 if (not want_any and url_value) else 1000
|
||||
if scan_limit is not None:
|
||||
scan_limit = min(int(scan_limit), 10000)
|
||||
scanned = 0
|
||||
|
||||
for start in range(0, len(candidate_file_ids), chunk_size):
|
||||
def _process_source(items: list[Any], kind: str) -> None:
|
||||
nonlocal scanned
|
||||
for start in range(0, len(items), chunk_size):
|
||||
if len(out) >= fetch_limit:
|
||||
return
|
||||
if scan_limit is not None and scanned >= scan_limit:
|
||||
return
|
||||
chunk = items[start:start + chunk_size]
|
||||
if scan_limit is not None:
|
||||
remaining = scan_limit - scanned
|
||||
if remaining <= 0:
|
||||
return
|
||||
if len(chunk) > remaining:
|
||||
chunk = chunk[:remaining]
|
||||
scanned += len(chunk)
|
||||
try:
|
||||
if kind == "hashes":
|
||||
payload = client.fetch_file_metadata(
|
||||
hashes=chunk,
|
||||
include_file_url=True,
|
||||
include_service_keys_to_tags=True,
|
||||
include_duration=True,
|
||||
include_size=True,
|
||||
include_mime=True,
|
||||
)
|
||||
else:
|
||||
payload = client.fetch_file_metadata(
|
||||
file_ids=chunk,
|
||||
include_file_url=True,
|
||||
include_service_keys_to_tags=True,
|
||||
include_duration=True,
|
||||
include_size=True,
|
||||
include_mime=True,
|
||||
)
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
metas = payload.get("metadata",
|
||||
[]) if isinstance(payload,
|
||||
dict) else []
|
||||
if not isinstance(metas, list):
|
||||
continue
|
||||
|
||||
for meta in metas:
|
||||
if len(out) >= fetch_limit:
|
||||
break
|
||||
if not isinstance(meta, dict):
|
||||
continue
|
||||
urls = _extract_urls(meta)
|
||||
if not urls:
|
||||
continue
|
||||
if want_any:
|
||||
out.append(meta)
|
||||
continue
|
||||
if not needle:
|
||||
continue
|
||||
if any(needle in u.lower() for u in urls):
|
||||
out.append(meta)
|
||||
continue
|
||||
|
||||
sources: list[tuple[str, list[Any]]] = []
|
||||
if candidate_hashes:
|
||||
sources.append(("hashes", candidate_hashes))
|
||||
elif candidate_file_ids:
|
||||
sources.append(("file_ids", candidate_file_ids))
|
||||
|
||||
for kind, items in sources:
|
||||
if len(out) >= fetch_limit:
|
||||
break
|
||||
chunk = candidate_file_ids[start:start + chunk_size]
|
||||
try:
|
||||
payload = client.fetch_file_metadata(
|
||||
file_ids=chunk,
|
||||
include_file_url=True,
|
||||
include_service_keys_to_tags=True,
|
||||
include_duration=True,
|
||||
include_size=True,
|
||||
include_mime=True,
|
||||
)
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
metas = payload.get("metadata",
|
||||
[]) if isinstance(payload,
|
||||
dict) else []
|
||||
if not isinstance(metas, list):
|
||||
continue
|
||||
|
||||
for meta in metas:
|
||||
if not isinstance(meta, dict):
|
||||
continue
|
||||
urls = _extract_urls(meta)
|
||||
if not urls:
|
||||
continue
|
||||
if want_any:
|
||||
out.append(meta)
|
||||
if len(out) >= fetch_limit:
|
||||
break
|
||||
continue
|
||||
|
||||
if not needle:
|
||||
continue
|
||||
if any(needle in u.lower() for u in urls):
|
||||
out.append(meta)
|
||||
if len(out) >= fetch_limit:
|
||||
break
|
||||
_process_source(items, kind)
|
||||
|
||||
return out
|
||||
|
||||
@@ -618,6 +698,7 @@ class HydrusNetwork(Store):
|
||||
|
||||
# Special case: url:* and url:<value>
|
||||
metadata_list: list[dict[str, Any]] | None = None
|
||||
pattern_hint = str(kwargs.get("pattern_hint") or "").strip().lower()
|
||||
if ":" in query_lower and not query_lower.startswith(":"):
|
||||
namespace, pattern = query_lower.split(":", 1)
|
||||
namespace = namespace.strip().lower()
|
||||
@@ -630,6 +711,12 @@ class HydrusNetwork(Store):
|
||||
fetch_limit=int(limit) if limit else 100
|
||||
)
|
||||
else:
|
||||
def _clean_url_search_token(value: str | None) -> str:
|
||||
token = str(value or "").strip().lower()
|
||||
if not token:
|
||||
return ""
|
||||
return token.replace("*", "").replace("?", "")
|
||||
|
||||
# Fast-path: exact URL via /add_urls/get_url_files when a full URL is provided.
|
||||
try:
|
||||
if pattern.startswith("http://") or pattern.startswith(
|
||||
@@ -706,10 +793,20 @@ class HydrusNetwork(Store):
|
||||
|
||||
# Fallback: substring scan
|
||||
if metadata_list is None:
|
||||
search_token = _clean_url_search_token(pattern_hint or pattern)
|
||||
scan_limit_override: int | None = None
|
||||
if search_token:
|
||||
is_domain_only = ("://" not in search_token and "/" not in search_token)
|
||||
if is_domain_only:
|
||||
try:
|
||||
scan_limit_override = max(int(limit or 100) * 20, 2000)
|
||||
except Exception:
|
||||
scan_limit_override = 2000
|
||||
metadata_list = _iter_url_filtered_metadata(
|
||||
pattern,
|
||||
search_token,
|
||||
want_any=False,
|
||||
fetch_limit=int(limit) if limit else 100
|
||||
fetch_limit=int(limit) if limit else 100,
|
||||
scan_limit=scan_limit_override,
|
||||
)
|
||||
|
||||
# Parse the query into tags
|
||||
@@ -742,26 +839,6 @@ class HydrusNetwork(Store):
|
||||
# Search files with the tags (unless url: search already produced metadata)
|
||||
results = []
|
||||
|
||||
def _extract_search_ids(payload: Any) -> tuple[list[int], list[str]]:
|
||||
if not isinstance(payload, dict):
|
||||
return [], []
|
||||
raw_ids = payload.get("file_ids", [])
|
||||
raw_hashes = payload.get("hashes", [])
|
||||
ids_out: list[int] = []
|
||||
hashes_out: list[str] = []
|
||||
if isinstance(raw_ids, list):
|
||||
for item in raw_ids:
|
||||
try:
|
||||
ids_out.append(int(item))
|
||||
except (TypeError, ValueError):
|
||||
continue
|
||||
if isinstance(raw_hashes, list):
|
||||
hashes_out = [
|
||||
str(h).strip() for h in raw_hashes
|
||||
if isinstance(h, str) and str(h).strip()
|
||||
]
|
||||
return ids_out, hashes_out
|
||||
|
||||
if metadata_list is None:
|
||||
file_ids: list[int] = []
|
||||
hashes: list[str] = []
|
||||
|
||||
Reference in New Issue
Block a user