def _search_url_query_metadata(
    url_query: str,
    fetch_limit: int,
    *,
    minimal: bool = False,
) -> list[dict[str, Any]]:
    """Run a strict ``url:`` search without falling back to system predicates.

    The query is sent as a single search tag. The file ids and hashes
    extracted from the response are then resolved to metadata in batches
    until ``fetch_limit`` entries have been collected.

    Args:
        url_query: The full search tag to send (for example ``url:...``).
        fetch_limit: Maximum number of metadata entries to return.
        minimal: When true, file size is left out of the metadata request.

    Returns:
        At most ``fetch_limit`` metadata dicts with duplicates removed.
        An empty list is returned when the query is empty, the limit is not
        positive, the search request raises, or nothing matched.
    """
    # A non-positive limit can never yield results, so skip the round trip.
    if not url_query or fetch_limit <= 0:
        return []

    try:
        payload = client.search_files(
            tags=[url_query],
            return_hashes=True,
            return_file_ids=True,
        )
    except Exception:
        # Deliberate best-effort: a failed search is reported as "no results".
        return []

    candidate_ids, candidate_hashes = _extract_search_ids(payload)
    if not candidate_ids and not candidate_hashes:
        return []

    metas_out: list[dict[str, Any]] = []
    # Identity of every entry already collected. The search asks for ids AND
    # hashes, so both candidate lists presumably describe the same files;
    # without this bookkeeping the hash pass would re-add them.
    seen_ids: set[int] = set()
    seen_hashes: set[str] = set()
    chunk_size = 200

    def _hash_key(value: Any) -> str:
        """Return a normalised hash string, or "" when *value* is not a string."""
        return value.strip().lower() if isinstance(value, str) else ""

    def _accept(meta: dict[str, Any]) -> bool:
        """Record *meta* and return True unless it was already collected."""
        # NOTE(review): assumes metadata entries carry "file_id"/"hash" keys as
        # in the Hydrus Client API; entries without them are always kept.
        file_id = meta.get("file_id")
        id_key = file_id if isinstance(file_id, int) else None
        hash_key = _hash_key(meta.get("hash"))
        if id_key is not None and id_key in seen_ids:
            return False
        if hash_key and hash_key in seen_hashes:
            return False
        if id_key is not None:
            seen_ids.add(id_key)
        if hash_key:
            seen_hashes.add(hash_key)
        return True

    def _fetch_chunk(kind: Literal["file_ids", "hashes"], values: list[Any]) -> None:
        """Resolve *values* to metadata in batches, appending to ``metas_out``."""
        cursor = 0
        total = len(values)
        while cursor < total and len(metas_out) < fetch_limit:
            remaining = fetch_limit - len(metas_out)
            chunk = values[cursor:cursor + min(chunk_size, remaining)]
            # Advance by what was actually requested so that a short or failed
            # batch never causes the following candidates to be skipped.
            cursor += len(chunk)
            try:
                # ``kind`` doubles as the keyword name expected by the client.
                metadata = client.fetch_file_metadata(
                    **{kind: chunk},
                    include_file_url=True,
                    include_service_keys_to_tags=False,
                    include_duration=False,
                    include_size=not minimal,
                    include_mime=False,
                )
            except Exception:
                # Best-effort: give up on this batch and try the next one.
                continue

            fetched = metadata.get("metadata", []) if isinstance(metadata, dict) else []
            if not isinstance(fetched, list):
                continue
            for meta in fetched:
                if len(metas_out) >= fetch_limit:
                    break
                if isinstance(meta, dict) and _accept(meta):
                    metas_out.append(meta)

    if candidate_ids:
        _fetch_chunk("file_ids", candidate_ids)
    if len(metas_out) < fetch_limit and candidate_hashes:
        # Only ask for hashes the id pass did not already resolve.
        pending_hashes = [
            value for value in candidate_hashes
            if _hash_key(value) not in seen_hashes
        ]
        _fetch_chunk("hashes", pending_hashes)

    return metas_out
- try: - if pattern.startswith("http://") or pattern.startswith( - "https://"): - from API.HydrusNetwork import HydrusRequestSpec - - spec = HydrusRequestSpec( - method="GET", - endpoint="/add_urls/get_url_files", - query={ - "url": pattern - }, + if not pattern or pattern == "*": + if pattern_hints: + metadata_list = _iter_url_filtered_metadata( + None, + want_any=False, + fetch_limit=fetch_limit_raw, + needles=pattern_hints, + minimal=minimal, ) - response = client._perform_request( - spec - ) # type: ignore[attr-defined] - hashes = [] - file_ids = [] - if isinstance(response, dict): - raw_hashes = response.get("hashes") or response.get( - "file_hashes" - ) - if isinstance(raw_hashes, list): - hashes = [ - str(h).strip() for h in raw_hashes - if isinstance(h, str) and str(h).strip() - ] - raw_ids = response.get("file_ids") - if isinstance(raw_ids, list): - for item in raw_ids: - try: - file_ids.append(int(item)) - except (TypeError, ValueError): - continue + else: + metadata_list = _iter_url_filtered_metadata( + None, + want_any=True, + fetch_limit=fetch_limit_raw, + minimal=minimal, + ) + else: + def _clean_url_search_token(value: str | None) -> str: + token = str(value or "").strip().lower() + if not token: + return "" + return token.replace("*", "").replace("?", "") - if file_ids: - payload = client.fetch_file_metadata( - file_ids=file_ids, - include_file_url=True, - include_service_keys_to_tags=not minimal, - include_duration=not minimal, - include_size=not minimal, - include_mime=not minimal, - ) - metas = ( - payload.get("metadata", - []) if isinstance(payload, - dict) else [] - ) - if isinstance(metas, list): - metadata_list = [ - m for m in metas if isinstance(m, dict) - ] - elif hashes: - payload = client.fetch_file_metadata( - hashes=hashes, - include_file_url=True, - include_service_keys_to_tags=not minimal, - include_duration=not minimal, - include_size=not minimal, - include_mime=not minimal, - ) - metas = ( - payload.get("metadata", - []) if 
isinstance(payload, - dict) else [] - ) - if isinstance(metas, list): - metadata_list = [ - m for m in metas if isinstance(m, dict) - ] - except Exception: - metadata_list = None + # Fast-path: exact URL via /add_urls/get_url_files when a full URL is provided. + try: + if pattern.startswith("http://") or pattern.startswith( + "https://"): + from API.HydrusNetwork import HydrusRequestSpec - # Fallback: substring scan - if metadata_list is None: - search_token = _clean_url_search_token(pattern_hint or pattern) - scan_limit_override: int | None = None - if search_token: - is_domain_only = ("://" not in search_token and "/" not in search_token) - if is_domain_only: - try: - scan_limit_override = max(int(limit or 100) * 20, 2000) - except Exception: - scan_limit_override = 2000 - metadata_list = _iter_url_filtered_metadata( - search_token, - want_any=False, - fetch_limit=int(limit) if limit else 100, - scan_limit=scan_limit_override, - needles=pattern_hints if pattern_hints else None, - minimal=minimal, - ) + spec = HydrusRequestSpec( + method="GET", + endpoint="/add_urls/get_url_files", + query={ + "url": pattern + }, + ) + response = client._perform_request( + spec + ) # type: ignore[attr-defined] + hashes = [] + file_ids = [] + if isinstance(response, dict): + raw_hashes = response.get("hashes") or response.get( + "file_hashes" + ) + if isinstance(raw_hashes, list): + hashes = [ + str(h).strip() for h in raw_hashes + if isinstance(h, str) and str(h).strip() + ] + raw_ids = response.get("file_ids") + if isinstance(raw_ids, list): + for item in raw_ids: + try: + file_ids.append(int(item)) + except (TypeError, ValueError): + continue + + if file_ids: + payload = client.fetch_file_metadata( + file_ids=file_ids, + include_file_url=True, + include_service_keys_to_tags=not minimal, + include_duration=not minimal, + include_size=not minimal, + include_mime=not minimal, + ) + metas = ( + payload.get("metadata", + []) if isinstance(payload, + dict) else [] + ) + if 
isinstance(metas, list): + metadata_list = [ + m for m in metas if isinstance(m, dict) + ] + elif hashes: + payload = client.fetch_file_metadata( + hashes=hashes, + include_file_url=True, + include_service_keys_to_tags=not minimal, + include_duration=not minimal, + include_size=not minimal, + include_mime=not minimal, + ) + metas = ( + payload.get("metadata", + []) if isinstance(payload, + dict) else [] + ) + if isinstance(metas, list): + metadata_list = [ + m for m in metas if isinstance(m, dict) + ] + except Exception: + metadata_list = None + + # Fallback: substring scan + if metadata_list is None: + search_token = _clean_url_search_token(pattern_hint or pattern) + scan_limit_override: int | None = None + if search_token: + is_domain_only = ("://" not in search_token and "/" not in search_token) + if is_domain_only: + try: + scan_limit_override = max(fetch_limit_raw * 20, 2000) + except Exception: + scan_limit_override = 2000 + metadata_list = _iter_url_filtered_metadata( + search_token, + want_any=False, + fetch_limit=fetch_limit_raw, + scan_limit=scan_limit_override, + needles=pattern_hints if pattern_hints else None, + minimal=minimal, + ) elif namespace == "system": normalized_system_predicate = pattern.strip() if normalized_system_predicate == "has url": diff --git a/cmdlet/get_url.py b/cmdlet/get_url.py index 8a2463f..a0cd595 100644 --- a/cmdlet/get_url.py +++ b/cmdlet/get_url.py @@ -366,6 +366,7 @@ class Get_Url(Cmdlet): store_name, pattern_hint=target_pattern, minimal=True, + url_only=True, ) if search_results is None: continue