diff --git a/SYS/yt_metadata.py b/SYS/yt_metadata.py index b0d5862..b7bc2c2 100644 --- a/SYS/yt_metadata.py +++ b/SYS/yt_metadata.py @@ -1,5 +1,5 @@ import re -from typing import Any, Dict, List, Set +from typing import Any, Dict, List, Optional, Set def value_normalize(value: Any) -> str: @@ -19,6 +19,18 @@ def _add_tag(tags: List[str], namespace: str, value: str) -> None: tags.append(candidate) +def _extract_channel_from_tag(tag_value: str) -> Optional[str]: + """Return the channel value if tag_value is namespaced with channel.""" + if not tag_value: + return None + normalized = tag_value.strip().lower() + if not normalized.startswith("channel:"): + return None + _, _, remainder = normalized.partition(":") + remainder = remainder.strip() + return remainder or None + + def extract_ytdlp_tags(entry: Dict[str, Any]) -> List[str]: """ """ tags: List[str] = [] @@ -67,7 +79,12 @@ def extract_ytdlp_tags(entry: Dict[str, Any]) -> List[str]: for tag_value in tags_field: if tag_value: normalized = value_normalize(str(tag_value)) - if normalized and normalized not in tags: + if not normalized: + continue + channel_candidate = _extract_channel_from_tag(normalized) + if channel_candidate: + _add_tag(tags, "channel", channel_candidate) + if normalized not in tags: tags.append(normalized) elif isinstance(tags_field, dict): # Tags is dict: {"key": "val"} → tag:key:val @@ -83,10 +100,16 @@ def extract_ytdlp_tags(entry: Dict[str, Any]) -> List[str]: if tag_str: for tag_value in re.split(r'[,\s]+', tag_str): tag_value = tag_value.strip() - if tag_value: - normalized = value_normalize(tag_value) - if normalized and normalized not in tags: - tags.append(normalized) + if not tag_value: + continue + normalized = value_normalize(tag_value) + if not normalized: + continue + channel_candidate = _extract_channel_from_tag(normalized) + if channel_candidate: + _add_tag(tags, "channel", channel_candidate) + if normalized not in tags: + tags.append(normalized) # Extract chapters as tags if present chapters = entry.get("chapters") diff --git a/Store/HydrusNetwork.py b/Store/HydrusNetwork.py index 44f6401..c365a9b 100644 --- a/Store/HydrusNetwork.py +++ b/Store/HydrusNetwork.py @@ -454,6 +454,7 @@ class HydrusNetwork(Store): results = storage["hydrus"].search("Simple Man") """ limit = kwargs.get("limit", 100) + minimal = bool(kwargs.get("minimal", False)) try: client = self._client @@ -518,6 +519,8 @@ class HydrusNetwork(Store): fetch_limit: int, scan_limit: int | None = None, needles: Optional[Sequence[str]] = None, + *, + minimal: bool = False, ) -> list[dict[str, Any]]: """Best-effort URL search by scanning Hydrus metadata with include_file_url=True.""" @@ -620,19 +623,19 @@ class HydrusNetwork(Store): payload = client.fetch_file_metadata( hashes=chunk, include_file_url=True, - include_service_keys_to_tags=True, - include_duration=True, - include_size=True, - include_mime=True, + include_service_keys_to_tags=not minimal, + include_duration=not minimal, + include_size=not minimal, + include_mime=not minimal, ) else: payload = client.fetch_file_metadata( file_ids=chunk, include_file_url=True, - include_service_keys_to_tags=True, - include_duration=True, - include_size=True, - include_mime=True, + include_service_keys_to_tags=not minimal, + include_duration=not minimal, + include_size=not minimal, + include_mime=not minimal, ) except Exception: continue @@ -739,12 +742,14 @@ class HydrusNetwork(Store): want_any=False, fetch_limit=int(limit) if limit else 100, needles=pattern_hints, + minimal=minimal, ) else: metadata_list = _iter_url_filtered_metadata( None, want_any=True, - fetch_limit=int(limit) if limit else 100 + fetch_limit=int(limit) if limit else 100, + minimal=minimal, ) else: def _clean_url_search_token(value: str | None) -> str: @@ -792,10 +797,10 @@ class HydrusNetwork(Store): payload = client.fetch_file_metadata( file_ids=file_ids, include_file_url=True, - include_service_keys_to_tags=True, - include_duration=True, - include_size=True, - include_mime=True, + include_service_keys_to_tags=not minimal, + include_duration=not minimal, + include_size=not minimal, + include_mime=not minimal, ) metas = ( payload.get("metadata", @@ -810,10 +815,10 @@ class HydrusNetwork(Store): payload = client.fetch_file_metadata( hashes=hashes, include_file_url=True, - include_service_keys_to_tags=True, - include_duration=True, - include_size=True, - include_mime=True, + include_service_keys_to_tags=not minimal, + include_duration=not minimal, + include_size=not minimal, + include_mime=not minimal, ) metas = ( payload.get("metadata", @@ -844,6 +849,7 @@ class HydrusNetwork(Store): fetch_limit=int(limit) if limit else 100, scan_limit=scan_limit_override, needles=pattern_hints if pattern_hints else None, + minimal=minimal, ) elif namespace == "system": normalized_system_predicate = pattern.strip() @@ -857,6 +863,7 @@ class HydrusNetwork(Store): want_any=not bool(pattern_hints), fetch_limit=fetch_limit, needles=pattern_hints if pattern_hints else None, + minimal=minimal, ) # Parse the query into tags diff --git a/cmdlet/_shared.py b/cmdlet/_shared.py index f38f481..cfa276c 100644 --- a/cmdlet/_shared.py +++ b/cmdlet/_shared.py @@ -3253,6 +3253,20 @@ def check_url_exists_in_storage( return out + def _dedupe_needles(raw_needles: Sequence[str]) -> List[str]: + output: List[str] = [] + seen: set[str] = set() + for candidate in (raw_needles or []): + candidate_text = str(candidate or "").strip() + if not candidate_text: + continue + key = candidate_text.lower() + if key in seen: + continue + seen.add(key) + output.append(candidate_text) + return output + url_needles: Dict[str, List[str]] = {} for u in unique_urls: needles: List[str] = [] @@ -3301,7 +3315,8 @@ def check_url_exists_in_storage( normalized.append(norm_extra) combined = filtered + expanded + lowered + normalized - url_needles[u] = combined if combined else [u] + deduped = _dedupe_needles(combined) + url_needles[u] = deduped if deduped else [u] if in_pipeline: preflight_cache = _load_preflight_cache() @@ -3341,7 +3356,10 @@ def check_url_exists_in_storage( if _timed_out("before backend scan"): return True - bulk_mode = len(unique_urls) > 1 + # Use bulk mode only if we have a significant number of URLs. + # For small sets (1-3 URLs), individual targeted searches are faster + # and more accurate than scanning all files with URLs in the backend. + bulk_mode = len(unique_urls) > 3 def _build_bulk_patterns(needles_map: Dict[str, List[str]], max_per_url: int = 3, max_total: int = 240) -> List[str]: patterns: List[str] = [] @@ -3455,6 +3473,24 @@ def check_url_exists_in_storage( needles: Sequence[str], ) -> Optional[Dict[str, Any]]: backend_hits: List[Dict[str, Any]] = [] + + # 1) Try exact match first (no wildcards). + # This is extremely fast for Hydrus and others that support direct URL lookup. + for needle in (needles or [])[:5]: + needle_stripped = str(needle or "").strip() + if not needle_stripped or not _httpish(needle_stripped): + continue + try: + # Use 'url:' prefix to ensure storage layers (like Hydrus) recognize it as a URL lookup + query = f"url:{needle_stripped}" + backend_hits = backend.search(query, limit=1, minimal=True) or [] + if backend_hits: + return _build_display_row_for_hit(backend_hits[0], backend_name, original_url) + except Exception: + continue + + # 2) Fallback to wildcard substring search for normalized variants. + # This is for backends where the URL might be stored differently (partial match). for needle in (needles or [])[:3]: needle_text = str(needle or "").strip() if not needle_text: @@ -3462,7 +3498,7 @@ def check_url_exists_in_storage( search_needle = _normalize_url_for_search(needle_text) or needle_text query = f"url:*{search_needle}*" try: - backend_hits = backend.search(query, limit=1) or [] + backend_hits = backend.search(query, limit=1, minimal=True) or [] if backend_hits: break except Exception: @@ -3540,61 +3576,6 @@ def check_url_exists_in_storage( if _timed_out("hydrus scan"): return True - if bulk_mode and bulk_patterns: - bulk_hits: Optional[List[Any]] = None - bulk_limit = min(2000, max(200, len(unique_urls) * 8)) - try: - bulk_hits = backend.search( - "url:*", - limit=bulk_limit, - pattern_hint=bulk_patterns, - ) or [] - except Exception: - try: - bulk_hits = backend.search("url:*", limit=bulk_limit) or [] - except Exception: - bulk_hits = None - - if bulk_hits is None: - debug("Bulk URL preflight: Hydrus bulk scan failed; skipping per-URL checks") - continue - - for hit in bulk_hits: - if _timed_out("hydrus bulk scan"): - return True - if len(match_rows) >= max_rows: - break - url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False) - if not url_values: - continue - - for original_url, needles in url_needles.items(): - if _timed_out("hydrus bulk scan"): - return True - if len(match_rows) >= max_rows: - break - if (original_url, str(backend_name)) in seen_pairs: - continue - - matched = False - for url_value in url_values: - for needle in (needles or []): - if _match_normalized_url(str(needle or ""), str(url_value or "")): - matched = True - break - if matched: - break - - if not matched: - continue - - seen_pairs.add((original_url, str(backend_name))) - matched_urls.add(original_url) - match_rows.append( - _build_display_row_for_hit(hit, str(backend_name), original_url) - ) - continue - for original_url, needles in url_needles.items(): if _timed_out("hydrus per-url scan"): return True @@ -3616,8 +3597,7 @@ def check_url_exists_in_storage( endpoint="/add_urls/get_url_files", query={"url": needle}, ) - # Access internal client safely if possible, else skip check - if hasattr(client, "_perform_request"): + if hasattr(client, "_perform_request"): response = client._perform_request(spec) raw_hashes = None if isinstance(response, dict): @@ -3638,11 +3618,6 @@ def check_url_exists_in_storage( continue if not found: - fallback_row = _search_backend_url_hits(backend, str(backend_name), original_url, needles) - if fallback_row: - seen_pairs.add((original_url, str(backend_name))) - matched_urls.add(original_url) - match_rows.append(fallback_row) continue seen_pairs.add((original_url, str(backend_name))) diff --git a/cmdlet/add_file.py b/cmdlet/add_file.py index 392397f..1ae3da7 100644 --- a/cmdlet/add_file.py +++ b/cmdlet/add_file.py @@ -7,6 +7,7 @@ import sys import shutil import tempfile import re +from urllib.parse import urlparse from SYS import models from SYS import pipeline as ctx @@ -14,6 +15,7 @@ from SYS.logger import log, debug, is_debug_enabled from SYS.pipeline_progress import PipelineProgress from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS from Store import Store +from API.HTTP import _download_direct_file from . import _shared as sh Cmdlet = sh.Cmdlet @@ -34,7 +36,7 @@ coerce_to_path = sh.coerce_to_path build_pipeline_preview = sh.build_pipeline_preview get_field = sh.get_field -from SYS.utils import sha256_file, unique_path +from SYS.utils import sha256_file, unique_path, sanitize_filename from SYS.metadata import write_metadata # Canonical supported filetypes for all stores/cmdlets @@ -1079,6 +1081,62 @@ class Add_File(Cmdlet): pass return None, None + @staticmethod + def _build_provider_filename( + pipe_obj: models.PipeObject, + fallback_hash: Optional[str] = None, + source_url: Optional[str] = None, + ) -> str: + title_candidates: List[str] = [] + title_value = getattr(pipe_obj, "title", "") + if title_value: + title_candidates.append(str(title_value)) + + extra = getattr(pipe_obj, "extra", {}) + if isinstance(extra, dict): + candid = extra.get("name") or extra.get("title") + if candid: + title_candidates.append(str(candid)) + + metadata = getattr(pipe_obj, "metadata", {}) + if isinstance(metadata, dict): + meta_name = metadata.get("title") or metadata.get("name") + if meta_name: + title_candidates.append(str(meta_name)) + + text = "" + for candidate in title_candidates: + if candidate: + text = candidate.strip() + if text: + break + + if not text and fallback_hash: + text = fallback_hash[:8] + + safe_name = sanitize_filename(text or "download") + + ext = "" + if isinstance(metadata, dict): + ext = metadata.get("ext") or metadata.get("extension") or "" + if not ext and isinstance(extra, dict): + ext = extra.get("ext") or "" + if not ext and source_url: + try: + parsed = urlparse(source_url) + ext = Path(parsed.path).suffix.lstrip(".") + except Exception: + ext = "" + + if ext: + ext_text = str(ext) + if not ext_text.startswith("."): + ext_text = "." + ext_text.lstrip(".") + if not safe_name.lower().endswith(ext_text.lower()): + safe_name = f"{safe_name}{ext_text}" + + return safe_name or "download" + @staticmethod def _resolve_backend_by_name(store: Any, backend_name: str) -> Optional[Any]: if not store or not backend_name: @@ -1219,6 +1277,32 @@ class Add_File(Cmdlet): ) if dl_path and dl_path.exists(): return dl_path, str(r_hash), tmp_dir + source_url = str(source).strip() + if source_url.lower().startswith(("http://", "https://")): + download_dir = Path(tempfile.mkdtemp(prefix="add-file-src-")) + try: + filename = Add_File._build_provider_filename( + pipe_obj, + str(r_hash), + source_url, + ) + downloaded = _download_direct_file( + source_url, + download_dir, + quiet=True, + suggested_filename=filename, + ) + downloaded_path = downloaded.path + if downloaded_path and downloaded_path.exists(): + pipe_obj.is_temp = True + pipe_obj.path = str(downloaded_path) + return downloaded_path, str(r_hash), download_dir + except Exception as exc: + debug(f"[add-file] Provider download failed: {exc}") + try: + shutil.rmtree(download_dir, ignore_errors=True) + except Exception: + pass except Exception: pass diff --git a/cmdlet/get_url.py b/cmdlet/get_url.py index 398f56c..8a2463f 100644 --- a/cmdlet/get_url.py +++ b/cmdlet/get_url.py @@ -241,95 +241,32 @@ class Get_Url(Cmdlet): return None @staticmethod - def _resolve_title_for_hash(backend: Any, file_hash: str, hit: Any = None) -> str: - """Best-effort title resolution for a found hash. - - Strategy: - - Use the hit's existing title/columns when present. - - Prefer backend.get_metadata(hash) when available (direct lookup). - - Fallback to backend.search('hash:', limit=1) and read title. - """ - try: - if hit is not None: - from_hit = Get_Url._extract_title_from_result(hit) - if from_hit: - return from_hit - except Exception: - pass - - try: - if hasattr(backend, "get_metadata"): - meta = backend.get_metadata(file_hash) - if isinstance(meta, dict): - t = meta.get("title") - if isinstance(t, str) and t.strip(): - return t.strip() - except Exception: - pass - - try: - if hasattr(backend, "search"): - hits = backend.search(f"hash:{file_hash}", limit=1) - if isinstance(hits, list) and hits: - t2 = Get_Url._extract_title_from_result(hits[0]) - if t2: - return t2 - except Exception: - pass - - return "" + def _extract_size_from_hit(hit: Any) -> int | None: + for key in ("size", "file_size", "filesize", "size_bytes"): + try: + val = get_field(hit, key) + except Exception: + val = None + if val is None: + continue + if isinstance(val, (int, float)): + return int(val) + try: + return int(val) + except Exception: + continue + return None @staticmethod - def _resolve_size_ext_for_hash(backend: Any, file_hash: str, hit: Any = None) -> tuple[int | None, str]: - """Best-effort (size, ext) resolution for a found hash.""" - # First: see if the hit already includes these fields. - try: - size_val = get_field(hit, "size") - if size_val is None: - size_val = get_field(hit, "file_size") - if size_val is None: - size_val = get_field(hit, "filesize") - if size_val is None: - size_val = get_field(hit, "size_bytes") - size_int = int(size_val) if isinstance(size_val, (int, float)) else None - except Exception: - size_int = None - - try: - ext_val = get_field(hit, "ext") - if ext_val is None: - ext_val = get_field(hit, "extension") - ext = str(ext_val).strip().lstrip(".") if isinstance(ext_val, str) else "" - except Exception: - ext = "" - - if size_int is not None or ext: - return size_int, ext - - # Next: backend.get_metadata(hash) when available. - try: - if hasattr(backend, "get_metadata"): - meta = backend.get_metadata(file_hash) - if isinstance(meta, dict): - size_val2 = meta.get("size") - if size_val2 is None: - size_val2 = meta.get("file_size") - if size_val2 is None: - size_val2 = meta.get("filesize") - if size_val2 is None: - size_val2 = meta.get("size_bytes") - if isinstance(size_val2, (int, float)): - size_int = int(size_val2) - - ext_val2 = meta.get("ext") - if ext_val2 is None: - ext_val2 = meta.get("extension") - if isinstance(ext_val2, str) and ext_val2.strip(): - ext = ext_val2.strip().lstrip(".") - except Exception: - pass - - return size_int, ext + def _extract_ext_from_hit(hit: Any) -> str: + for key in ("ext", "extension"): + try: + ext_val = get_field(hit, key) + except Exception: + ext_val = None + if isinstance(ext_val, str) and ext_val.strip(): + return ext_val.strip().lstrip(".") + return "" def _search_urls_across_stores(self, pattern: str, @@ -360,9 +297,6 @@ class Get_Url(Cmdlet): try: backend = storage[store_name] - title_cache: Dict[str, str] = {} - meta_cache: Dict[str, tuple[int | None, str]] = {} - # Search only URL-bearing records using the backend's URL search capability. # This avoids the expensive/incorrect "search('*')" scan. try: @@ -431,22 +365,12 @@ class Get_Url(Cmdlet): search_limit, store_name, pattern_hint=target_pattern, + minimal=True, ) if search_results is None: continue search_results = search_results or [] - if not search_results and target_pattern and not has_wildcards: - fallback_results = self._execute_search_with_timeout( - backend, - "url:*", - search_limit, - store_name, - pattern_hint=target_pattern, - ) - if fallback_results is None: - continue - search_results = fallback_results or [] for hit in (search_results or []): if len(items) >= MAX_RESULTS: @@ -459,44 +383,9 @@ class Get_Url(Cmdlet): file_hash = str(file_hash) - title = title_cache.get(file_hash, "") - if not title: - try: - title = ( - get_field(hit, "title") - or get_field(hit, "name") - or get_field(hit, "file_title") - or "" - ) - except Exception: - title = "" - if not title: - title = self._resolve_title_for_hash(backend, file_hash, hit) - title_cache[file_hash] = title - - size, ext = meta_cache.get(file_hash, (None, "")) - if size is None and not ext: - try: - size = get_field(hit, "size") - if size is None: - size = get_field(hit, "size_bytes") - if size is None: - size = get_field(hit, "file_size") - if size is None: - size = get_field(hit, "filesize") - size = int(size) if isinstance(size, (int, float)) else None - except Exception: - size = None - - try: - ext = get_field(hit, "ext") or get_field(hit, "extension") - ext = str(ext).strip().lstrip(".") if isinstance(ext, str) else "" - except Exception: - ext = "" - - if size is None and not ext: - size, ext = self._resolve_size_ext_for_hash(backend, file_hash, hit) - meta_cache[file_hash] = (size, ext) + title = self._extract_title_from_result(hit) or "" + size = self._extract_size_from_hit(hit) + ext = self._extract_ext_from_hit(hit) urls = self._extract_urls_from_hit(hit) if not urls: @@ -505,6 +394,7 @@ class Get_Url(Cmdlet): except Exception: urls = [] + hit_added = False for url in (urls or []): if len(items) >= MAX_RESULTS: break @@ -526,7 +416,9 @@ class Get_Url(Cmdlet): ext=str(ext or ""), ) ) - found_stores.add(str(store_name)) + hit_added = True + if hit_added: + found_stores.add(str(store_name)) if len(items) >= MAX_RESULTS: break except Exception as exc: