This commit is contained in:
2026-01-24 01:38:12 -08:00
parent 4e4c374908
commit 3a4d3f029d
5 changed files with 210 additions and 229 deletions

View File

@@ -1,5 +1,5 @@
import re import re
from typing import Any, Dict, List, Set from typing import Any, Dict, List, Optional, Set
def value_normalize(value: Any) -> str: def value_normalize(value: Any) -> str:
@@ -19,6 +19,18 @@ def _add_tag(tags: List[str], namespace: str, value: str) -> None:
tags.append(candidate) tags.append(candidate)
def _extract_channel_from_tag(tag_value: str) -> Optional[str]:
"""Return the channel value if tag_value is namespaced with channel."""
if not tag_value:
return None
normalized = tag_value.strip().lower()
if not normalized.startswith("channel:"):
return None
_, _, remainder = normalized.partition(":")
remainder = remainder.strip()
return remainder or None
def extract_ytdlp_tags(entry: Dict[str, Any]) -> List[str]: def extract_ytdlp_tags(entry: Dict[str, Any]) -> List[str]:
""" """ """ """
tags: List[str] = [] tags: List[str] = []
@@ -67,7 +79,12 @@ def extract_ytdlp_tags(entry: Dict[str, Any]) -> List[str]:
for tag_value in tags_field: for tag_value in tags_field:
if tag_value: if tag_value:
normalized = value_normalize(str(tag_value)) normalized = value_normalize(str(tag_value))
if normalized and normalized not in tags: if not normalized:
continue
channel_candidate = _extract_channel_from_tag(normalized)
if channel_candidate:
_add_tag(tags, "channel", channel_candidate)
if normalized not in tags:
tags.append(normalized) tags.append(normalized)
elif isinstance(tags_field, dict): elif isinstance(tags_field, dict):
# Tags is dict: {"key": "val"} → tag:key:val # Tags is dict: {"key": "val"} → tag:key:val
@@ -83,10 +100,16 @@ def extract_ytdlp_tags(entry: Dict[str, Any]) -> List[str]:
if tag_str: if tag_str:
for tag_value in re.split(r'[,\s]+', tag_str): for tag_value in re.split(r'[,\s]+', tag_str):
tag_value = tag_value.strip() tag_value = tag_value.strip()
if tag_value: if not tag_value:
normalized = value_normalize(tag_value) continue
if normalized and normalized not in tags: normalized = value_normalize(tag_value)
tags.append(normalized) if not normalized:
continue
channel_candidate = _extract_channel_from_tag(normalized)
if channel_candidate:
_add_tag(tags, "channel", channel_candidate)
if normalized not in tags:
tags.append(normalized)
# Extract chapters as tags if present # Extract chapters as tags if present
chapters = entry.get("chapters") chapters = entry.get("chapters")

View File

@@ -454,6 +454,7 @@ class HydrusNetwork(Store):
results = storage["hydrus"].search("Simple Man") results = storage["hydrus"].search("Simple Man")
""" """
limit = kwargs.get("limit", 100) limit = kwargs.get("limit", 100)
minimal = bool(kwargs.get("minimal", False))
try: try:
client = self._client client = self._client
@@ -518,6 +519,8 @@ class HydrusNetwork(Store):
fetch_limit: int, fetch_limit: int,
scan_limit: int | None = None, scan_limit: int | None = None,
needles: Optional[Sequence[str]] = None, needles: Optional[Sequence[str]] = None,
*,
minimal: bool = False,
) -> list[dict[str, Any]]: ) -> list[dict[str, Any]]:
"""Best-effort URL search by scanning Hydrus metadata with include_file_url=True.""" """Best-effort URL search by scanning Hydrus metadata with include_file_url=True."""
@@ -620,19 +623,19 @@ class HydrusNetwork(Store):
payload = client.fetch_file_metadata( payload = client.fetch_file_metadata(
hashes=chunk, hashes=chunk,
include_file_url=True, include_file_url=True,
include_service_keys_to_tags=True, include_service_keys_to_tags=not minimal,
include_duration=True, include_duration=not minimal,
include_size=True, include_size=not minimal,
include_mime=True, include_mime=not minimal,
) )
else: else:
payload = client.fetch_file_metadata( payload = client.fetch_file_metadata(
file_ids=chunk, file_ids=chunk,
include_file_url=True, include_file_url=True,
include_service_keys_to_tags=True, include_service_keys_to_tags=not minimal,
include_duration=True, include_duration=not minimal,
include_size=True, include_size=not minimal,
include_mime=True, include_mime=not minimal,
) )
except Exception: except Exception:
continue continue
@@ -739,12 +742,14 @@ class HydrusNetwork(Store):
want_any=False, want_any=False,
fetch_limit=int(limit) if limit else 100, fetch_limit=int(limit) if limit else 100,
needles=pattern_hints, needles=pattern_hints,
minimal=minimal,
) )
else: else:
metadata_list = _iter_url_filtered_metadata( metadata_list = _iter_url_filtered_metadata(
None, None,
want_any=True, want_any=True,
fetch_limit=int(limit) if limit else 100 fetch_limit=int(limit) if limit else 100,
minimal=minimal,
) )
else: else:
def _clean_url_search_token(value: str | None) -> str: def _clean_url_search_token(value: str | None) -> str:
@@ -792,10 +797,10 @@ class HydrusNetwork(Store):
payload = client.fetch_file_metadata( payload = client.fetch_file_metadata(
file_ids=file_ids, file_ids=file_ids,
include_file_url=True, include_file_url=True,
include_service_keys_to_tags=True, include_service_keys_to_tags=not minimal,
include_duration=True, include_duration=not minimal,
include_size=True, include_size=not minimal,
include_mime=True, include_mime=not minimal,
) )
metas = ( metas = (
payload.get("metadata", payload.get("metadata",
@@ -810,10 +815,10 @@ class HydrusNetwork(Store):
payload = client.fetch_file_metadata( payload = client.fetch_file_metadata(
hashes=hashes, hashes=hashes,
include_file_url=True, include_file_url=True,
include_service_keys_to_tags=True, include_service_keys_to_tags=not minimal,
include_duration=True, include_duration=not minimal,
include_size=True, include_size=not minimal,
include_mime=True, include_mime=not minimal,
) )
metas = ( metas = (
payload.get("metadata", payload.get("metadata",
@@ -844,6 +849,7 @@ class HydrusNetwork(Store):
fetch_limit=int(limit) if limit else 100, fetch_limit=int(limit) if limit else 100,
scan_limit=scan_limit_override, scan_limit=scan_limit_override,
needles=pattern_hints if pattern_hints else None, needles=pattern_hints if pattern_hints else None,
minimal=minimal,
) )
elif namespace == "system": elif namespace == "system":
normalized_system_predicate = pattern.strip() normalized_system_predicate = pattern.strip()
@@ -857,6 +863,7 @@ class HydrusNetwork(Store):
want_any=not bool(pattern_hints), want_any=not bool(pattern_hints),
fetch_limit=fetch_limit, fetch_limit=fetch_limit,
needles=pattern_hints if pattern_hints else None, needles=pattern_hints if pattern_hints else None,
minimal=minimal,
) )
# Parse the query into tags # Parse the query into tags

View File

@@ -3253,6 +3253,20 @@ def check_url_exists_in_storage(
return out return out
def _dedupe_needles(raw_needles: Sequence[str]) -> List[str]:
output: List[str] = []
seen: set[str] = set()
for candidate in (raw_needles or []):
candidate_text = str(candidate or "").strip()
if not candidate_text:
continue
key = candidate_text.lower()
if key in seen:
continue
seen.add(key)
output.append(candidate_text)
return output
url_needles: Dict[str, List[str]] = {} url_needles: Dict[str, List[str]] = {}
for u in unique_urls: for u in unique_urls:
needles: List[str] = [] needles: List[str] = []
@@ -3301,7 +3315,8 @@ def check_url_exists_in_storage(
normalized.append(norm_extra) normalized.append(norm_extra)
combined = filtered + expanded + lowered + normalized combined = filtered + expanded + lowered + normalized
url_needles[u] = combined if combined else [u] deduped = _dedupe_needles(combined)
url_needles[u] = deduped if deduped else [u]
if in_pipeline: if in_pipeline:
preflight_cache = _load_preflight_cache() preflight_cache = _load_preflight_cache()
@@ -3341,7 +3356,10 @@ def check_url_exists_in_storage(
if _timed_out("before backend scan"): if _timed_out("before backend scan"):
return True return True
bulk_mode = len(unique_urls) > 1 # Use bulk mode only if we have a significant number of URLs.
# For small sets (1-3 URLs), individual targeted searches are faster
# and more accurate than scanning all files with URLs in the backend.
bulk_mode = len(unique_urls) > 3
def _build_bulk_patterns(needles_map: Dict[str, List[str]], max_per_url: int = 3, max_total: int = 240) -> List[str]: def _build_bulk_patterns(needles_map: Dict[str, List[str]], max_per_url: int = 3, max_total: int = 240) -> List[str]:
patterns: List[str] = [] patterns: List[str] = []
@@ -3455,6 +3473,24 @@ def check_url_exists_in_storage(
needles: Sequence[str], needles: Sequence[str],
) -> Optional[Dict[str, Any]]: ) -> Optional[Dict[str, Any]]:
backend_hits: List[Dict[str, Any]] = [] backend_hits: List[Dict[str, Any]] = []
# 1) Try exact match first (no wildcards).
# This is extremely fast for Hydrus and others that support direct URL lookup.
for needle in (needles or [])[:5]:
needle_stripped = str(needle or "").strip()
if not needle_stripped or not _httpish(needle_stripped):
continue
try:
# Use 'url:' prefix to ensure storage layers (like Hydrus) recognize it as a URL lookup
query = f"url:{needle_stripped}"
backend_hits = backend.search(query, limit=1, minimal=True) or []
if backend_hits:
return _build_display_row_for_hit(backend_hits[0], backend_name, original_url)
except Exception:
continue
# 2) Fallback to wildcard substring search for normalized variants.
# This is for backends where the URL might be stored differently (partial match).
for needle in (needles or [])[:3]: for needle in (needles or [])[:3]:
needle_text = str(needle or "").strip() needle_text = str(needle or "").strip()
if not needle_text: if not needle_text:
@@ -3462,7 +3498,7 @@ def check_url_exists_in_storage(
search_needle = _normalize_url_for_search(needle_text) or needle_text search_needle = _normalize_url_for_search(needle_text) or needle_text
query = f"url:*{search_needle}*" query = f"url:*{search_needle}*"
try: try:
backend_hits = backend.search(query, limit=1) or [] backend_hits = backend.search(query, limit=1, minimal=True) or []
if backend_hits: if backend_hits:
break break
except Exception: except Exception:
@@ -3540,61 +3576,6 @@ def check_url_exists_in_storage(
if _timed_out("hydrus scan"): if _timed_out("hydrus scan"):
return True return True
if bulk_mode and bulk_patterns:
bulk_hits: Optional[List[Any]] = None
bulk_limit = min(2000, max(200, len(unique_urls) * 8))
try:
bulk_hits = backend.search(
"url:*",
limit=bulk_limit,
pattern_hint=bulk_patterns,
) or []
except Exception:
try:
bulk_hits = backend.search("url:*", limit=bulk_limit) or []
except Exception:
bulk_hits = None
if bulk_hits is None:
debug("Bulk URL preflight: Hydrus bulk scan failed; skipping per-URL checks")
continue
for hit in bulk_hits:
if _timed_out("hydrus bulk scan"):
return True
if len(match_rows) >= max_rows:
break
url_values = _extract_urls_from_hit(hit, backend, allow_backend_lookup=False)
if not url_values:
continue
for original_url, needles in url_needles.items():
if _timed_out("hydrus bulk scan"):
return True
if len(match_rows) >= max_rows:
break
if (original_url, str(backend_name)) in seen_pairs:
continue
matched = False
for url_value in url_values:
for needle in (needles or []):
if _match_normalized_url(str(needle or ""), str(url_value or "")):
matched = True
break
if matched:
break
if not matched:
continue
seen_pairs.add((original_url, str(backend_name)))
matched_urls.add(original_url)
match_rows.append(
_build_display_row_for_hit(hit, str(backend_name), original_url)
)
continue
for original_url, needles in url_needles.items(): for original_url, needles in url_needles.items():
if _timed_out("hydrus per-url scan"): if _timed_out("hydrus per-url scan"):
return True return True
@@ -3616,7 +3597,6 @@ def check_url_exists_in_storage(
endpoint="/add_urls/get_url_files", endpoint="/add_urls/get_url_files",
query={"url": needle}, query={"url": needle},
) )
# Access internal client safely if possible, else skip check
if hasattr(client, "_perform_request"): if hasattr(client, "_perform_request"):
response = client._perform_request(spec) response = client._perform_request(spec)
raw_hashes = None raw_hashes = None
@@ -3638,11 +3618,6 @@ def check_url_exists_in_storage(
continue continue
if not found: if not found:
fallback_row = _search_backend_url_hits(backend, str(backend_name), original_url, needles)
if fallback_row:
seen_pairs.add((original_url, str(backend_name)))
matched_urls.add(original_url)
match_rows.append(fallback_row)
continue continue
seen_pairs.add((original_url, str(backend_name))) seen_pairs.add((original_url, str(backend_name)))

View File

@@ -7,6 +7,7 @@ import sys
import shutil import shutil
import tempfile import tempfile
import re import re
from urllib.parse import urlparse
from SYS import models from SYS import models
from SYS import pipeline as ctx from SYS import pipeline as ctx
@@ -14,6 +15,7 @@ from SYS.logger import log, debug, is_debug_enabled
from SYS.pipeline_progress import PipelineProgress from SYS.pipeline_progress import PipelineProgress
from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS
from Store import Store from Store import Store
from API.HTTP import _download_direct_file
from . import _shared as sh from . import _shared as sh
Cmdlet = sh.Cmdlet Cmdlet = sh.Cmdlet
@@ -34,7 +36,7 @@ coerce_to_path = sh.coerce_to_path
build_pipeline_preview = sh.build_pipeline_preview build_pipeline_preview = sh.build_pipeline_preview
get_field = sh.get_field get_field = sh.get_field
from SYS.utils import sha256_file, unique_path from SYS.utils import sha256_file, unique_path, sanitize_filename
from SYS.metadata import write_metadata from SYS.metadata import write_metadata
# Canonical supported filetypes for all stores/cmdlets # Canonical supported filetypes for all stores/cmdlets
@@ -1079,6 +1081,62 @@ class Add_File(Cmdlet):
pass pass
return None, None return None, None
@staticmethod
def _build_provider_filename(
    pipe_obj: models.PipeObject,
    fallback_hash: Optional[str] = None,
    source_url: Optional[str] = None,
) -> str:
    """Derive a sanitized download filename for ``pipe_obj``.

    Name stem, first non-blank value wins:
      1. ``pipe_obj.title``
      2. ``extra["name"]`` or ``extra["title"]``
      3. ``metadata["title"]`` or ``metadata["name"]``
      4. the first 8 characters of ``fallback_hash``
      5. the literal ``"download"``

    Extension, first non-empty value wins:
      1. ``metadata["ext"]`` or ``metadata["extension"]``
      2. ``extra["ext"]``
      3. the suffix of the path component of ``source_url``

    The extension is appended to the sanitized stem unless the stem already
    ends with it (compared case-insensitively).
    """
    title_value = getattr(pipe_obj, "title", "")
    extra = getattr(pipe_obj, "extra", {})
    metadata = getattr(pipe_obj, "metadata", {})

    # Non-dict containers are treated as empty so the lookups below stay flat.
    extra_map = extra if isinstance(extra, dict) else {}
    meta_map = metadata if isinstance(metadata, dict) else {}

    raw_titles = (
        title_value,
        extra_map.get("name") or extra_map.get("title"),
        meta_map.get("title") or meta_map.get("name"),
    )

    stem = ""
    for raw in raw_titles:
        if not raw:
            continue
        stem = str(raw).strip()
        if stem:
            break

    if not stem and fallback_hash:
        stem = fallback_hash[:8]

    safe_name = sanitize_filename(stem or "download")

    ext = meta_map.get("ext") or meta_map.get("extension") or ""
    if not ext:
        ext = extra_map.get("ext") or ""
    if not ext and source_url:
        # Last resort: take whatever suffix the URL path carries.
        try:
            ext = Path(urlparse(source_url).path).suffix.lstrip(".")
        except Exception:
            ext = ""

    if not ext:
        return safe_name or "download"

    suffix = str(ext)
    if not suffix.startswith("."):
        suffix = f".{suffix}"

    if not safe_name.lower().endswith(suffix.lower()):
        safe_name = f"{safe_name}{suffix}"

    return safe_name or "download"
@staticmethod @staticmethod
def _resolve_backend_by_name(store: Any, backend_name: str) -> Optional[Any]: def _resolve_backend_by_name(store: Any, backend_name: str) -> Optional[Any]:
if not store or not backend_name: if not store or not backend_name:
@@ -1219,6 +1277,32 @@ class Add_File(Cmdlet):
) )
if dl_path and dl_path.exists(): if dl_path and dl_path.exists():
return dl_path, str(r_hash), tmp_dir return dl_path, str(r_hash), tmp_dir
source_url = str(source).strip()
if source_url.lower().startswith(("http://", "https://")):
download_dir = Path(tempfile.mkdtemp(prefix="add-file-src-"))
try:
filename = Add_File._build_provider_filename(
pipe_obj,
str(r_hash),
source_url,
)
downloaded = _download_direct_file(
source_url,
download_dir,
quiet=True,
suggested_filename=filename,
)
downloaded_path = downloaded.path
if downloaded_path and downloaded_path.exists():
pipe_obj.is_temp = True
pipe_obj.path = str(downloaded_path)
return downloaded_path, str(r_hash), download_dir
except Exception as exc:
debug(f"[add-file] Provider download failed: {exc}")
try:
shutil.rmtree(download_dir, ignore_errors=True)
except Exception:
pass
except Exception: except Exception:
pass pass

View File

@@ -241,95 +241,32 @@ class Get_Url(Cmdlet):
return None return None
@staticmethod @staticmethod
def _resolve_title_for_hash(backend: Any, file_hash: str, hit: Any = None) -> str: def _extract_size_from_hit(hit: Any) -> int | None:
"""Best-effort title resolution for a found hash. for key in ("size", "file_size", "filesize", "size_bytes"):
try:
Strategy: val = get_field(hit, key)
- Use the hit's existing title/columns when present. except Exception:
- Prefer backend.get_metadata(hash) when available (direct lookup). val = None
- Fallback to backend.search('hash:<sha>', limit=1) and read title. if val is None:
""" continue
try: if isinstance(val, (int, float)):
if hit is not None: return int(val)
from_hit = Get_Url._extract_title_from_result(hit) try:
if from_hit: return int(val)
return from_hit except Exception:
except Exception: continue
pass return None
try:
if hasattr(backend, "get_metadata"):
meta = backend.get_metadata(file_hash)
if isinstance(meta, dict):
t = meta.get("title")
if isinstance(t, str) and t.strip():
return t.strip()
except Exception:
pass
try:
if hasattr(backend, "search"):
hits = backend.search(f"hash:{file_hash}", limit=1)
if isinstance(hits, list) and hits:
t2 = Get_Url._extract_title_from_result(hits[0])
if t2:
return t2
except Exception:
pass
return ""
@staticmethod @staticmethod
def _resolve_size_ext_for_hash(backend: Any, file_hash: str, hit: Any = None) -> tuple[int | None, str]: def _extract_ext_from_hit(hit: Any) -> str:
"""Best-effort (size, ext) resolution for a found hash.""" for key in ("ext", "extension"):
# First: see if the hit already includes these fields. try:
try: ext_val = get_field(hit, key)
size_val = get_field(hit, "size") except Exception:
if size_val is None: ext_val = None
size_val = get_field(hit, "file_size") if isinstance(ext_val, str) and ext_val.strip():
if size_val is None: return ext_val.strip().lstrip(".")
size_val = get_field(hit, "filesize") return ""
if size_val is None:
size_val = get_field(hit, "size_bytes")
size_int = int(size_val) if isinstance(size_val, (int, float)) else None
except Exception:
size_int = None
try:
ext_val = get_field(hit, "ext")
if ext_val is None:
ext_val = get_field(hit, "extension")
ext = str(ext_val).strip().lstrip(".") if isinstance(ext_val, str) else ""
except Exception:
ext = ""
if size_int is not None or ext:
return size_int, ext
# Next: backend.get_metadata(hash) when available.
try:
if hasattr(backend, "get_metadata"):
meta = backend.get_metadata(file_hash)
if isinstance(meta, dict):
size_val2 = meta.get("size")
if size_val2 is None:
size_val2 = meta.get("file_size")
if size_val2 is None:
size_val2 = meta.get("filesize")
if size_val2 is None:
size_val2 = meta.get("size_bytes")
if isinstance(size_val2, (int, float)):
size_int = int(size_val2)
ext_val2 = meta.get("ext")
if ext_val2 is None:
ext_val2 = meta.get("extension")
if isinstance(ext_val2, str) and ext_val2.strip():
ext = ext_val2.strip().lstrip(".")
except Exception:
pass
return size_int, ext
def _search_urls_across_stores(self, def _search_urls_across_stores(self,
pattern: str, pattern: str,
@@ -360,9 +297,6 @@ class Get_Url(Cmdlet):
try: try:
backend = storage[store_name] backend = storage[store_name]
title_cache: Dict[str, str] = {}
meta_cache: Dict[str, tuple[int | None, str]] = {}
# Search only URL-bearing records using the backend's URL search capability. # Search only URL-bearing records using the backend's URL search capability.
# This avoids the expensive/incorrect "search('*')" scan. # This avoids the expensive/incorrect "search('*')" scan.
try: try:
@@ -431,22 +365,12 @@ class Get_Url(Cmdlet):
search_limit, search_limit,
store_name, store_name,
pattern_hint=target_pattern, pattern_hint=target_pattern,
minimal=True,
) )
if search_results is None: if search_results is None:
continue continue
search_results = search_results or [] search_results = search_results or []
if not search_results and target_pattern and not has_wildcards:
fallback_results = self._execute_search_with_timeout(
backend,
"url:*",
search_limit,
store_name,
pattern_hint=target_pattern,
)
if fallback_results is None:
continue
search_results = fallback_results or []
for hit in (search_results or []): for hit in (search_results or []):
if len(items) >= MAX_RESULTS: if len(items) >= MAX_RESULTS:
@@ -459,44 +383,9 @@ class Get_Url(Cmdlet):
file_hash = str(file_hash) file_hash = str(file_hash)
title = title_cache.get(file_hash, "") title = self._extract_title_from_result(hit) or ""
if not title: size = self._extract_size_from_hit(hit)
try: ext = self._extract_ext_from_hit(hit)
title = (
get_field(hit, "title")
or get_field(hit, "name")
or get_field(hit, "file_title")
or ""
)
except Exception:
title = ""
if not title:
title = self._resolve_title_for_hash(backend, file_hash, hit)
title_cache[file_hash] = title
size, ext = meta_cache.get(file_hash, (None, ""))
if size is None and not ext:
try:
size = get_field(hit, "size")
if size is None:
size = get_field(hit, "size_bytes")
if size is None:
size = get_field(hit, "file_size")
if size is None:
size = get_field(hit, "filesize")
size = int(size) if isinstance(size, (int, float)) else None
except Exception:
size = None
try:
ext = get_field(hit, "ext") or get_field(hit, "extension")
ext = str(ext).strip().lstrip(".") if isinstance(ext, str) else ""
except Exception:
ext = ""
if size is None and not ext:
size, ext = self._resolve_size_ext_for_hash(backend, file_hash, hit)
meta_cache[file_hash] = (size, ext)
urls = self._extract_urls_from_hit(hit) urls = self._extract_urls_from_hit(hit)
if not urls: if not urls:
@@ -505,6 +394,7 @@ class Get_Url(Cmdlet):
except Exception: except Exception:
urls = [] urls = []
hit_added = False
for url in (urls or []): for url in (urls or []):
if len(items) >= MAX_RESULTS: if len(items) >= MAX_RESULTS:
break break
@@ -526,7 +416,9 @@ class Get_Url(Cmdlet):
ext=str(ext or ""), ext=str(ext or ""),
) )
) )
found_stores.add(str(store_name)) hit_added = True
if hit_added:
found_stores.add(str(store_name))
if len(items) >= MAX_RESULTS: if len(items) >= MAX_RESULTS:
break break
except Exception as exc: except Exception as exc: