From 922b649e1778138409f5461d6263f6146aca82e8 Mon Sep 17 00:00:00 2001 From: Nose Date: Tue, 20 Jan 2026 16:42:49 -0800 Subject: [PATCH] j --- API/HTTP.py | 4 +- API/data/alldebrid.json | 4 +- Provider/metadata_provider.py | 2 +- SYS/metadata.py | 67 +------------- SYS/models.py | 24 +++-- SYS/pipeline.py | 3 + SYS/yt_metadata.py | 102 +++++++++++++++++++++ cmdlet/get_tag.py | 161 ++++++++++++++++++++++++---------- tool/ytdlp.py | 125 ++++++++++++++++++++++---- 9 files changed, 351 insertions(+), 141 deletions(-) create mode 100644 SYS/yt_metadata.py diff --git a/API/HTTP.py b/API/HTTP.py index 4592797..79b3f48 100644 --- a/API/HTTP.py +++ b/API/HTTP.py @@ -25,7 +25,7 @@ from SYS.models import DebugLogger, DownloadError, DownloadMediaResult, Progress from SYS.utils import ensure_directory, sha256_file try: # Optional; used for metadata extraction when available - from SYS.metadata import extract_ytdlp_tags + from SYS.yt_metadata import extract_ytdlp_tags except Exception: # pragma: no cover - optional dependency extract_ytdlp_tags = None # type: ignore[assignment] @@ -56,7 +56,7 @@ def _resolve_verify_value(verify_ssl: bool) -> Union[bool, str]: mod = sys.modules.get(mod_name) if mod is None: mod = __import__(mod_name) - except Exception: + except (ImportError, ModuleNotFoundError): return None # Common APIs that return a bundle path diff --git a/API/data/alldebrid.json b/API/data/alldebrid.json index 57188d9..cc1a77e 100644 --- a/API/data/alldebrid.json +++ b/API/data/alldebrid.json @@ -353,7 +353,7 @@ "filedot\\.(xyz|to|top)/([0-9a-zA-Z]{12})" ], "regexp": "filedot\\.(xyz|to|top)/([0-9a-zA-Z]{12})", - "status": false + "status": true }, "filefactory": { "name": "filefactory", @@ -622,7 +622,7 @@ "(simfileshare\\.net/download/[0-9]+/)" ], "regexp": "(simfileshare\\.net/download/[0-9]+/)", - "status": false + "status": true }, "streamtape": { "name": "streamtape", diff --git a/Provider/metadata_provider.py b/Provider/metadata_provider.py index 0471031..4a66b77 100644 --- a/Provider/metadata_provider.py +++ b/Provider/metadata_provider.py @@ -852,7 +852,7 @@ class YtdlpMetadataProvider(MetadataProvider): tags: List[str] = [] try: - from SYS.metadata import extract_ytdlp_tags + from SYS.yt_metadata import extract_ytdlp_tags except Exception: extract_ytdlp_tags = None # type: ignore[assignment] diff --git a/SYS/metadata.py b/SYS/metadata.py index a91c6e2..124587d 100644 --- a/SYS/metadata.py +++ b/SYS/metadata.py @@ -7,7 +7,7 @@ from SYS.logger import log, debug from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple -from API.HydrusNetwork import apply_hydrus_tag_mutation, fetch_hydrus_metadata, fetch_hydrus_metadata_by_url +from SYS.yt_metadata import extract_ytdlp_tags try: # Optional; used when available for richer metadata fetches import yt_dlp @@ -918,72 +918,7 @@ def apply_tag_mutation(payload: Dict[str, } -def extract_ytdlp_tags(entry: Dict[str, Any]) -> List[str]: - """ """ - tags: List[str] = [] - seen_namespaces: Set[str] = set() - # Meaningful yt-dlp fields that should become tags - # This mapping excludes technical fields: filesize, duration, format_id, vcodec, acodec, ext, etc. - field_to_namespace = { - "artist": "artist", - "album": "album", - "creator": "creator", - "uploader": "creator", # Map uploader to creator (deduplicate) - "uploader_id": "creator", - "channel": "channel", - "genre": "genre", - "track": "track", - "track_number": "track_number", - "release_date": "release_date", - "upload_date": "upload_date", - "title": "title", - "license": "license", - "location": "location", - } - - # Extract simple field mappings - for yt_field, namespace in field_to_namespace.items(): - value = entry.get(yt_field) - if value is not None: - value_str = value_normalize(str(value)) - if value_str: - # Prevent duplicate creator tags (only use first creator) - if namespace == "creator": - if "creator" in seen_namespaces: - continue - seen_namespaces.add("creator") - - _add_tag(tags, namespace, value_str) - - # Handle tags field specially (could be list, dict, or string) - # For list/sequence tags, capture as freeform (no namespace prefix) - tags_field = entry.get("tags") - if tags_field is not None: - if isinstance(tags_field, list): - # Tags is list: ["tag1", "tag2", ...] → capture as freeform tags (no "tag:" prefix) - # These are typically genre/category tags from the source (BandCamp genres, etc.) - for tag_value in tags_field: - if tag_value: - normalized = value_normalize(str(tag_value)) - if normalized and normalized not in tags: - tags.append(normalized) - elif isinstance(tags_field, dict): - # Tags is dict: {"key": "val"} → tag:key:val - for key, val in tags_field.items(): - if key and val: - key_normalized = value_normalize(str(key)) - val_normalized = value_normalize(str(val)) - if key_normalized and val_normalized: - _add_tag(tags, f"tag:{key_normalized}", val_normalized) - else: - # Tags is string or other: add as freeform - if tags_field: - normalized = value_normalize(str(tags_field)) - if normalized and normalized not in tags: - tags.append(normalized) - - return tags def dedup_tags_by_namespace(tags: List[str], keep_first: bool = True) -> List[str]: diff --git a/SYS/models.py b/SYS/models.py index 452d776..5f32cbd 100644 --- a/SYS/models.py +++ b/SYS/models.py @@ -2,6 +2,7 @@ import datetime import hashlib +import inspect import json import os import shutil @@ -996,16 +997,25 @@ class PipelineLiveProgress: def stop(self) -> None: # Safe to call whether Live is running or paused. if self._live is not None: + stop_fn = self._live.stop + has_clear = False try: - try: - self._live.stop(clear=True) - except TypeError: - self._live.stop() - except Exception: - self._live.stop() - except Exception: + signature = inspect.signature(stop_fn) + has_clear = "clear" in signature.parameters + except (ValueError, TypeError): pass + try: + if has_clear: + stop_fn(clear=True) + else: + stop_fn() + except Exception: + try: + stop_fn() + except Exception: + pass + self._live = None self._console = None self._overall = None diff --git a/SYS/pipeline.py b/SYS/pipeline.py index 890b35b..f1f80fe 100644 --- a/SYS/pipeline.py +++ b/SYS/pipeline.py @@ -2313,6 +2313,9 @@ class PipelineExecutor: if name in {"get-relationship", "get-rel"}: continue + if name in {"get-metadata", + "meta"}: + continue # `.pipe` (MPV) is an interactive launcher; disable pipeline Live progress # for it because it doesn't meaningfully "complete" (mpv may keep running) # and Live output interferes with MPV playlist UI. diff --git a/SYS/yt_metadata.py b/SYS/yt_metadata.py new file mode 100644 index 0000000..b0d5862 --- /dev/null +++ b/SYS/yt_metadata.py @@ -0,0 +1,102 @@ +import re +from typing import Any, Dict, List, Set + + +def value_normalize(value: Any) -> str: + text = str(value).strip() + return text.lower() if text else "" + + +def _add_tag(tags: List[str], namespace: str, value: str) -> None: + """Add a namespaced tag if not already present.""" + if not namespace or not value: + return + normalized_value = value_normalize(value) + if not normalized_value: + return + candidate = f"{namespace}:{normalized_value}" + if candidate not in tags: + tags.append(candidate) + + +def extract_ytdlp_tags(entry: Dict[str, Any]) -> List[str]: + """ """ + tags: List[str] = [] + seen_namespaces: Set[str] = set() + + # Meaningful yt-dlp fields that should become tags + # This mapping excludes technical fields: filesize, duration, format_id, vcodec, acodec, ext, etc. + field_to_namespace = { + "artist": "artist", + "album": "album", + "creator": "creator", + "uploader": "creator", # Map uploader to creator (deduplicate) + "uploader_id": "creator", + "channel": "channel", + "genre": "genre", + "track": "track", + "track_number": "track_number", + "release_date": "release_date", + "upload_date": "upload_date", + "title": "title", + "license": "license", + "location": "location", + } + + # Extract simple field mappings + for yt_field, namespace in field_to_namespace.items(): + value = entry.get(yt_field) + if value is not None: + value_str = value_normalize(str(value)) + if value_str: + # Prevent duplicate creator tags (only use first creator) + if namespace == "creator": + if "creator" in seen_namespaces: + continue + seen_namespaces.add("creator") + + _add_tag(tags, namespace, value_str) + + # Handle tags field specially (could be list, dict, or string) + # For list/sequence tags, capture as freeform (no namespace prefix) + tags_field = entry.get("tags") + if tags_field is not None: + if isinstance(tags_field, list): + # Tags is list: ["tag1", "tag2", ...] → capture as freeform tags (no "tag:" prefix) + # These are typically genre/category tags from the source (BandCamp genres, etc.) + for tag_value in tags_field: + if tag_value: + normalized = value_normalize(str(tag_value)) + if normalized and normalized not in tags: + tags.append(normalized) + elif isinstance(tags_field, dict): + # Tags is dict: {"key": "val"} → tag:key:val + for key, val in tags_field.items(): + if key and val: + key_normalized = value_normalize(str(key)) + val_normalized = value_normalize(str(val)) + if key_normalized and val_normalized: + _add_tag(tags, f"tag:{key_normalized}", val_normalized) + else: + # Tags is string: "tag1,tag2" → split and capture as freeform + tag_str = str(tags_field).strip() + if tag_str: + for tag_value in re.split(r'[,\s]+', tag_str): + tag_value = tag_value.strip() + if tag_value: + normalized = value_normalize(tag_value) + if normalized and normalized not in tags: + tags.append(normalized) + + # Extract chapters as tags if present + chapters = entry.get("chapters") + if chapters and isinstance(chapters, list): + for chapter in chapters: + if isinstance(chapter, dict): + title = chapter.get("title") + if title: + title_norm = value_normalize(str(title)) + if title_norm and title_norm not in tags: + tags.append(title_norm) + + return tags \ No newline at end of file diff --git a/cmdlet/get_tag.py b/cmdlet/get_tag.py index bfd8264..739b14f 100644 --- a/cmdlet/get_tag.py +++ b/cmdlet/get_tag.py @@ -358,10 +358,25 @@ def _emit_tags_as_table( # Store the table and items in history so @.. works to go back # Use overlay mode so it doesn't push the previous search to history stack # This makes get-tag behave like a transient view + table_applied = False try: ctx.set_last_result_table_overlay(table, tag_items, subject) + table_applied = True except AttributeError: - ctx.set_last_result_table(table, tag_items, subject) + try: + ctx.set_last_result_table(table, tag_items, subject) + table_applied = True + except Exception: + table_applied = False + except Exception: + table_applied = False + + if table_applied: + try: + if hasattr(ctx, "set_current_stage_table"): + ctx.set_current_stage_table(table) + except Exception: + pass # Note: CLI will handle displaying the table via ResultTable formatting @@ -776,7 +791,7 @@ def _scrape_url_metadata( import json as json_module try: - from SYS.metadata import extract_ytdlp_tags + from SYS.yt_metadata import extract_ytdlp_tags except ImportError: extract_ytdlp_tags = None @@ -1613,6 +1628,33 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: if isinstance(result, list) and len(result) > 0: result = result[0] + try: + display_subject = ctx.get_last_result_subject() + except Exception: + display_subject = None + + def _value_has_content(value: Any) -> bool: + if value is None: + return False + if isinstance(value, str): + return bool(value.strip()) + if isinstance(value, (list, tuple, set)): + return len(value) > 0 + return True + + def _resolve_subject_value(*keys: str) -> Any: + for key in keys: + val = get_field(result, key, None) + if _value_has_content(val): + return val + if display_subject is None: + return None + for key in keys: + val = get_field(display_subject, key, None) + if _value_has_content(val): + return val + return None + # If the current result already carries a tag list (e.g. a selected metadata # row from get-tag -scrape itunes), APPLY those tags to the file in the store. result_provider = get_field(result, "provider", None) @@ -1726,7 +1768,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: ) return 0 - hash_from_result = normalize_hash(get_field(result, "hash", None)) + hash_from_result = normalize_hash(_resolve_subject_value("hash")) file_hash = hash_override or hash_from_result # Only use emit mode if explicitly requested with --emit flag, not just because we're in a pipeline # This allows interactive REPL to work even in pipelines @@ -1734,7 +1776,8 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: store_label = store_key.strip() if store_key and store_key.strip() else None # Get hash and store from result - store_name = get_field(result, "store") + store_value = _resolve_subject_value("store") + store_name = str(store_value).strip() if store_value is not None else None if not file_hash: log("No hash available in result", file=sys.stderr) @@ -1744,6 +1787,68 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: log("No store specified in result", file=sys.stderr) return 1 + item_title = ( + _resolve_subject_value("title", "name", "filename") + ) + subject_store = store_name + subject_path_value = ( + _resolve_subject_value("path", "target", "filename") + ) + subject_path = None + if subject_path_value is not None: + try: + subject_path = str(subject_path_value) + except Exception: + subject_path = None + + service_name = "" + subject_payload_base: Dict[str, Any] = { + "tag": [], + "title": item_title, + "name": item_title, + "store": subject_store, + "service_name": service_name, + "extra": { + "tag": [], + }, + } + if file_hash: + subject_payload_base["hash"] = file_hash + if subject_path: + subject_payload_base["path"] = subject_path + + def _subject_payload_with( + tags: Sequence[str], + service_name_override: Optional[str] = None, + ) -> Dict[str, Any]: + payload = dict(subject_payload_base) + payload["tag"] = list(tags) + extra = {"tag": list(tags)} + payload["extra"] = extra + if service_name_override is not None: + payload["service_name"] = service_name_override + return payload + + raw_result_tags = get_field(result, "tag", None) + if not isinstance(raw_result_tags, list): + raw_result_tags = get_field(result, "tags", None) + display_tags: List[str] = [] + if isinstance(raw_result_tags, list): + display_tags = [str(t) for t in raw_result_tags if t is not None] + if display_tags and not emit_mode: + subject_payload = _subject_payload_with(display_tags) + _emit_tags_as_table( + display_tags, + file_hash=file_hash, + store=str(subject_store), + service_name=None, + config=config, + item_title=item_title, + path=subject_path, + subject=subject_payload, + ) + return 0 + # Get tags using storage backend try: from Store import Store @@ -1761,56 +1866,18 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: log(f"Failed to get tags: {exc}", file=sys.stderr) return 1 - # Always output to ResultTable (pipeline mode only) - # Extract title for table header - item_title = ( - get_field(result, - "title", - None) or get_field(result, - "name", - None) or get_field(result, - "filename", - None) + subject_payload = _subject_payload_with( + current, + service_name if source == "hydrus" else None, ) - - # Build a subject payload representing the file whose tags are being shown - subject_store = get_field(result, "store", None) or store_name - subject_path = ( - get_field(result, - "path", - None) or get_field(result, - "target", - None) or get_field(result, - "filename", - None) - ) - subject_payload: Dict[str, - Any] = { - "tag": list(current), - "title": item_title, - "name": item_title, - "store": subject_store, - "service_name": service_name, - "extra": { - "tag": list(current), - }, - } - if file_hash: - subject_payload["hash"] = file_hash - if subject_path: - try: - subject_payload["path"] = str(subject_path) - except Exception: - pass - _emit_tags_as_table( current, file_hash=file_hash, - store=subject_store, + store=str(subject_store), service_name=service_name if source == "hydrus" else None, config=config, item_title=item_title, - path=str(subject_path) if subject_path else None, + path=subject_path, subject=subject_payload, ) diff --git a/tool/ytdlp.py b/tool/ytdlp.py index 19f2d4a..6186fd8 100644 --- a/tool/ytdlp.py +++ b/tool/ytdlp.py @@ -27,7 +27,7 @@ from SYS.models import ( ) from SYS.pipeline_progress import PipelineProgress from SYS.utils import ensure_directory, sha256_file -from SYS.metadata import extract_ytdlp_tags +from SYS.yt_metadata import extract_ytdlp_tags _YTDLP_TRANSFER_STATE: Dict[str, Dict[str, Any]] = {} @@ -44,6 +44,96 @@ else: _EXTRACTOR_CACHE: List[Any] | None = None +# Patterns for domain extraction from yt-dlp regexes +# 1) Alternation group followed by \.tld e.g. (?:youtube|youtu|youtube-nocookie)\.com +ALT_GROUP_TLD = re.compile(r'\((?:\?:)?([^\)]+)\)\\\.(?P[A-Za-z0-9.+-]+)') +# 2) Literal domain pieces like youtube\.com or youtu\.be (not preceded by a group) +LITERAL_DOMAIN = re.compile(r'(? List[str]: + if not valid_url: + return [] + if isinstance(valid_url, str): + return [valid_url] + if isinstance(valid_url, (list, tuple)): + return [p for p in valid_url if isinstance(p, str)] + return [] + + +def extract_from_pattern(pat: str) -> set[str]: + domains = set() + + # 1) Alternation groups followed by .tld + for alt_group, tld in ALT_GROUP_TLD.findall(pat): + # alt_group like "youtube|youtu|youtube-nocookie" + for alt in alt_group.split('|'): + alt = alt.strip() + # remove any non-domain tokens like (?:www\.)? if present inside alt (rare) + alt = re.sub(r'\(\?:www\\\.\)\?', '', alt) + if alt: + domains.add(f"{alt}.{tld}".lower()) + + # 2) Literal domain matches (youtube\.com) + for name, tld in LITERAL_DOMAIN.findall(pat): + domains.add(f"{name}.{tld}".lower()) + + # 3) Partial tokens fallback (only if we didn't already capture domains) + # This helps when regexes contain plain tokens like 'zhihu' or 'vimeo' without .com + if not domains: + for token in PARTIAL_TOKEN.findall(pat): + # ignore common regex words that are not domains + if len(token) <= 2: + continue + # avoid tokens that are clearly regex constructs + if token.lower() in {"https", "http", "www", "com", "net", "org"}: + continue + domains.add(f"{token.lower()}.com") + + return domains + + +def extract_domains(valid_url) -> set[str]: + patterns = normalize_patterns(valid_url) + all_domains = set() + for pat in patterns: + all_domains |= extract_from_pattern(pat) + # final cleanup: remove obvious junk like 'com.com' if present + cleaned = set() + for d in all_domains: + # drop duplicates where left side equals tld (e.g., com.com) + parts = d.split('.') + if len(parts) >= 2 and parts[-2] == parts[-1]: + continue + cleaned.add(d) + return cleaned + + +def _build_supported_domains() -> set[str]: + global _SUPPORTED_DOMAINS + if _SUPPORTED_DOMAINS is not None: + return _SUPPORTED_DOMAINS + + _SUPPORTED_DOMAINS = set() + if gen_extractors is None: + return _SUPPORTED_DOMAINS + + try: + for e in gen_extractors(): + name = getattr(e, "IE_NAME", "").lower() + if name == "generic": + continue + regex = getattr(e, "_VALID_URL", None) + domains = extract_domains(regex) + _SUPPORTED_DOMAINS.update(domains) + except Exception: + pass + return _SUPPORTED_DOMAINS + def _get_nested(config: Dict[str, Any], *path: str) -> Any: cur: Any = config @@ -122,17 +212,14 @@ def is_url_supported_by_ytdlp(url: str) -> bool: return False try: - for extractor in _get_extractors(): - try: - if not extractor.suitable(url): - continue - except Exception: - continue - - name = getattr(extractor, "IE_NAME", "").lower() - if name == "generic": - continue - return True + parsed = urlparse(url) + domain = parsed.netloc.lower() + if not domain: + return False + supported = _build_supported_domains() + for base in supported: + if domain == base or domain.endswith("." + base): + return True except Exception: return False @@ -593,16 +680,22 @@ class YtDlpTool: # Defaulting to 'chrome' as the most common path. base_options["cookiesfrombrowser"] = "chrome" + # Special handling for format keywords + if opts.ytdl_format == "audio": + opts = opts._replace(mode="audio", ytdl_format=None) + elif opts.ytdl_format == "video": + opts = opts._replace(mode="video", ytdl_format=None) + if opts.no_playlist: base_options["noplaylist"] = True fmt = opts.ytdl_format or self.default_format(opts.mode) base_options["format"] = fmt - # if opts.mode == "audio": - # base_options["postprocessors"] = [{ - # "key": "FFmpegExtractAudio" - # }] + if opts.mode == "audio": + base_options["postprocessors"] = [{ + "key": "FFmpegExtractAudio" + }] if opts.mode != "audio": format_sort = self.defaults.format_sort or [