This commit is contained in:
2026-01-20 16:42:49 -08:00
parent 1e2054189b
commit 922b649e17
9 changed files with 351 additions and 141 deletions

View File

@@ -25,7 +25,7 @@ from SYS.models import DebugLogger, DownloadError, DownloadMediaResult, Progress
from SYS.utils import ensure_directory, sha256_file
try: # Optional; used for metadata extraction when available
from SYS.metadata import extract_ytdlp_tags
from SYS.yt_metadata import extract_ytdlp_tags
except Exception: # pragma: no cover - optional dependency
extract_ytdlp_tags = None # type: ignore[assignment]
@@ -56,7 +56,7 @@ def _resolve_verify_value(verify_ssl: bool) -> Union[bool, str]:
mod = sys.modules.get(mod_name)
if mod is None:
mod = __import__(mod_name)
except Exception:
except (ImportError, ModuleNotFoundError):
return None
# Common APIs that return a bundle path

View File

@@ -353,7 +353,7 @@
"filedot\\.(xyz|to|top)/([0-9a-zA-Z]{12})"
],
"regexp": "filedot\\.(xyz|to|top)/([0-9a-zA-Z]{12})",
"status": false
"status": true
},
"filefactory": {
"name": "filefactory",
@@ -622,7 +622,7 @@
"(simfileshare\\.net/download/[0-9]+/)"
],
"regexp": "(simfileshare\\.net/download/[0-9]+/)",
"status": false
"status": true
},
"streamtape": {
"name": "streamtape",

View File

@@ -852,7 +852,7 @@ class YtdlpMetadataProvider(MetadataProvider):
tags: List[str] = []
try:
from SYS.metadata import extract_ytdlp_tags
from SYS.yt_metadata import extract_ytdlp_tags
except Exception:
extract_ytdlp_tags = None # type: ignore[assignment]

View File

@@ -7,7 +7,7 @@ from SYS.logger import log, debug
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple
from API.HydrusNetwork import apply_hydrus_tag_mutation, fetch_hydrus_metadata, fetch_hydrus_metadata_by_url
from SYS.yt_metadata import extract_ytdlp_tags
try: # Optional; used when available for richer metadata fetches
import yt_dlp
@@ -918,72 +918,7 @@ def apply_tag_mutation(payload: Dict[str,
}
def extract_ytdlp_tags(entry: Dict[str, Any]) -> List[str]:
    """Convert a yt-dlp info dict into a flat list of searchable tags.

    Descriptive metadata fields are emitted as ``namespace:value`` pairs,
    and the ``tags`` field is flattened: list items become freeform tags,
    dict entries become ``tag:key:value``, anything else is added as one
    freeform tag.  Relies on the module-level ``value_normalize`` and
    ``_add_tag`` helpers for normalization and de-duplication.

    :param entry: a yt-dlp info dict (one playlist entry or a single video).
    :returns: ordered list of normalized tag strings, without duplicates.
    """
    tags: List[str] = []
    seen_namespaces: Set[str] = set()
    # Meaningful yt-dlp fields that should become tags.
    # This mapping excludes technical fields: filesize, duration,
    # format_id, vcodec, acodec, ext, etc.
    field_to_namespace = {
        "artist": "artist",
        "album": "album",
        "creator": "creator",
        "uploader": "creator",  # Map uploader to creator (deduplicate)
        "uploader_id": "creator",
        "channel": "channel",
        "genre": "genre",
        "track": "track",
        "track_number": "track_number",
        "release_date": "release_date",
        "upload_date": "upload_date",
        "title": "title",
        "license": "license",
        "location": "location",
    }
    # Extract simple field mappings.
    for yt_field, namespace in field_to_namespace.items():
        value = entry.get(yt_field)
        if value is not None:
            value_str = value_normalize(str(value))
            if value_str:
                # Prevent duplicate creator tags (only the first of
                # creator/uploader/uploader_id that yields a value wins).
                if namespace == "creator":
                    if "creator" in seen_namespaces:
                        continue
                    seen_namespaces.add("creator")
                _add_tag(tags, namespace, value_str)
    # Handle the "tags" field specially (could be list, dict, or string).
    # For list/sequence tags, capture as freeform (no namespace prefix).
    tags_field = entry.get("tags")
    if tags_field is not None:
        if isinstance(tags_field, list):
            # Tags is list: ["tag1", "tag2", ...] -> capture as freeform
            # tags (no "tag:" prefix).  These are typically genre/category
            # tags from the source (BandCamp genres, etc.).
            for tag_value in tags_field:
                if tag_value:
                    normalized = value_normalize(str(tag_value))
                    if normalized and normalized not in tags:
                        tags.append(normalized)
        elif isinstance(tags_field, dict):
            # Tags is dict: {"key": "val"} -> tag:key:val
            for key, val in tags_field.items():
                if key and val:
                    key_normalized = value_normalize(str(key))
                    val_normalized = value_normalize(str(val))
                    if key_normalized and val_normalized:
                        _add_tag(tags, f"tag:{key_normalized}", val_normalized)
        else:
            # Tags is string or other: add as a single freeform tag
            # (note: no splitting on delimiters in this version).
            if tags_field:
                normalized = value_normalize(str(tags_field))
                if normalized and normalized not in tags:
                    tags.append(normalized)
    return tags
def dedup_tags_by_namespace(tags: List[str], keep_first: bool = True) -> List[str]:

View File

@@ -2,6 +2,7 @@
import datetime
import hashlib
import inspect
import json
import os
import shutil
@@ -996,13 +997,22 @@ class PipelineLiveProgress:
def stop(self) -> None:
# Safe to call whether Live is running or paused.
if self._live is not None:
stop_fn = self._live.stop
has_clear = False
try:
signature = inspect.signature(stop_fn)
has_clear = "clear" in signature.parameters
except (ValueError, TypeError):
pass
try:
self._live.stop(clear=True)
except TypeError:
self._live.stop()
if has_clear:
stop_fn(clear=True)
else:
stop_fn()
except Exception:
self._live.stop()
try:
stop_fn()
except Exception:
pass

View File

@@ -2313,6 +2313,9 @@ class PipelineExecutor:
if name in {"get-relationship",
"get-rel"}:
continue
if name in {"get-metadata",
"meta"}:
continue
# `.pipe` (MPV) is an interactive launcher; disable pipeline Live progress
# for it because it doesn't meaningfully "complete" (mpv may keep running)
# and Live output interferes with MPV playlist UI.

102
SYS/yt_metadata.py Normal file
View File

@@ -0,0 +1,102 @@
import re
from typing import Any, Dict, List, Set
def value_normalize(value: Any) -> str:
    """Normalize an arbitrary value to a stripped, lowercase string.

    Returns ``""`` when nothing remains after stripping whitespace.
    """
    stripped = str(value).strip()
    if not stripped:
        return ""
    return stripped.lower()
def _add_tag(tags: List[str], namespace: str, value: str) -> None:
"""Add a namespaced tag if not already present."""
if not namespace or not value:
return
normalized_value = value_normalize(value)
if not normalized_value:
return
candidate = f"{namespace}:{normalized_value}"
if candidate not in tags:
tags.append(candidate)
def extract_ytdlp_tags(entry: Dict[str, Any]) -> List[str]:
    """Convert a yt-dlp info dict into a flat list of namespaced tags.

    Descriptive metadata fields (artist, album, title, ...) become
    ``namespace:value`` tags; the several uploader-ish fields collapse
    onto a single ``creator`` tag (first one wins).  The ``tags`` field
    is flattened: list items become freeform tags, dict entries become
    ``tag:key:value``, and a plain string is split on commas/whitespace.
    Chapter titles are appended as freeform tags.  Technical fields
    (filesize, duration, codecs, format ids, ...) are ignored.
    """
    out: List[str] = []

    def _norm(raw: Any) -> str:
        # Stripped, lowercased text; "" when nothing remains.
        text = str(raw).strip()
        return text.lower() if text else ""

    def _add_freeform(raw: Any) -> None:
        token = _norm(raw)
        if token and token not in out:
            out.append(token)

    def _add_namespaced(namespace: str, raw: str) -> None:
        if not namespace or not raw:
            return
        token = _norm(raw)
        if not token:
            return
        candidate = f"{namespace}:{token}"
        if candidate not in out:
            out.append(candidate)

    # Descriptive yt-dlp fields worth keeping, in emission order.
    field_map = (
        ("artist", "artist"),
        ("album", "album"),
        ("creator", "creator"),
        ("uploader", "creator"),      # uploader-ish fields share "creator"
        ("uploader_id", "creator"),
        ("channel", "channel"),
        ("genre", "genre"),
        ("track", "track"),
        ("track_number", "track_number"),
        ("release_date", "release_date"),
        ("upload_date", "upload_date"),
        ("title", "title"),
        ("license", "license"),
        ("location", "location"),
    )
    creator_taken = False
    for field, namespace in field_map:
        raw = entry.get(field)
        if raw is None:
            continue
        normalized = _norm(raw)
        if not normalized:
            continue
        if namespace == "creator":
            # Only the first creator-producing field contributes a tag.
            if creator_taken:
                continue
            creator_taken = True
        _add_namespaced(namespace, normalized)

    # "tags" may be a list (freeform), a dict (tag:key:value) or a
    # delimited string (split on commas/whitespace into freeform tags).
    tags_field = entry.get("tags")
    if tags_field is not None:
        if isinstance(tags_field, list):
            for item in tags_field:
                if item:
                    _add_freeform(item)
        elif isinstance(tags_field, dict):
            for key, val in tags_field.items():
                if key and val:
                    key_norm = _norm(key)
                    val_norm = _norm(val)
                    if key_norm and val_norm:
                        _add_namespaced(f"tag:{key_norm}", val_norm)
        else:
            blob = str(tags_field).strip()
            if blob:
                for piece in re.split(r'[,\s]+', blob):
                    if piece.strip():
                        _add_freeform(piece)

    # Chapter titles become freeform tags as well.
    chapters = entry.get("chapters")
    if chapters and isinstance(chapters, list):
        for chapter in chapters:
            if isinstance(chapter, dict) and chapter.get("title"):
                _add_freeform(chapter["title"])
    return out

View File

@@ -358,10 +358,25 @@ def _emit_tags_as_table(
# Store the table and items in history so @.. works to go back
# Use overlay mode so it doesn't push the previous search to history stack
# This makes get-tag behave like a transient view
table_applied = False
try:
ctx.set_last_result_table_overlay(table, tag_items, subject)
table_applied = True
except AttributeError:
try:
ctx.set_last_result_table(table, tag_items, subject)
table_applied = True
except Exception:
table_applied = False
except Exception:
table_applied = False
if table_applied:
try:
if hasattr(ctx, "set_current_stage_table"):
ctx.set_current_stage_table(table)
except Exception:
pass
# Note: CLI will handle displaying the table via ResultTable formatting
@@ -776,7 +791,7 @@ def _scrape_url_metadata(
import json as json_module
try:
from SYS.metadata import extract_ytdlp_tags
from SYS.yt_metadata import extract_ytdlp_tags
except ImportError:
extract_ytdlp_tags = None
@@ -1613,6 +1628,33 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
if isinstance(result, list) and len(result) > 0:
result = result[0]
try:
display_subject = ctx.get_last_result_subject()
except Exception:
display_subject = None
def _value_has_content(value: Any) -> bool:
if value is None:
return False
if isinstance(value, str):
return bool(value.strip())
if isinstance(value, (list, tuple, set)):
return len(value) > 0
return True
def _resolve_subject_value(*keys: str) -> Any:
    """Return the first non-empty value found for any of *keys*.

    Checks the current ``result`` first, then falls back to the
    ``display_subject`` captured from history (both closure state of the
    enclosing function).  "Non-empty" is decided by ``_value_has_content``.
    Returns ``None`` when no key yields content in either source.
    """
    for key in keys:
        val = get_field(result, key, None)
        if _value_has_content(val):
            return val
    if display_subject is None:
        return None
    # Fallback: look the same keys up on the previously displayed subject.
    for key in keys:
        val = get_field(display_subject, key, None)
        if _value_has_content(val):
            return val
    return None
# If the current result already carries a tag list (e.g. a selected metadata
# row from get-tag -scrape itunes), APPLY those tags to the file in the store.
result_provider = get_field(result, "provider", None)
@@ -1726,7 +1768,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
)
return 0
hash_from_result = normalize_hash(get_field(result, "hash", None))
hash_from_result = normalize_hash(_resolve_subject_value("hash"))
file_hash = hash_override or hash_from_result
# Only use emit mode if explicitly requested with --emit flag, not just because we're in a pipeline
# This allows interactive REPL to work even in pipelines
@@ -1734,7 +1776,8 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
store_label = store_key.strip() if store_key and store_key.strip() else None
# Get hash and store from result
store_name = get_field(result, "store")
store_value = _resolve_subject_value("store")
store_name = str(store_value).strip() if store_value is not None else None
if not file_hash:
log("No hash available in result", file=sys.stderr)
@@ -1744,6 +1787,68 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
log("No store specified in result", file=sys.stderr)
return 1
item_title = (
_resolve_subject_value("title", "name", "filename")
)
subject_store = store_name
subject_path_value = (
_resolve_subject_value("path", "target", "filename")
)
subject_path = None
if subject_path_value is not None:
try:
subject_path = str(subject_path_value)
except Exception:
subject_path = None
service_name = ""
subject_payload_base: Dict[str, Any] = {
"tag": [],
"title": item_title,
"name": item_title,
"store": subject_store,
"service_name": service_name,
"extra": {
"tag": [],
},
}
if file_hash:
subject_payload_base["hash"] = file_hash
if subject_path:
subject_payload_base["path"] = subject_path
def _subject_payload_with(
    tags: Sequence[str],
    service_name_override: Optional[str] = None,
) -> Dict[str, Any]:
    """Clone the base subject payload with *tags* filled in.

    The tag list is mirrored under both ``payload["tag"]`` and
    ``payload["extra"]["tag"]``.  ``service_name_override``, when given,
    replaces the default ``service_name`` from the base payload.
    """
    # Shallow copy is fine: the nested "extra" dict is replaced below,
    # so the base payload's "extra" is never mutated.
    payload = dict(subject_payload_base)
    payload["tag"] = list(tags)
    extra = {"tag": list(tags)}
    payload["extra"] = extra
    if service_name_override is not None:
        payload["service_name"] = service_name_override
    return payload
raw_result_tags = get_field(result, "tag", None)
if not isinstance(raw_result_tags, list):
raw_result_tags = get_field(result, "tags", None)
display_tags: List[str] = []
if isinstance(raw_result_tags, list):
display_tags = [str(t) for t in raw_result_tags if t is not None]
if display_tags and not emit_mode:
subject_payload = _subject_payload_with(display_tags)
_emit_tags_as_table(
display_tags,
file_hash=file_hash,
store=str(subject_store),
service_name=None,
config=config,
item_title=item_title,
path=subject_path,
subject=subject_payload,
)
return 0
# Get tags using storage backend
try:
from Store import Store
@@ -1761,56 +1866,18 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
log(f"Failed to get tags: {exc}", file=sys.stderr)
return 1
# Always output to ResultTable (pipeline mode only)
# Extract title for table header
item_title = (
get_field(result,
"title",
None) or get_field(result,
"name",
None) or get_field(result,
"filename",
None)
subject_payload = _subject_payload_with(
current,
service_name if source == "hydrus" else None,
)
# Build a subject payload representing the file whose tags are being shown
subject_store = get_field(result, "store", None) or store_name
subject_path = (
get_field(result,
"path",
None) or get_field(result,
"target",
None) or get_field(result,
"filename",
None)
)
subject_payload: Dict[str,
Any] = {
"tag": list(current),
"title": item_title,
"name": item_title,
"store": subject_store,
"service_name": service_name,
"extra": {
"tag": list(current),
},
}
if file_hash:
subject_payload["hash"] = file_hash
if subject_path:
try:
subject_payload["path"] = str(subject_path)
except Exception:
pass
_emit_tags_as_table(
current,
file_hash=file_hash,
store=subject_store,
store=str(subject_store),
service_name=service_name if source == "hydrus" else None,
config=config,
item_title=item_title,
path=str(subject_path) if subject_path else None,
path=subject_path,
subject=subject_payload,
)

View File

@@ -27,7 +27,7 @@ from SYS.models import (
)
from SYS.pipeline_progress import PipelineProgress
from SYS.utils import ensure_directory, sha256_file
from SYS.metadata import extract_ytdlp_tags
from SYS.yt_metadata import extract_ytdlp_tags
_YTDLP_TRANSFER_STATE: Dict[str, Dict[str, Any]] = {}
@@ -44,6 +44,96 @@ else:
_EXTRACTOR_CACHE: List[Any] | None = None
# Patterns for domain extraction from yt-dlp regexes
# 1) Alternation group followed by \.tld e.g. (?:youtube|youtu|youtube-nocookie)\.com
ALT_GROUP_TLD = re.compile(r'\((?:\?:)?([^\)]+)\)\\\.(?P<tld>[A-Za-z0-9.+-]+)')
# 2) Literal domain pieces like youtube\.com or youtu\.be (not preceded by a group)
LITERAL_DOMAIN = re.compile(r'(?<!\()(?<!\|)(?<!:)([A-Za-z0-9][A-Za-z0-9_-]{0,})\\\.([A-Za-z0-9.+-]+)')
# 3) Partial domain tokens that appear alone (e.g., zhihu) — treat as zhihu.com fallback
PARTIAL_TOKEN = re.compile(r'(?<![A-Za-z0-9_-])([A-Za-z0-9][A-Za-z0-9_-]{1,})(?=(?:\\?[/\)\$]|\\\.|$))')
_SUPPORTED_DOMAINS: set[str] | None = None
def normalize_patterns(valid_url) -> List[str]:
    """Coerce an extractor ``_VALID_URL`` value into a list of patterns.

    Accepts a single pattern string, a list/tuple of patterns (non-string
    members are dropped), or anything falsy/unsupported, which yields an
    empty list.
    """
    if not valid_url:
        return []
    if isinstance(valid_url, str):
        return [valid_url]
    if not isinstance(valid_url, (list, tuple)):
        return []
    return [pattern for pattern in valid_url if isinstance(pattern, str)]
def extract_from_pattern(pat: str) -> set[str]:
    """Heuristically pull domain names out of one yt-dlp ``_VALID_URL`` regex.

    Applies the module-level patterns in order: alternation groups before a
    TLD (``(?:youtube|youtu)\\.com``), literal escaped domains
    (``youtube\\.com``), and — only when the first two find nothing — bare
    tokens that are assumed to be ``<token>.com``.  Results are lowercased.
    """
    domains = set()
    # 1) Alternation groups followed by .tld
    for alt_group, tld in ALT_GROUP_TLD.findall(pat):
        # alt_group like "youtube|youtu|youtube-nocookie"
        for alt in alt_group.split('|'):
            alt = alt.strip()
            # remove any non-domain tokens like (?:www\.)? if present inside alt (rare)
            alt = re.sub(r'\(\?:www\\\.\)\?', '', alt)
            if alt:
                domains.add(f"{alt}.{tld}".lower())
    # 2) Literal domain matches (youtube\.com)
    for name, tld in LITERAL_DOMAIN.findall(pat):
        domains.add(f"{name}.{tld}".lower())
    # 3) Partial-token fallback (only if we didn't already capture domains).
    # This helps when regexes contain plain tokens like 'zhihu' or 'vimeo'
    # without an explicit .com — the .com suffix here is a guess.
    if not domains:
        for token in PARTIAL_TOKEN.findall(pat):
            # ignore common regex words that are not domains
            if len(token) <= 2:
                continue
            # avoid tokens that are clearly regex/URL scaffolding
            if token.lower() in {"https", "http", "www", "com", "net", "org"}:
                continue
            domains.add(f"{token.lower()}.com")
    return domains
def extract_domains(valid_url) -> set[str]:
    """Collect every candidate domain from an extractor's URL pattern(s).

    Normalizes ``valid_url`` into a pattern list, harvests domains from
    each pattern via ``extract_from_pattern``, then drops degenerate
    results such as ``com.com`` where the label before the TLD repeats
    the TLD itself.
    """
    harvested: set[str] = set()
    for pattern in normalize_patterns(valid_url):
        harvested.update(extract_from_pattern(pattern))

    def _is_junk(domain: str) -> bool:
        # e.g. "com.com": label immediately before the TLD equals the TLD.
        labels = domain.split('.')
        return len(labels) >= 2 and labels[-2] == labels[-1]

    return {domain for domain in harvested if not _is_junk(domain)}
def _build_supported_domains() -> set[str]:
    """Build (once) and return the set of domains yt-dlp extractors support.

    The result is cached in the module-level ``_SUPPORTED_DOMAINS``;
    subsequent calls return the cached set.  When ``gen_extractors`` is
    unavailable (yt-dlp not importable), an empty set is cached and
    returned.  The "generic" extractor is excluded because it matches
    nearly any URL.  Enumeration is best-effort: any failure leaves the
    set partially filled rather than raising.
    """
    global _SUPPORTED_DOMAINS
    if _SUPPORTED_DOMAINS is not None:
        return _SUPPORTED_DOMAINS
    _SUPPORTED_DOMAINS = set()
    if gen_extractors is None:
        return _SUPPORTED_DOMAINS
    try:
        for e in gen_extractors():
            name = getattr(e, "IE_NAME", "").lower()
            if name == "generic":
                continue
            # _VALID_URL may be a string, a sequence, or absent;
            # extract_domains handles all of these.
            regex = getattr(e, "_VALID_URL", None)
            domains = extract_domains(regex)
            _SUPPORTED_DOMAINS.update(domains)
    except Exception:
        # Deliberate best-effort: keep whatever was collected so far.
        pass
    return _SUPPORTED_DOMAINS
def _get_nested(config: Dict[str, Any], *path: str) -> Any:
cur: Any = config
@@ -122,16 +212,13 @@ def is_url_supported_by_ytdlp(url: str) -> bool:
return False
try:
for extractor in _get_extractors():
try:
if not extractor.suitable(url):
continue
except Exception:
continue
name = getattr(extractor, "IE_NAME", "").lower()
if name == "generic":
continue
parsed = urlparse(url)
domain = parsed.netloc.lower()
if not domain:
return False
supported = _build_supported_domains()
for base in supported:
if domain == base or domain.endswith("." + base):
return True
except Exception:
return False
@@ -593,16 +680,22 @@ class YtDlpTool:
# Defaulting to 'chrome' as the most common path.
base_options["cookiesfrombrowser"] = "chrome"
# Special handling for format keywords
if opts.ytdl_format == "audio":
opts = opts._replace(mode="audio", ytdl_format=None)
elif opts.ytdl_format == "video":
opts = opts._replace(mode="video", ytdl_format=None)
if opts.no_playlist:
base_options["noplaylist"] = True
fmt = opts.ytdl_format or self.default_format(opts.mode)
base_options["format"] = fmt
# if opts.mode == "audio":
# base_options["postprocessors"] = [{
# "key": "FFmpegExtractAudio"
# }]
if opts.mode == "audio":
base_options["postprocessors"] = [{
"key": "FFmpegExtractAudio"
}]
if opts.mode != "audio":
format_sort = self.defaults.format_sort or [