This commit is contained in:
2026-01-20 16:42:49 -08:00
parent 1e2054189b
commit 922b649e17
9 changed files with 351 additions and 141 deletions

View File

@@ -25,7 +25,7 @@ from SYS.models import DebugLogger, DownloadError, DownloadMediaResult, Progress
from SYS.utils import ensure_directory, sha256_file from SYS.utils import ensure_directory, sha256_file
try: # Optional; used for metadata extraction when available try: # Optional; used for metadata extraction when available
from SYS.metadata import extract_ytdlp_tags from SYS.yt_metadata import extract_ytdlp_tags
except Exception: # pragma: no cover - optional dependency except Exception: # pragma: no cover - optional dependency
extract_ytdlp_tags = None # type: ignore[assignment] extract_ytdlp_tags = None # type: ignore[assignment]
@@ -56,7 +56,7 @@ def _resolve_verify_value(verify_ssl: bool) -> Union[bool, str]:
mod = sys.modules.get(mod_name) mod = sys.modules.get(mod_name)
if mod is None: if mod is None:
mod = __import__(mod_name) mod = __import__(mod_name)
except Exception: except (ImportError, ModuleNotFoundError):
return None return None
# Common APIs that return a bundle path # Common APIs that return a bundle path

View File

@@ -353,7 +353,7 @@
"filedot\\.(xyz|to|top)/([0-9a-zA-Z]{12})" "filedot\\.(xyz|to|top)/([0-9a-zA-Z]{12})"
], ],
"regexp": "filedot\\.(xyz|to|top)/([0-9a-zA-Z]{12})", "regexp": "filedot\\.(xyz|to|top)/([0-9a-zA-Z]{12})",
"status": false "status": true
}, },
"filefactory": { "filefactory": {
"name": "filefactory", "name": "filefactory",
@@ -622,7 +622,7 @@
"(simfileshare\\.net/download/[0-9]+/)" "(simfileshare\\.net/download/[0-9]+/)"
], ],
"regexp": "(simfileshare\\.net/download/[0-9]+/)", "regexp": "(simfileshare\\.net/download/[0-9]+/)",
"status": false "status": true
}, },
"streamtape": { "streamtape": {
"name": "streamtape", "name": "streamtape",

View File

@@ -852,7 +852,7 @@ class YtdlpMetadataProvider(MetadataProvider):
tags: List[str] = [] tags: List[str] = []
try: try:
from SYS.metadata import extract_ytdlp_tags from SYS.yt_metadata import extract_ytdlp_tags
except Exception: except Exception:
extract_ytdlp_tags = None # type: ignore[assignment] extract_ytdlp_tags = None # type: ignore[assignment]

View File

@@ -7,7 +7,7 @@ from SYS.logger import log, debug
from pathlib import Path from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple
from API.HydrusNetwork import apply_hydrus_tag_mutation, fetch_hydrus_metadata, fetch_hydrus_metadata_by_url from SYS.yt_metadata import extract_ytdlp_tags
try: # Optional; used when available for richer metadata fetches try: # Optional; used when available for richer metadata fetches
import yt_dlp import yt_dlp
@@ -918,72 +918,7 @@ def apply_tag_mutation(payload: Dict[str,
} }
def extract_ytdlp_tags(entry: Dict[str, Any]) -> List[str]:
""" """
tags: List[str] = []
seen_namespaces: Set[str] = set()
# Meaningful yt-dlp fields that should become tags
# This mapping excludes technical fields: filesize, duration, format_id, vcodec, acodec, ext, etc.
field_to_namespace = {
"artist": "artist",
"album": "album",
"creator": "creator",
"uploader": "creator", # Map uploader to creator (deduplicate)
"uploader_id": "creator",
"channel": "channel",
"genre": "genre",
"track": "track",
"track_number": "track_number",
"release_date": "release_date",
"upload_date": "upload_date",
"title": "title",
"license": "license",
"location": "location",
}
# Extract simple field mappings
for yt_field, namespace in field_to_namespace.items():
value = entry.get(yt_field)
if value is not None:
value_str = value_normalize(str(value))
if value_str:
# Prevent duplicate creator tags (only use first creator)
if namespace == "creator":
if "creator" in seen_namespaces:
continue
seen_namespaces.add("creator")
_add_tag(tags, namespace, value_str)
# Handle tags field specially (could be list, dict, or string)
# For list/sequence tags, capture as freeform (no namespace prefix)
tags_field = entry.get("tags")
if tags_field is not None:
if isinstance(tags_field, list):
# Tags is list: ["tag1", "tag2", ...] → capture as freeform tags (no "tag:" prefix)
# These are typically genre/category tags from the source (BandCamp genres, etc.)
for tag_value in tags_field:
if tag_value:
normalized = value_normalize(str(tag_value))
if normalized and normalized not in tags:
tags.append(normalized)
elif isinstance(tags_field, dict):
# Tags is dict: {"key": "val"} → tag:key:val
for key, val in tags_field.items():
if key and val:
key_normalized = value_normalize(str(key))
val_normalized = value_normalize(str(val))
if key_normalized and val_normalized:
_add_tag(tags, f"tag:{key_normalized}", val_normalized)
else:
# Tags is string or other: add as freeform
if tags_field:
normalized = value_normalize(str(tags_field))
if normalized and normalized not in tags:
tags.append(normalized)
return tags
def dedup_tags_by_namespace(tags: List[str], keep_first: bool = True) -> List[str]: def dedup_tags_by_namespace(tags: List[str], keep_first: bool = True) -> List[str]:

View File

@@ -2,6 +2,7 @@
import datetime import datetime
import hashlib import hashlib
import inspect
import json import json
import os import os
import shutil import shutil
@@ -996,16 +997,25 @@ class PipelineLiveProgress:
def stop(self) -> None: def stop(self) -> None:
# Safe to call whether Live is running or paused. # Safe to call whether Live is running or paused.
if self._live is not None: if self._live is not None:
stop_fn = self._live.stop
has_clear = False
try: try:
try: signature = inspect.signature(stop_fn)
self._live.stop(clear=True) has_clear = "clear" in signature.parameters
except TypeError: except (ValueError, TypeError):
self._live.stop()
except Exception:
self._live.stop()
except Exception:
pass pass
try:
if has_clear:
stop_fn(clear=True)
else:
stop_fn()
except Exception:
try:
stop_fn()
except Exception:
pass
self._live = None self._live = None
self._console = None self._console = None
self._overall = None self._overall = None

View File

@@ -2313,6 +2313,9 @@ class PipelineExecutor:
if name in {"get-relationship", if name in {"get-relationship",
"get-rel"}: "get-rel"}:
continue continue
if name in {"get-metadata",
"meta"}:
continue
# `.pipe` (MPV) is an interactive launcher; disable pipeline Live progress # `.pipe` (MPV) is an interactive launcher; disable pipeline Live progress
# for it because it doesn't meaningfully "complete" (mpv may keep running) # for it because it doesn't meaningfully "complete" (mpv may keep running)
# and Live output interferes with MPV playlist UI. # and Live output interferes with MPV playlist UI.

102
SYS/yt_metadata.py Normal file
View File

@@ -0,0 +1,102 @@
import re
from typing import Any, Dict, List, Set
def value_normalize(value: Any) -> str:
    """Normalize any value to a lowercase, whitespace-stripped string.

    Returns the empty string when the stringified value is blank.
    """
    stripped = str(value).strip()
    if not stripped:
        return ""
    return stripped.lower()
def _add_tag(tags: List[str], namespace: str, value: str) -> None:
"""Add a namespaced tag if not already present."""
if not namespace or not value:
return
normalized_value = value_normalize(value)
if not normalized_value:
return
candidate = f"{namespace}:{normalized_value}"
if candidate not in tags:
tags.append(candidate)
def extract_ytdlp_tags(entry: Dict[str, Any]) -> List[str]:
    """Convert a yt-dlp metadata entry into a flat list of tag strings.

    Meaningful descriptive fields become namespaced ``namespace:value``
    tags, the free-form ``tags`` field (list, dict, or delimited string)
    becomes un-namespaced tags, and chapter titles are added as freeform
    tags. Technical fields (filesize, duration, codecs, ...) are ignored.

    Args:
        entry: A yt-dlp info dict for a single media entry.

    Returns:
        Ordered, de-duplicated list of normalized (lowercase) tag strings.
    """
    tags: List[str] = []
    seen_namespaces: Set[str] = set()
    # Meaningful yt-dlp fields that should become tags.
    # This mapping excludes technical fields: filesize, duration,
    # format_id, vcodec, acodec, ext, etc.
    field_to_namespace = {
        "artist": "artist",
        "album": "album",
        "creator": "creator",
        "uploader": "creator",  # Map uploader to creator (deduplicate)
        "uploader_id": "creator",
        "channel": "channel",
        "genre": "genre",
        "track": "track",
        "track_number": "track_number",
        "release_date": "release_date",
        "upload_date": "upload_date",
        "title": "title",
        "license": "license",
        "location": "location",
    }
    # Extract simple field mappings.
    for yt_field, namespace in field_to_namespace.items():
        value = entry.get(yt_field)
        if value is not None:
            value_str = value_normalize(value)
            if value_str:
                # Only the first creator-mapped field wins; uploader and
                # uploader_id would otherwise duplicate the creator tag.
                if namespace == "creator":
                    if "creator" in seen_namespaces:
                        continue
                    seen_namespaces.add("creator")
                _add_tag(tags, namespace, value_str)
    # The "tags" field may be a list, a dict, or a plain string.
    tags_field = entry.get("tags")
    if tags_field is not None:
        if isinstance(tags_field, list):
            # List: ["tag1", "tag2", ...] → freeform tags (no "tag:" prefix).
            # These are typically genre/category tags from the source
            # (BandCamp genres, etc.).
            for tag_value in tags_field:
                if tag_value:
                    normalized = value_normalize(tag_value)
                    if normalized and normalized not in tags:
                        tags.append(normalized)
        elif isinstance(tags_field, dict):
            # Dict: {"key": "val"} → "tag:key:val".
            for key, val in tags_field.items():
                if key and val:
                    key_normalized = value_normalize(key)
                    val_normalized = value_normalize(val)
                    if key_normalized and val_normalized:
                        _add_tag(tags, f"tag:{key_normalized}", val_normalized)
        else:
            # String: "tag1,tag2" → split on commas/whitespace, freeform.
            tag_str = str(tags_field).strip()
            if tag_str:
                for tag_value in re.split(r'[,\s]+', tag_str):
                    tag_value = tag_value.strip()
                    if tag_value:
                        normalized = value_normalize(tag_value)
                        if normalized and normalized not in tags:
                            tags.append(normalized)
    # Chapter titles become freeform tags as well.
    chapters = entry.get("chapters")
    if chapters and isinstance(chapters, list):
        for chapter in chapters:
            if isinstance(chapter, dict):
                title = chapter.get("title")
                if title:
                    title_norm = value_normalize(title)
                    if title_norm and title_norm not in tags:
                        tags.append(title_norm)
    return tags

View File

@@ -358,10 +358,25 @@ def _emit_tags_as_table(
# Store the table and items in history so @.. works to go back # Store the table and items in history so @.. works to go back
# Use overlay mode so it doesn't push the previous search to history stack # Use overlay mode so it doesn't push the previous search to history stack
# This makes get-tag behave like a transient view # This makes get-tag behave like a transient view
table_applied = False
try: try:
ctx.set_last_result_table_overlay(table, tag_items, subject) ctx.set_last_result_table_overlay(table, tag_items, subject)
table_applied = True
except AttributeError: except AttributeError:
ctx.set_last_result_table(table, tag_items, subject) try:
ctx.set_last_result_table(table, tag_items, subject)
table_applied = True
except Exception:
table_applied = False
except Exception:
table_applied = False
if table_applied:
try:
if hasattr(ctx, "set_current_stage_table"):
ctx.set_current_stage_table(table)
except Exception:
pass
# Note: CLI will handle displaying the table via ResultTable formatting # Note: CLI will handle displaying the table via ResultTable formatting
@@ -776,7 +791,7 @@ def _scrape_url_metadata(
import json as json_module import json as json_module
try: try:
from SYS.metadata import extract_ytdlp_tags from SYS.yt_metadata import extract_ytdlp_tags
except ImportError: except ImportError:
extract_ytdlp_tags = None extract_ytdlp_tags = None
@@ -1613,6 +1628,33 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
if isinstance(result, list) and len(result) > 0: if isinstance(result, list) and len(result) > 0:
result = result[0] result = result[0]
# Best-effort lookup of the subject of the last displayed result; it
# supplies fallback field values when the piped result lacks them.
try:
    display_subject = ctx.get_last_result_subject()
except Exception:
    display_subject = None
def _value_has_content(value: Any) -> bool:
if value is None:
return False
if isinstance(value, str):
return bool(value.strip())
if isinstance(value, (list, tuple, set)):
return len(value) > 0
return True
def _resolve_subject_value(*keys: str) -> Any:
    """Return the first contentful value for *keys*, checking the piped
    result first and then the displayed subject (when available)."""
    if display_subject is None:
        sources = (result,)
    else:
        sources = (result, display_subject)
    for source_obj in sources:
        for key in keys:
            candidate = get_field(source_obj, key, None)
            if _value_has_content(candidate):
                return candidate
    return None
# If the current result already carries a tag list (e.g. a selected metadata # If the current result already carries a tag list (e.g. a selected metadata
# row from get-tag -scrape itunes), APPLY those tags to the file in the store. # row from get-tag -scrape itunes), APPLY those tags to the file in the store.
result_provider = get_field(result, "provider", None) result_provider = get_field(result, "provider", None)
@@ -1726,7 +1768,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
) )
return 0 return 0
hash_from_result = normalize_hash(get_field(result, "hash", None)) hash_from_result = normalize_hash(_resolve_subject_value("hash"))
file_hash = hash_override or hash_from_result file_hash = hash_override or hash_from_result
# Only use emit mode if explicitly requested with --emit flag, not just because we're in a pipeline # Only use emit mode if explicitly requested with --emit flag, not just because we're in a pipeline
# This allows interactive REPL to work even in pipelines # This allows interactive REPL to work even in pipelines
@@ -1734,7 +1776,8 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
store_label = store_key.strip() if store_key and store_key.strip() else None store_label = store_key.strip() if store_key and store_key.strip() else None
# Get hash and store from result # Get hash and store from result
store_name = get_field(result, "store") store_value = _resolve_subject_value("store")
store_name = str(store_value).strip() if store_value is not None else None
if not file_hash: if not file_hash:
log("No hash available in result", file=sys.stderr) log("No hash available in result", file=sys.stderr)
@@ -1744,6 +1787,68 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
log("No store specified in result", file=sys.stderr) log("No store specified in result", file=sys.stderr)
return 1 return 1
# Human-readable title for table headers, falling back across common keys.
item_title = (
    _resolve_subject_value("title", "name", "filename")
)
subject_store = store_name
# Path-like locator of the subject, when any of these keys is populated.
subject_path_value = (
    _resolve_subject_value("path", "target", "filename")
)
subject_path = None
if subject_path_value is not None:
    try:
        subject_path = str(subject_path_value)
    except Exception:
        # Unstringifiable path-like object: treat the path as unknown.
        subject_path = None
service_name = ""
# Template payload describing the file whose tags are shown; the "tag"
# and "extra.tag" slots are filled per-use further below.
subject_payload_base: Dict[str, Any] = {
    "tag": [],
    "title": item_title,
    "name": item_title,
    "store": subject_store,
    "service_name": service_name,
    "extra": {
        "tag": [],
    },
}
if file_hash:
    subject_payload_base["hash"] = file_hash
if subject_path:
    subject_payload_base["path"] = subject_path
def _subject_payload_with(
    tags: Sequence[str],
    service_name_override: Optional[str] = None,
) -> Dict[str, Any]:
    """Copy the base subject payload, installing *tags* (and, optionally,
    an overriding service name)."""
    payload = dict(subject_payload_base)
    payload["tag"] = list(tags)
    payload["extra"] = {"tag": list(tags)}
    if service_name_override is not None:
        payload["service_name"] = service_name_override
    return payload
# Prefer tags already attached to the piped result ("tag", then "tags");
# when present, render them directly instead of querying the store.
raw_result_tags = get_field(result, "tag", None)
if not isinstance(raw_result_tags, list):
    raw_result_tags = get_field(result, "tags", None)
display_tags: List[str] = []
if isinstance(raw_result_tags, list):
    display_tags = [str(t) for t in raw_result_tags if t is not None]
if display_tags and not emit_mode:
    subject_payload = _subject_payload_with(display_tags)
    _emit_tags_as_table(
        display_tags,
        file_hash=file_hash,
        store=str(subject_store),
        service_name=None,
        config=config,
        item_title=item_title,
        path=subject_path,
        subject=subject_payload,
    )
    return 0
# Get tags using storage backend # Get tags using storage backend
try: try:
from Store import Store from Store import Store
@@ -1761,56 +1866,18 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
log(f"Failed to get tags: {exc}", file=sys.stderr) log(f"Failed to get tags: {exc}", file=sys.stderr)
return 1 return 1
# Always output to ResultTable (pipeline mode only) subject_payload = _subject_payload_with(
# Extract title for table header current,
item_title = ( service_name if source == "hydrus" else None,
get_field(result,
"title",
None) or get_field(result,
"name",
None) or get_field(result,
"filename",
None)
) )
# Build a subject payload representing the file whose tags are being shown
subject_store = get_field(result, "store", None) or store_name
subject_path = (
get_field(result,
"path",
None) or get_field(result,
"target",
None) or get_field(result,
"filename",
None)
)
subject_payload: Dict[str,
Any] = {
"tag": list(current),
"title": item_title,
"name": item_title,
"store": subject_store,
"service_name": service_name,
"extra": {
"tag": list(current),
},
}
if file_hash:
subject_payload["hash"] = file_hash
if subject_path:
try:
subject_payload["path"] = str(subject_path)
except Exception:
pass
_emit_tags_as_table( _emit_tags_as_table(
current, current,
file_hash=file_hash, file_hash=file_hash,
store=subject_store, store=str(subject_store),
service_name=service_name if source == "hydrus" else None, service_name=service_name if source == "hydrus" else None,
config=config, config=config,
item_title=item_title, item_title=item_title,
path=str(subject_path) if subject_path else None, path=subject_path,
subject=subject_payload, subject=subject_payload,
) )

View File

@@ -27,7 +27,7 @@ from SYS.models import (
) )
from SYS.pipeline_progress import PipelineProgress from SYS.pipeline_progress import PipelineProgress
from SYS.utils import ensure_directory, sha256_file from SYS.utils import ensure_directory, sha256_file
from SYS.metadata import extract_ytdlp_tags from SYS.yt_metadata import extract_ytdlp_tags
_YTDLP_TRANSFER_STATE: Dict[str, Dict[str, Any]] = {} _YTDLP_TRANSFER_STATE: Dict[str, Dict[str, Any]] = {}
@@ -44,6 +44,96 @@ else:
_EXTRACTOR_CACHE: List[Any] | None = None _EXTRACTOR_CACHE: List[Any] | None = None
# Patterns for domain extraction from yt-dlp regexes
# 1) Alternation group followed by \.tld e.g. (?:youtube|youtu|youtube-nocookie)\.com
ALT_GROUP_TLD = re.compile(r'\((?:\?:)?([^\)]+)\)\\\.(?P<tld>[A-Za-z0-9.+-]+)')
# 2) Literal domain pieces like youtube\.com or youtu\.be (not preceded by a group)
LITERAL_DOMAIN = re.compile(r'(?<!\()(?<!\|)(?<!:)([A-Za-z0-9][A-Za-z0-9_-]{0,})\\\.([A-Za-z0-9.+-]+)')
# 3) Partial domain tokens that appear alone (e.g., zhihu) — treat as zhihu.com fallback
PARTIAL_TOKEN = re.compile(r'(?<![A-Za-z0-9_-])([A-Za-z0-9][A-Za-z0-9_-]{1,})(?=(?:\\?[/\)\$]|\\\.|$))')
# Lazily-built cache of every domain the installed yt-dlp claims to
# support; None means "not built yet" (see _build_supported_domains).
_SUPPORTED_DOMAINS: set[str] | None = None
def normalize_patterns(valid_url) -> List[str]:
    """Coerce a yt-dlp ``_VALID_URL`` attribute into a list of pattern strings.

    Extractors expose ``_VALID_URL`` as a string, a sequence of strings, or
    nothing at all; any non-string element (or wholly unexpected type) is
    silently dropped.

    Args:
        valid_url: The raw ``_VALID_URL`` value (str, list/tuple, or None).

    Returns:
        A list of regex pattern strings (possibly empty).
    """
    if not valid_url:
        return []
    if isinstance(valid_url, str):
        return [valid_url]
    if isinstance(valid_url, (list, tuple)):
        return [p for p in valid_url if isinstance(p, str)]
    # Unknown container/type: nothing usable.
    return []
def extract_from_pattern(pat: str) -> set[str]:
    """Pull candidate domain names out of one yt-dlp ``_VALID_URL`` regex."""
    found: set[str] = set()

    # 1) Alternation groups followed by an escaped TLD,
    #    e.g. (?:youtube|youtu|youtube-nocookie)\.com
    for alt_group, tld in ALT_GROUP_TLD.findall(pat):
        for piece in alt_group.split('|'):
            # Strip stray (?:www\.)? fragments that can appear inside the
            # alternation (rare).
            piece = re.sub(r'\(\?:www\\\.\)\?', '', piece.strip())
            if piece:
                found.add(f"{piece}.{tld}".lower())

    # 2) Literal escaped domains, e.g. youtube\.com or youtu\.be.
    for host, tld in LITERAL_DOMAIN.findall(pat):
        found.add(f"{host}.{tld}".lower())

    # 3) Fallback: bare tokens (e.g. 'zhihu') treated as <token>.com, but
    #    only when nothing else matched, skipping obvious regex vocabulary.
    if not found:
        noise = {"https", "http", "www", "com", "net", "org"}
        for token in PARTIAL_TOKEN.findall(pat):
            if len(token) > 2 and token.lower() not in noise:
                found.add(f"{token.lower()}.com")
    return found
def extract_domains(valid_url) -> set[str]:
    """Aggregate candidate domains from every pattern in *valid_url*.

    Degenerate entries whose last two labels repeat (e.g. ``com.com``)
    are filtered out before returning.
    """
    merged: set[str] = set()
    for pattern in normalize_patterns(valid_url):
        merged |= extract_from_pattern(pattern)
    cleaned: set[str] = set()
    for domain in merged:
        labels = domain.split('.')
        if len(labels) >= 2 and labels[-2] == labels[-1]:
            continue
        cleaned.add(domain)
    return cleaned
def _build_supported_domains() -> set[str]:
    """Build (once) and cache the set of domains yt-dlp extractors support.

    Populates the module-level ``_SUPPORTED_DOMAINS`` cache incrementally,
    so a mid-iteration failure still leaves the partially built set cached.
    Returns an empty set when ``gen_extractors`` is unavailable.
    """
    global _SUPPORTED_DOMAINS
    # Fast path: already built (possibly empty) — reuse the cached set.
    if _SUPPORTED_DOMAINS is not None:
        return _SUPPORTED_DOMAINS
    _SUPPORTED_DOMAINS = set()
    if gen_extractors is None:
        return _SUPPORTED_DOMAINS
    try:
        for e in gen_extractors():
            name = getattr(e, "IE_NAME", "").lower()
            # The "generic" extractor matches nearly any URL; skip it so it
            # doesn't pollute the domain set.
            if name == "generic":
                continue
            regex = getattr(e, "_VALID_URL", None)
            domains = extract_domains(regex)
            _SUPPORTED_DOMAINS.update(domains)
    except Exception:
        # Best-effort: keep whatever was collected before the failure.
        pass
    return _SUPPORTED_DOMAINS
def _get_nested(config: Dict[str, Any], *path: str) -> Any: def _get_nested(config: Dict[str, Any], *path: str) -> Any:
cur: Any = config cur: Any = config
@@ -122,17 +212,14 @@ def is_url_supported_by_ytdlp(url: str) -> bool:
return False return False
try: try:
for extractor in _get_extractors(): parsed = urlparse(url)
try: domain = parsed.netloc.lower()
if not extractor.suitable(url): if not domain:
continue return False
except Exception: supported = _build_supported_domains()
continue for base in supported:
if domain == base or domain.endswith("." + base):
name = getattr(extractor, "IE_NAME", "").lower() return True
if name == "generic":
continue
return True
except Exception: except Exception:
return False return False
@@ -593,16 +680,22 @@ class YtDlpTool:
# Defaulting to 'chrome' as the most common path. # Defaulting to 'chrome' as the most common path.
base_options["cookiesfrombrowser"] = "chrome" base_options["cookiesfrombrowser"] = "chrome"
# Special handling for format keywords
if opts.ytdl_format == "audio":
opts = opts._replace(mode="audio", ytdl_format=None)
elif opts.ytdl_format == "video":
opts = opts._replace(mode="video", ytdl_format=None)
if opts.no_playlist: if opts.no_playlist:
base_options["noplaylist"] = True base_options["noplaylist"] = True
fmt = opts.ytdl_format or self.default_format(opts.mode) fmt = opts.ytdl_format or self.default_format(opts.mode)
base_options["format"] = fmt base_options["format"] = fmt
# if opts.mode == "audio": if opts.mode == "audio":
# base_options["postprocessors"] = [{ base_options["postprocessors"] = [{
# "key": "FFmpegExtractAudio" "key": "FFmpegExtractAudio"
# }] }]
if opts.mode != "audio": if opts.mode != "audio":
format_sort = self.defaults.format_sort or [ format_sort = self.defaults.format_sort or [