This commit is contained in:
2026-01-20 16:42:49 -08:00
parent 1e2054189b
commit 922b649e17
9 changed files with 351 additions and 141 deletions

View File

@@ -25,7 +25,7 @@ from SYS.models import DebugLogger, DownloadError, DownloadMediaResult, Progress
from SYS.utils import ensure_directory, sha256_file from SYS.utils import ensure_directory, sha256_file
try: # Optional; used for metadata extraction when available try: # Optional; used for metadata extraction when available
from SYS.metadata import extract_ytdlp_tags from SYS.yt_metadata import extract_ytdlp_tags
except Exception: # pragma: no cover - optional dependency except Exception: # pragma: no cover - optional dependency
extract_ytdlp_tags = None # type: ignore[assignment] extract_ytdlp_tags = None # type: ignore[assignment]
@@ -56,7 +56,7 @@ def _resolve_verify_value(verify_ssl: bool) -> Union[bool, str]:
mod = sys.modules.get(mod_name) mod = sys.modules.get(mod_name)
if mod is None: if mod is None:
mod = __import__(mod_name) mod = __import__(mod_name)
except Exception: except (ImportError, ModuleNotFoundError):
return None return None
# Common APIs that return a bundle path # Common APIs that return a bundle path

View File

@@ -353,7 +353,7 @@
"filedot\\.(xyz|to|top)/([0-9a-zA-Z]{12})" "filedot\\.(xyz|to|top)/([0-9a-zA-Z]{12})"
], ],
"regexp": "filedot\\.(xyz|to|top)/([0-9a-zA-Z]{12})", "regexp": "filedot\\.(xyz|to|top)/([0-9a-zA-Z]{12})",
"status": false "status": true
}, },
"filefactory": { "filefactory": {
"name": "filefactory", "name": "filefactory",
@@ -622,7 +622,7 @@
"(simfileshare\\.net/download/[0-9]+/)" "(simfileshare\\.net/download/[0-9]+/)"
], ],
"regexp": "(simfileshare\\.net/download/[0-9]+/)", "regexp": "(simfileshare\\.net/download/[0-9]+/)",
"status": false "status": true
}, },
"streamtape": { "streamtape": {
"name": "streamtape", "name": "streamtape",

View File

@@ -852,7 +852,7 @@ class YtdlpMetadataProvider(MetadataProvider):
tags: List[str] = [] tags: List[str] = []
try: try:
from SYS.metadata import extract_ytdlp_tags from SYS.yt_metadata import extract_ytdlp_tags
except Exception: except Exception:
extract_ytdlp_tags = None # type: ignore[assignment] extract_ytdlp_tags = None # type: ignore[assignment]

View File

@@ -7,7 +7,7 @@ from SYS.logger import log, debug
from pathlib import Path from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple
from API.HydrusNetwork import apply_hydrus_tag_mutation, fetch_hydrus_metadata, fetch_hydrus_metadata_by_url from SYS.yt_metadata import extract_ytdlp_tags
try: # Optional; used when available for richer metadata fetches try: # Optional; used when available for richer metadata fetches
import yt_dlp import yt_dlp
@@ -918,72 +918,7 @@ def apply_tag_mutation(payload: Dict[str,
} }
def extract_ytdlp_tags(entry: Dict[str, Any]) -> List[str]:
""" """
tags: List[str] = []
seen_namespaces: Set[str] = set()
# Meaningful yt-dlp fields that should become tags
# This mapping excludes technical fields: filesize, duration, format_id, vcodec, acodec, ext, etc.
field_to_namespace = {
"artist": "artist",
"album": "album",
"creator": "creator",
"uploader": "creator", # Map uploader to creator (deduplicate)
"uploader_id": "creator",
"channel": "channel",
"genre": "genre",
"track": "track",
"track_number": "track_number",
"release_date": "release_date",
"upload_date": "upload_date",
"title": "title",
"license": "license",
"location": "location",
}
# Extract simple field mappings
for yt_field, namespace in field_to_namespace.items():
value = entry.get(yt_field)
if value is not None:
value_str = value_normalize(str(value))
if value_str:
# Prevent duplicate creator tags (only use first creator)
if namespace == "creator":
if "creator" in seen_namespaces:
continue
seen_namespaces.add("creator")
_add_tag(tags, namespace, value_str)
# Handle tags field specially (could be list, dict, or string)
# For list/sequence tags, capture as freeform (no namespace prefix)
tags_field = entry.get("tags")
if tags_field is not None:
if isinstance(tags_field, list):
# Tags is list: ["tag1", "tag2", ...] → capture as freeform tags (no "tag:" prefix)
# These are typically genre/category tags from the source (BandCamp genres, etc.)
for tag_value in tags_field:
if tag_value:
normalized = value_normalize(str(tag_value))
if normalized and normalized not in tags:
tags.append(normalized)
elif isinstance(tags_field, dict):
# Tags is dict: {"key": "val"} → tag:key:val
for key, val in tags_field.items():
if key and val:
key_normalized = value_normalize(str(key))
val_normalized = value_normalize(str(val))
if key_normalized and val_normalized:
_add_tag(tags, f"tag:{key_normalized}", val_normalized)
else:
# Tags is string or other: add as freeform
if tags_field:
normalized = value_normalize(str(tags_field))
if normalized and normalized not in tags:
tags.append(normalized)
return tags
def dedup_tags_by_namespace(tags: List[str], keep_first: bool = True) -> List[str]: def dedup_tags_by_namespace(tags: List[str], keep_first: bool = True) -> List[str]:

View File

@@ -2,6 +2,7 @@
import datetime import datetime
import hashlib import hashlib
import inspect
import json import json
import os import os
import shutil import shutil
@@ -996,16 +997,25 @@ class PipelineLiveProgress:
def stop(self) -> None: def stop(self) -> None:
# Safe to call whether Live is running or paused. # Safe to call whether Live is running or paused.
if self._live is not None: if self._live is not None:
stop_fn = self._live.stop
has_clear = False
try: try:
try: signature = inspect.signature(stop_fn)
self._live.stop(clear=True) has_clear = "clear" in signature.parameters
except TypeError: except (ValueError, TypeError):
self._live.stop()
except Exception:
self._live.stop()
except Exception:
pass pass
try:
if has_clear:
stop_fn(clear=True)
else:
stop_fn()
except Exception:
try:
stop_fn()
except Exception:
pass
self._live = None self._live = None
self._console = None self._console = None
self._overall = None self._overall = None

View File

@@ -2313,6 +2313,9 @@ class PipelineExecutor:
if name in {"get-relationship", if name in {"get-relationship",
"get-rel"}: "get-rel"}:
continue continue
if name in {"get-metadata",
"meta"}:
continue
# `.pipe` (MPV) is an interactive launcher; disable pipeline Live progress # `.pipe` (MPV) is an interactive launcher; disable pipeline Live progress
# for it because it doesn't meaningfully "complete" (mpv may keep running) # for it because it doesn't meaningfully "complete" (mpv may keep running)
# and Live output interferes with MPV playlist UI. # and Live output interferes with MPV playlist UI.

102
SYS/yt_metadata.py Normal file
View File

@@ -0,0 +1,102 @@
import re
from typing import Any, Dict, List, Set
def value_normalize(value: Any) -> str:
    """Normalize any value to a lowercase, whitespace-stripped string.

    Returns the empty string when the stringified value is blank.
    """
    stripped = str(value).strip()
    if not stripped:
        return ""
    return stripped.lower()
def _add_tag(tags: List[str], namespace: str, value: str) -> None:
"""Add a namespaced tag if not already present."""
if not namespace or not value:
return
normalized_value = value_normalize(value)
if not normalized_value:
return
candidate = f"{namespace}:{normalized_value}"
if candidate not in tags:
tags.append(candidate)
def extract_ytdlp_tags(entry: Dict[str, Any]) -> List[str]:
    """Convert a yt-dlp metadata entry into a flat list of tag strings.

    Meaningful descriptive fields become namespaced ``namespace:value``
    tags, the free-form ``tags`` field (list, dict, or delimited string)
    becomes un-namespaced tags, and chapter titles are added as freeform
    tags. Technical fields (filesize, duration, codecs, ...) are ignored.

    Args:
        entry: A yt-dlp info dict for a single media entry.

    Returns:
        Ordered, de-duplicated list of normalized (lowercase) tag strings.
    """
    tags: List[str] = []
    seen_namespaces: Set[str] = set()
    # Meaningful yt-dlp fields that should become tags.
    # This mapping excludes technical fields: filesize, duration,
    # format_id, vcodec, acodec, ext, etc.
    field_to_namespace = {
        "artist": "artist",
        "album": "album",
        "creator": "creator",
        "uploader": "creator",  # Map uploader to creator (deduplicate)
        "uploader_id": "creator",
        "channel": "channel",
        "genre": "genre",
        "track": "track",
        "track_number": "track_number",
        "release_date": "release_date",
        "upload_date": "upload_date",
        "title": "title",
        "license": "license",
        "location": "location",
    }
    # Extract simple field mappings.
    for yt_field, namespace in field_to_namespace.items():
        value = entry.get(yt_field)
        if value is not None:
            value_str = value_normalize(value)
            if value_str:
                # Only the first creator-mapped field wins; uploader and
                # uploader_id would otherwise duplicate the creator tag.
                if namespace == "creator":
                    if "creator" in seen_namespaces:
                        continue
                    seen_namespaces.add("creator")
                _add_tag(tags, namespace, value_str)
    # The "tags" field may be a list, a dict, or a plain string.
    tags_field = entry.get("tags")
    if tags_field is not None:
        if isinstance(tags_field, list):
            # List: ["tag1", "tag2", ...] → freeform tags (no "tag:" prefix).
            # These are typically genre/category tags from the source
            # (BandCamp genres, etc.).
            for tag_value in tags_field:
                if tag_value:
                    normalized = value_normalize(tag_value)
                    if normalized and normalized not in tags:
                        tags.append(normalized)
        elif isinstance(tags_field, dict):
            # Dict: {"key": "val"} → "tag:key:val".
            for key, val in tags_field.items():
                if key and val:
                    key_normalized = value_normalize(key)
                    val_normalized = value_normalize(val)
                    if key_normalized and val_normalized:
                        _add_tag(tags, f"tag:{key_normalized}", val_normalized)
        else:
            # String: "tag1,tag2" → split on commas/whitespace, freeform.
            tag_str = str(tags_field).strip()
            if tag_str:
                for tag_value in re.split(r'[,\s]+', tag_str):
                    tag_value = tag_value.strip()
                    if tag_value:
                        normalized = value_normalize(tag_value)
                        if normalized and normalized not in tags:
                            tags.append(normalized)
    # Chapter titles become freeform tags as well.
    chapters = entry.get("chapters")
    if chapters and isinstance(chapters, list):
        for chapter in chapters:
            if isinstance(chapter, dict):
                title = chapter.get("title")
                if title:
                    title_norm = value_normalize(title)
                    if title_norm and title_norm not in tags:
                        tags.append(title_norm)
    return tags

View File

@@ -358,10 +358,25 @@ def _emit_tags_as_table(
# Store the table and items in history so @.. works to go back # Store the table and items in history so @.. works to go back
# Use overlay mode so it doesn't push the previous search to history stack # Use overlay mode so it doesn't push the previous search to history stack
# This makes get-tag behave like a transient view # This makes get-tag behave like a transient view
table_applied = False
try: try:
ctx.set_last_result_table_overlay(table, tag_items, subject) ctx.set_last_result_table_overlay(table, tag_items, subject)
table_applied = True
except AttributeError: except AttributeError:
ctx.set_last_result_table(table, tag_items, subject) try:
ctx.set_last_result_table(table, tag_items, subject)
table_applied = True
except Exception:
table_applied = False
except Exception:
table_applied = False
if table_applied:
try:
if hasattr(ctx, "set_current_stage_table"):
ctx.set_current_stage_table(table)
except Exception:
pass
# Note: CLI will handle displaying the table via ResultTable formatting # Note: CLI will handle displaying the table via ResultTable formatting
@@ -776,7 +791,7 @@ def _scrape_url_metadata(
import json as json_module import json as json_module
try: try:
from SYS.metadata import extract_ytdlp_tags from SYS.yt_metadata import extract_ytdlp_tags
except ImportError: except ImportError:
extract_ytdlp_tags = None extract_ytdlp_tags = None
@@ -1613,6 +1628,33 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
if isinstance(result, list) and len(result) > 0: if isinstance(result, list) and len(result) > 0:
result = result[0] result = result[0]
# Best-effort lookup of the subject of the last displayed result; it
# supplies fallback field values when the piped result lacks them.
try:
    display_subject = ctx.get_last_result_subject()
except Exception:
    display_subject = None
def _value_has_content(value: Any) -> bool:
if value is None:
return False
if isinstance(value, str):
return bool(value.strip())
if isinstance(value, (list, tuple, set)):
return len(value) > 0
return True
def _resolve_subject_value(*keys: str) -> Any:
    """Return the first contentful value for *keys*, checking the piped
    result first and then the displayed subject (when available)."""
    if display_subject is None:
        sources = (result,)
    else:
        sources = (result, display_subject)
    for source_obj in sources:
        for key in keys:
            candidate = get_field(source_obj, key, None)
            if _value_has_content(candidate):
                return candidate
    return None
# If the current result already carries a tag list (e.g. a selected metadata # If the current result already carries a tag list (e.g. a selected metadata
# row from get-tag -scrape itunes), APPLY those tags to the file in the store. # row from get-tag -scrape itunes), APPLY those tags to the file in the store.
result_provider = get_field(result, "provider", None) result_provider = get_field(result, "provider", None)
@@ -1726,7 +1768,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
) )
return 0 return 0
hash_from_result = normalize_hash(get_field(result, "hash", None)) hash_from_result = normalize_hash(_resolve_subject_value("hash"))
file_hash = hash_override or hash_from_result file_hash = hash_override or hash_from_result
# Only use emit mode if explicitly requested with --emit flag, not just because we're in a pipeline # Only use emit mode if explicitly requested with --emit flag, not just because we're in a pipeline
# This allows interactive REPL to work even in pipelines # This allows interactive REPL to work even in pipelines
@@ -1734,7 +1776,8 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
store_label = store_key.strip() if store_key and store_key.strip() else None store_label = store_key.strip() if store_key and store_key.strip() else None
# Get hash and store from result # Get hash and store from result
store_name = get_field(result, "store") store_value = _resolve_subject_value("store")
store_name = str(store_value).strip() if store_value is not None else None
if not file_hash: if not file_hash:
log("No hash available in result", file=sys.stderr) log("No hash available in result", file=sys.stderr)
@@ -1744,6 +1787,68 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
log("No store specified in result", file=sys.stderr) log("No store specified in result", file=sys.stderr)
return 1 return 1
# Human-readable title for table headers, falling back across common keys.
item_title = (
    _resolve_subject_value("title", "name", "filename")
)
subject_store = store_name
# Path-like locator of the subject, when any of these keys is populated.
subject_path_value = (
    _resolve_subject_value("path", "target", "filename")
)
subject_path = None
if subject_path_value is not None:
    try:
        subject_path = str(subject_path_value)
    except Exception:
        # Unstringifiable path-like object: treat the path as unknown.
        subject_path = None
service_name = ""
# Template payload describing the file whose tags are shown; the "tag"
# and "extra.tag" slots are filled per-use further below.
subject_payload_base: Dict[str, Any] = {
    "tag": [],
    "title": item_title,
    "name": item_title,
    "store": subject_store,
    "service_name": service_name,
    "extra": {
        "tag": [],
    },
}
if file_hash:
    subject_payload_base["hash"] = file_hash
if subject_path:
    subject_payload_base["path"] = subject_path
def _subject_payload_with(
    tags: Sequence[str],
    service_name_override: Optional[str] = None,
) -> Dict[str, Any]:
    """Copy the base subject payload, installing *tags* (and, optionally,
    an overriding service name)."""
    payload = dict(subject_payload_base)
    payload["tag"] = list(tags)
    payload["extra"] = {"tag": list(tags)}
    if service_name_override is not None:
        payload["service_name"] = service_name_override
    return payload
# Prefer tags already attached to the piped result ("tag", then "tags");
# when present, render them directly instead of querying the store.
raw_result_tags = get_field(result, "tag", None)
if not isinstance(raw_result_tags, list):
    raw_result_tags = get_field(result, "tags", None)
display_tags: List[str] = []
if isinstance(raw_result_tags, list):
    display_tags = [str(t) for t in raw_result_tags if t is not None]
if display_tags and not emit_mode:
    subject_payload = _subject_payload_with(display_tags)
    _emit_tags_as_table(
        display_tags,
        file_hash=file_hash,
        store=str(subject_store),
        service_name=None,
        config=config,
        item_title=item_title,
        path=subject_path,
        subject=subject_payload,
    )
    return 0
# Get tags using storage backend # Get tags using storage backend
try: try:
from Store import Store from Store import Store
@@ -1761,56 +1866,18 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
log(f"Failed to get tags: {exc}", file=sys.stderr) log(f"Failed to get tags: {exc}", file=sys.stderr)
return 1 return 1
# Always output to ResultTable (pipeline mode only) subject_payload = _subject_payload_with(
# Extract title for table header current,
item_title = ( service_name if source == "hydrus" else None,
get_field(result,
"title",
None) or get_field(result,
"name",
None) or get_field(result,
"filename",
None)
) )
# Build a subject payload representing the file whose tags are being shown
subject_store = get_field(result, "store", None) or store_name
subject_path = (
get_field(result,
"path",
None) or get_field(result,
"target",
None) or get_field(result,
"filename",
None)
)
subject_payload: Dict[str,
Any] = {
"tag": list(current),
"title": item_title,
"name": item_title,
"store": subject_store,
"service_name": service_name,
"extra": {
"tag": list(current),
},
}
if file_hash:
subject_payload["hash"] = file_hash
if subject_path:
try:
subject_payload["path"] = str(subject_path)
except Exception:
pass
_emit_tags_as_table( _emit_tags_as_table(
current, current,
file_hash=file_hash, file_hash=file_hash,
store=subject_store, store=str(subject_store),
service_name=service_name if source == "hydrus" else None, service_name=service_name if source == "hydrus" else None,
config=config, config=config,
item_title=item_title, item_title=item_title,
path=str(subject_path) if subject_path else None, path=subject_path,
subject=subject_payload, subject=subject_payload,
) )

View File

@@ -27,7 +27,7 @@ from SYS.models import (
) )
from SYS.pipeline_progress import PipelineProgress from SYS.pipeline_progress import PipelineProgress
from SYS.utils import ensure_directory, sha256_file from SYS.utils import ensure_directory, sha256_file
from SYS.metadata import extract_ytdlp_tags from SYS.yt_metadata import extract_ytdlp_tags
_YTDLP_TRANSFER_STATE: Dict[str, Dict[str, Any]] = {} _YTDLP_TRANSFER_STATE: Dict[str, Dict[str, Any]] = {}
@@ -44,6 +44,96 @@ else:
_EXTRACTOR_CACHE: List[Any] | None = None _EXTRACTOR_CACHE: List[Any] | None = None
# Patterns for domain extraction from yt-dlp regexes
# 1) Alternation group followed by \.tld e.g. (?:youtube|youtu|youtube-nocookie)\.com
ALT_GROUP_TLD = re.compile(r'\((?:\?:)?([^\)]+)\)\\\.(?P<tld>[A-Za-z0-9.+-]+)')
# 2) Literal domain pieces like youtube\.com or youtu\.be (not preceded by a group)
LITERAL_DOMAIN = re.compile(r'(?<!\()(?<!\|)(?<!:)([A-Za-z0-9][A-Za-z0-9_-]{0,})\\\.([A-Za-z0-9.+-]+)')
# 3) Partial domain tokens that appear alone (e.g., zhihu) — treat as zhihu.com fallback
PARTIAL_TOKEN = re.compile(r'(?<![A-Za-z0-9_-])([A-Za-z0-9][A-Za-z0-9_-]{1,})(?=(?:\\?[/\)\$]|\\\.|$))')
# Lazily-built cache of every domain the installed yt-dlp claims to
# support; None means "not built yet" (see _build_supported_domains).
_SUPPORTED_DOMAINS: set[str] | None = None
def normalize_patterns(valid_url) -> List[str]:
    """Coerce a yt-dlp ``_VALID_URL`` attribute into a list of pattern strings.

    Extractors expose ``_VALID_URL`` as a string, a sequence of strings, or
    nothing at all; any non-string element (or wholly unexpected type) is
    silently dropped.

    Args:
        valid_url: The raw ``_VALID_URL`` value (str, list/tuple, or None).

    Returns:
        A list of regex pattern strings (possibly empty).
    """
    if not valid_url:
        return []
    if isinstance(valid_url, str):
        return [valid_url]
    if isinstance(valid_url, (list, tuple)):
        return [p for p in valid_url if isinstance(p, str)]
    # Unknown container/type: nothing usable.
    return []
def extract_from_pattern(pat: str) -> set[str]:
    """Pull candidate domain names out of one yt-dlp ``_VALID_URL`` regex."""
    found: set[str] = set()

    # 1) Alternation groups followed by an escaped TLD,
    #    e.g. (?:youtube|youtu|youtube-nocookie)\.com
    for alt_group, tld in ALT_GROUP_TLD.findall(pat):
        for piece in alt_group.split('|'):
            # Strip stray (?:www\.)? fragments that can appear inside the
            # alternation (rare).
            piece = re.sub(r'\(\?:www\\\.\)\?', '', piece.strip())
            if piece:
                found.add(f"{piece}.{tld}".lower())

    # 2) Literal escaped domains, e.g. youtube\.com or youtu\.be.
    for host, tld in LITERAL_DOMAIN.findall(pat):
        found.add(f"{host}.{tld}".lower())

    # 3) Fallback: bare tokens (e.g. 'zhihu') treated as <token>.com, but
    #    only when nothing else matched, skipping obvious regex vocabulary.
    if not found:
        noise = {"https", "http", "www", "com", "net", "org"}
        for token in PARTIAL_TOKEN.findall(pat):
            if len(token) > 2 and token.lower() not in noise:
                found.add(f"{token.lower()}.com")
    return found
def extract_domains(valid_url) -> set[str]:
    """Aggregate candidate domains from every pattern in *valid_url*.

    Degenerate entries whose last two labels repeat (e.g. ``com.com``)
    are filtered out before returning.
    """
    merged: set[str] = set()
    for pattern in normalize_patterns(valid_url):
        merged |= extract_from_pattern(pattern)
    cleaned: set[str] = set()
    for domain in merged:
        labels = domain.split('.')
        if len(labels) >= 2 and labels[-2] == labels[-1]:
            continue
        cleaned.add(domain)
    return cleaned
def _build_supported_domains() -> set[str]:
    """Build (once) and cache the set of domains yt-dlp extractors support.

    Populates the module-level ``_SUPPORTED_DOMAINS`` cache incrementally,
    so a mid-iteration failure still leaves the partially built set cached.
    Returns an empty set when ``gen_extractors`` is unavailable.
    """
    global _SUPPORTED_DOMAINS
    # Fast path: already built (possibly empty) — reuse the cached set.
    if _SUPPORTED_DOMAINS is not None:
        return _SUPPORTED_DOMAINS
    _SUPPORTED_DOMAINS = set()
    if gen_extractors is None:
        return _SUPPORTED_DOMAINS
    try:
        for e in gen_extractors():
            name = getattr(e, "IE_NAME", "").lower()
            # The "generic" extractor matches nearly any URL; skip it so it
            # doesn't pollute the domain set.
            if name == "generic":
                continue
            regex = getattr(e, "_VALID_URL", None)
            domains = extract_domains(regex)
            _SUPPORTED_DOMAINS.update(domains)
    except Exception:
        # Best-effort: keep whatever was collected before the failure.
        pass
    return _SUPPORTED_DOMAINS
def _get_nested(config: Dict[str, Any], *path: str) -> Any: def _get_nested(config: Dict[str, Any], *path: str) -> Any:
cur: Any = config cur: Any = config
@@ -122,17 +212,14 @@ def is_url_supported_by_ytdlp(url: str) -> bool:
return False return False
try: try:
for extractor in _get_extractors(): parsed = urlparse(url)
try: domain = parsed.netloc.lower()
if not extractor.suitable(url): if not domain:
continue return False
except Exception: supported = _build_supported_domains()
continue for base in supported:
if domain == base or domain.endswith("." + base):
name = getattr(extractor, "IE_NAME", "").lower() return True
if name == "generic":
continue
return True
except Exception: except Exception:
return False return False
@@ -593,16 +680,22 @@ class YtDlpTool:
# Defaulting to 'chrome' as the most common path. # Defaulting to 'chrome' as the most common path.
base_options["cookiesfrombrowser"] = "chrome" base_options["cookiesfrombrowser"] = "chrome"
# Special handling for format keywords
if opts.ytdl_format == "audio":
opts = opts._replace(mode="audio", ytdl_format=None)
elif opts.ytdl_format == "video":
opts = opts._replace(mode="video", ytdl_format=None)
if opts.no_playlist: if opts.no_playlist:
base_options["noplaylist"] = True base_options["noplaylist"] = True
fmt = opts.ytdl_format or self.default_format(opts.mode) fmt = opts.ytdl_format or self.default_format(opts.mode)
base_options["format"] = fmt base_options["format"] = fmt
# if opts.mode == "audio": if opts.mode == "audio":
# base_options["postprocessors"] = [{ base_options["postprocessors"] = [{
# "key": "FFmpegExtractAudio" "key": "FFmpegExtractAudio"
# }] }]
if opts.mode != "audio": if opts.mode != "audio":
format_sort = self.defaults.format_sort or [ format_sort = self.defaults.format_sort or [