This commit is contained in:
2026-01-20 16:42:49 -08:00
parent 1e2054189b
commit 922b649e17
9 changed files with 351 additions and 141 deletions

View File

@@ -25,7 +25,7 @@ from SYS.models import DebugLogger, DownloadError, DownloadMediaResult, Progress
from SYS.utils import ensure_directory, sha256_file
try: # Optional; used for metadata extraction when available
from SYS.metadata import extract_ytdlp_tags
from SYS.yt_metadata import extract_ytdlp_tags
except Exception: # pragma: no cover - optional dependency
extract_ytdlp_tags = None # type: ignore[assignment]
@@ -56,7 +56,7 @@ def _resolve_verify_value(verify_ssl: bool) -> Union[bool, str]:
mod = sys.modules.get(mod_name)
if mod is None:
mod = __import__(mod_name)
except Exception:
except (ImportError, ModuleNotFoundError):
return None
# Common APIs that return a bundle path

View File

@@ -353,7 +353,7 @@
"filedot\\.(xyz|to|top)/([0-9a-zA-Z]{12})"
],
"regexp": "filedot\\.(xyz|to|top)/([0-9a-zA-Z]{12})",
"status": false
"status": true
},
"filefactory": {
"name": "filefactory",
@@ -622,7 +622,7 @@
"(simfileshare\\.net/download/[0-9]+/)"
],
"regexp": "(simfileshare\\.net/download/[0-9]+/)",
"status": false
"status": true
},
"streamtape": {
"name": "streamtape",

View File

@@ -852,7 +852,7 @@ class YtdlpMetadataProvider(MetadataProvider):
tags: List[str] = []
try:
from SYS.metadata import extract_ytdlp_tags
from SYS.yt_metadata import extract_ytdlp_tags
except Exception:
extract_ytdlp_tags = None # type: ignore[assignment]

View File

@@ -7,7 +7,7 @@ from SYS.logger import log, debug
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple
from API.HydrusNetwork import apply_hydrus_tag_mutation, fetch_hydrus_metadata, fetch_hydrus_metadata_by_url
from SYS.yt_metadata import extract_ytdlp_tags
try: # Optional; used when available for richer metadata fetches
import yt_dlp
@@ -918,72 +918,7 @@ def apply_tag_mutation(payload: Dict[str,
}
def extract_ytdlp_tags(entry: Dict[str, Any]) -> List[str]:
    """Convert a yt-dlp info dict into a flat list of searchable tags.

    Descriptive metadata fields are emitted as ``namespace:value`` pairs,
    and the ``tags`` field is flattened: list items become freeform tags,
    dict entries become ``tag:key:value``, anything else is added as one
    freeform tag.  Relies on the module-level ``value_normalize`` and
    ``_add_tag`` helpers for normalization and de-duplication.

    :param entry: a yt-dlp info dict (one playlist entry or a single video).
    :returns: ordered list of normalized tag strings, without duplicates.
    """
    tags: List[str] = []
    seen_namespaces: Set[str] = set()
    # Meaningful yt-dlp fields that should become tags.
    # This mapping excludes technical fields: filesize, duration,
    # format_id, vcodec, acodec, ext, etc.
    field_to_namespace = {
        "artist": "artist",
        "album": "album",
        "creator": "creator",
        "uploader": "creator",  # Map uploader to creator (deduplicate)
        "uploader_id": "creator",
        "channel": "channel",
        "genre": "genre",
        "track": "track",
        "track_number": "track_number",
        "release_date": "release_date",
        "upload_date": "upload_date",
        "title": "title",
        "license": "license",
        "location": "location",
    }
    # Extract simple field mappings.
    for yt_field, namespace in field_to_namespace.items():
        value = entry.get(yt_field)
        if value is not None:
            value_str = value_normalize(str(value))
            if value_str:
                # Prevent duplicate creator tags (only the first of
                # creator/uploader/uploader_id that yields a value wins).
                if namespace == "creator":
                    if "creator" in seen_namespaces:
                        continue
                    seen_namespaces.add("creator")
                _add_tag(tags, namespace, value_str)
    # Handle the "tags" field specially (could be list, dict, or string).
    # For list/sequence tags, capture as freeform (no namespace prefix).
    tags_field = entry.get("tags")
    if tags_field is not None:
        if isinstance(tags_field, list):
            # Tags is list: ["tag1", "tag2", ...] -> capture as freeform
            # tags (no "tag:" prefix).  These are typically genre/category
            # tags from the source (BandCamp genres, etc.).
            for tag_value in tags_field:
                if tag_value:
                    normalized = value_normalize(str(tag_value))
                    if normalized and normalized not in tags:
                        tags.append(normalized)
        elif isinstance(tags_field, dict):
            # Tags is dict: {"key": "val"} -> tag:key:val
            for key, val in tags_field.items():
                if key and val:
                    key_normalized = value_normalize(str(key))
                    val_normalized = value_normalize(str(val))
                    if key_normalized and val_normalized:
                        _add_tag(tags, f"tag:{key_normalized}", val_normalized)
        else:
            # Tags is string or other: add as a single freeform tag
            # (note: no splitting on delimiters in this version).
            if tags_field:
                normalized = value_normalize(str(tags_field))
                if normalized and normalized not in tags:
                    tags.append(normalized)
    return tags
def dedup_tags_by_namespace(tags: List[str], keep_first: bool = True) -> List[str]:

View File

@@ -2,6 +2,7 @@
import datetime
import hashlib
import inspect
import json
import os
import shutil
@@ -996,13 +997,22 @@ class PipelineLiveProgress:
def stop(self) -> None:
# Safe to call whether Live is running or paused.
if self._live is not None:
stop_fn = self._live.stop
has_clear = False
try:
signature = inspect.signature(stop_fn)
has_clear = "clear" in signature.parameters
except (ValueError, TypeError):
pass
try:
self._live.stop(clear=True)
except TypeError:
self._live.stop()
if has_clear:
stop_fn(clear=True)
else:
stop_fn()
except Exception:
self._live.stop()
try:
stop_fn()
except Exception:
pass

View File

@@ -2313,6 +2313,9 @@ class PipelineExecutor:
if name in {"get-relationship",
"get-rel"}:
continue
if name in {"get-metadata",
"meta"}:
continue
# `.pipe` (MPV) is an interactive launcher; disable pipeline Live progress
# for it because it doesn't meaningfully "complete" (mpv may keep running)
# and Live output interferes with MPV playlist UI.

102
SYS/yt_metadata.py Normal file
View File

@@ -0,0 +1,102 @@
import re
from typing import Any, Dict, List, Set
def value_normalize(value: Any) -> str:
    """Normalize an arbitrary value to a stripped, lowercase string.

    Returns ``""`` when nothing remains after stripping whitespace.
    """
    stripped = str(value).strip()
    if not stripped:
        return ""
    return stripped.lower()
def _add_tag(tags: List[str], namespace: str, value: str) -> None:
"""Add a namespaced tag if not already present."""
if not namespace or not value:
return
normalized_value = value_normalize(value)
if not normalized_value:
return
candidate = f"{namespace}:{normalized_value}"
if candidate not in tags:
tags.append(candidate)
def extract_ytdlp_tags(entry: Dict[str, Any]) -> List[str]:
    """Convert a yt-dlp info dict into a flat list of namespaced tags.

    Descriptive metadata fields (artist, album, title, ...) become
    ``namespace:value`` tags; the several uploader-ish fields collapse
    onto a single ``creator`` tag (first one wins).  The ``tags`` field
    is flattened: list items become freeform tags, dict entries become
    ``tag:key:value``, and a plain string is split on commas/whitespace.
    Chapter titles are appended as freeform tags.  Technical fields
    (filesize, duration, codecs, format ids, ...) are ignored.
    """
    out: List[str] = []

    def _norm(raw: Any) -> str:
        # Stripped, lowercased text; "" when nothing remains.
        text = str(raw).strip()
        return text.lower() if text else ""

    def _add_freeform(raw: Any) -> None:
        token = _norm(raw)
        if token and token not in out:
            out.append(token)

    def _add_namespaced(namespace: str, raw: str) -> None:
        if not namespace or not raw:
            return
        token = _norm(raw)
        if not token:
            return
        candidate = f"{namespace}:{token}"
        if candidate not in out:
            out.append(candidate)

    # Descriptive yt-dlp fields worth keeping, in emission order.
    field_map = (
        ("artist", "artist"),
        ("album", "album"),
        ("creator", "creator"),
        ("uploader", "creator"),      # uploader-ish fields share "creator"
        ("uploader_id", "creator"),
        ("channel", "channel"),
        ("genre", "genre"),
        ("track", "track"),
        ("track_number", "track_number"),
        ("release_date", "release_date"),
        ("upload_date", "upload_date"),
        ("title", "title"),
        ("license", "license"),
        ("location", "location"),
    )
    creator_taken = False
    for field, namespace in field_map:
        raw = entry.get(field)
        if raw is None:
            continue
        normalized = _norm(raw)
        if not normalized:
            continue
        if namespace == "creator":
            # Only the first creator-producing field contributes a tag.
            if creator_taken:
                continue
            creator_taken = True
        _add_namespaced(namespace, normalized)

    # "tags" may be a list (freeform), a dict (tag:key:value) or a
    # delimited string (split on commas/whitespace into freeform tags).
    tags_field = entry.get("tags")
    if tags_field is not None:
        if isinstance(tags_field, list):
            for item in tags_field:
                if item:
                    _add_freeform(item)
        elif isinstance(tags_field, dict):
            for key, val in tags_field.items():
                if key and val:
                    key_norm = _norm(key)
                    val_norm = _norm(val)
                    if key_norm and val_norm:
                        _add_namespaced(f"tag:{key_norm}", val_norm)
        else:
            blob = str(tags_field).strip()
            if blob:
                for piece in re.split(r'[,\s]+', blob):
                    if piece.strip():
                        _add_freeform(piece)

    # Chapter titles become freeform tags as well.
    chapters = entry.get("chapters")
    if chapters and isinstance(chapters, list):
        for chapter in chapters:
            if isinstance(chapter, dict) and chapter.get("title"):
                _add_freeform(chapter["title"])
    return out

View File

@@ -358,10 +358,25 @@ def _emit_tags_as_table(
# Store the table and items in history so @.. works to go back
# Use overlay mode so it doesn't push the previous search to history stack
# This makes get-tag behave like a transient view
table_applied = False
try:
ctx.set_last_result_table_overlay(table, tag_items, subject)
table_applied = True
except AttributeError:
try:
ctx.set_last_result_table(table, tag_items, subject)
table_applied = True
except Exception:
table_applied = False
except Exception:
table_applied = False
if table_applied:
try:
if hasattr(ctx, "set_current_stage_table"):
ctx.set_current_stage_table(table)
except Exception:
pass
# Note: CLI will handle displaying the table via ResultTable formatting
@@ -776,7 +791,7 @@ def _scrape_url_metadata(
import json as json_module
try:
from SYS.metadata import extract_ytdlp_tags
from SYS.yt_metadata import extract_ytdlp_tags
except ImportError:
extract_ytdlp_tags = None
@@ -1613,6 +1628,33 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
if isinstance(result, list) and len(result) > 0:
result = result[0]
try:
display_subject = ctx.get_last_result_subject()
except Exception:
display_subject = None
def _value_has_content(value: Any) -> bool:
if value is None:
return False
if isinstance(value, str):
return bool(value.strip())
if isinstance(value, (list, tuple, set)):
return len(value) > 0
return True
def _resolve_subject_value(*keys: str) -> Any:
    """Return the first non-empty value found for any of *keys*.

    Checks the current ``result`` first, then falls back to the
    ``display_subject`` captured from history (both closure state of the
    enclosing function).  "Non-empty" is decided by ``_value_has_content``.
    Returns ``None`` when no key yields content in either source.
    """
    for key in keys:
        val = get_field(result, key, None)
        if _value_has_content(val):
            return val
    if display_subject is None:
        return None
    # Fallback: look the same keys up on the previously displayed subject.
    for key in keys:
        val = get_field(display_subject, key, None)
        if _value_has_content(val):
            return val
    return None
# If the current result already carries a tag list (e.g. a selected metadata
# row from get-tag -scrape itunes), APPLY those tags to the file in the store.
result_provider = get_field(result, "provider", None)
@@ -1726,7 +1768,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
)
return 0
hash_from_result = normalize_hash(get_field(result, "hash", None))
hash_from_result = normalize_hash(_resolve_subject_value("hash"))
file_hash = hash_override or hash_from_result
# Only use emit mode if explicitly requested with --emit flag, not just because we're in a pipeline
# This allows interactive REPL to work even in pipelines
@@ -1734,7 +1776,8 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
store_label = store_key.strip() if store_key and store_key.strip() else None
# Get hash and store from result
store_name = get_field(result, "store")
store_value = _resolve_subject_value("store")
store_name = str(store_value).strip() if store_value is not None else None
if not file_hash:
log("No hash available in result", file=sys.stderr)
@@ -1744,6 +1787,68 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
log("No store specified in result", file=sys.stderr)
return 1
item_title = (
_resolve_subject_value("title", "name", "filename")
)
subject_store = store_name
subject_path_value = (
_resolve_subject_value("path", "target", "filename")
)
subject_path = None
if subject_path_value is not None:
try:
subject_path = str(subject_path_value)
except Exception:
subject_path = None
service_name = ""
subject_payload_base: Dict[str, Any] = {
"tag": [],
"title": item_title,
"name": item_title,
"store": subject_store,
"service_name": service_name,
"extra": {
"tag": [],
},
}
if file_hash:
subject_payload_base["hash"] = file_hash
if subject_path:
subject_payload_base["path"] = subject_path
def _subject_payload_with(
    tags: Sequence[str],
    service_name_override: Optional[str] = None,
) -> Dict[str, Any]:
    """Clone the base subject payload with *tags* filled in.

    The tag list is mirrored under both ``payload["tag"]`` and
    ``payload["extra"]["tag"]``.  ``service_name_override``, when given,
    replaces the default ``service_name`` from the base payload.
    """
    # Shallow copy is fine: the nested "extra" dict is replaced below,
    # so the base payload's "extra" is never mutated.
    payload = dict(subject_payload_base)
    payload["tag"] = list(tags)
    extra = {"tag": list(tags)}
    payload["extra"] = extra
    if service_name_override is not None:
        payload["service_name"] = service_name_override
    return payload
raw_result_tags = get_field(result, "tag", None)
if not isinstance(raw_result_tags, list):
raw_result_tags = get_field(result, "tags", None)
display_tags: List[str] = []
if isinstance(raw_result_tags, list):
display_tags = [str(t) for t in raw_result_tags if t is not None]
if display_tags and not emit_mode:
subject_payload = _subject_payload_with(display_tags)
_emit_tags_as_table(
display_tags,
file_hash=file_hash,
store=str(subject_store),
service_name=None,
config=config,
item_title=item_title,
path=subject_path,
subject=subject_payload,
)
return 0
# Get tags using storage backend
try:
from Store import Store
@@ -1761,56 +1866,18 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
log(f"Failed to get tags: {exc}", file=sys.stderr)
return 1
# Always output to ResultTable (pipeline mode only)
# Extract title for table header
item_title = (
get_field(result,
"title",
None) or get_field(result,
"name",
None) or get_field(result,
"filename",
None)
subject_payload = _subject_payload_with(
current,
service_name if source == "hydrus" else None,
)
# Build a subject payload representing the file whose tags are being shown
subject_store = get_field(result, "store", None) or store_name
subject_path = (
get_field(result,
"path",
None) or get_field(result,
"target",
None) or get_field(result,
"filename",
None)
)
subject_payload: Dict[str,
Any] = {
"tag": list(current),
"title": item_title,
"name": item_title,
"store": subject_store,
"service_name": service_name,
"extra": {
"tag": list(current),
},
}
if file_hash:
subject_payload["hash"] = file_hash
if subject_path:
try:
subject_payload["path"] = str(subject_path)
except Exception:
pass
_emit_tags_as_table(
current,
file_hash=file_hash,
store=subject_store,
store=str(subject_store),
service_name=service_name if source == "hydrus" else None,
config=config,
item_title=item_title,
path=str(subject_path) if subject_path else None,
path=subject_path,
subject=subject_payload,
)

View File

@@ -27,7 +27,7 @@ from SYS.models import (
)
from SYS.pipeline_progress import PipelineProgress
from SYS.utils import ensure_directory, sha256_file
from SYS.metadata import extract_ytdlp_tags
from SYS.yt_metadata import extract_ytdlp_tags
_YTDLP_TRANSFER_STATE: Dict[str, Dict[str, Any]] = {}
@@ -44,6 +44,96 @@ else:
_EXTRACTOR_CACHE: List[Any] | None = None
# Patterns for domain extraction from yt-dlp regexes
# 1) Alternation group followed by \.tld e.g. (?:youtube|youtu|youtube-nocookie)\.com
ALT_GROUP_TLD = re.compile(r'\((?:\?:)?([^\)]+)\)\\\.(?P<tld>[A-Za-z0-9.+-]+)')
# 2) Literal domain pieces like youtube\.com or youtu\.be (not preceded by a group)
LITERAL_DOMAIN = re.compile(r'(?<!\()(?<!\|)(?<!:)([A-Za-z0-9][A-Za-z0-9_-]{0,})\\\.([A-Za-z0-9.+-]+)')
# 3) Partial domain tokens that appear alone (e.g., zhihu) — treat as zhihu.com fallback
PARTIAL_TOKEN = re.compile(r'(?<![A-Za-z0-9_-])([A-Za-z0-9][A-Za-z0-9_-]{1,})(?=(?:\\?[/\)\$]|\\\.|$))')
_SUPPORTED_DOMAINS: set[str] | None = None
def normalize_patterns(valid_url) -> List[str]:
    """Coerce an extractor ``_VALID_URL`` value into a list of patterns.

    Accepts a single pattern string, a list/tuple of patterns (non-string
    members are dropped), or anything falsy/unsupported, which yields an
    empty list.
    """
    if not valid_url:
        return []
    if isinstance(valid_url, str):
        return [valid_url]
    if not isinstance(valid_url, (list, tuple)):
        return []
    return [pattern for pattern in valid_url if isinstance(pattern, str)]
def extract_from_pattern(pat: str) -> set[str]:
    """Heuristically pull domain names out of one yt-dlp ``_VALID_URL`` regex.

    Applies the module-level patterns in order: alternation groups before a
    TLD (``(?:youtube|youtu)\\.com``), literal escaped domains
    (``youtube\\.com``), and — only when the first two find nothing — bare
    tokens that are assumed to be ``<token>.com``.  Results are lowercased.
    """
    domains = set()
    # 1) Alternation groups followed by .tld
    for alt_group, tld in ALT_GROUP_TLD.findall(pat):
        # alt_group like "youtube|youtu|youtube-nocookie"
        for alt in alt_group.split('|'):
            alt = alt.strip()
            # remove any non-domain tokens like (?:www\.)? if present inside alt (rare)
            alt = re.sub(r'\(\?:www\\\.\)\?', '', alt)
            if alt:
                domains.add(f"{alt}.{tld}".lower())
    # 2) Literal domain matches (youtube\.com)
    for name, tld in LITERAL_DOMAIN.findall(pat):
        domains.add(f"{name}.{tld}".lower())
    # 3) Partial-token fallback (only if we didn't already capture domains).
    # This helps when regexes contain plain tokens like 'zhihu' or 'vimeo'
    # without an explicit .com — the .com suffix here is a guess.
    if not domains:
        for token in PARTIAL_TOKEN.findall(pat):
            # ignore common regex words that are not domains
            if len(token) <= 2:
                continue
            # avoid tokens that are clearly regex/URL scaffolding
            if token.lower() in {"https", "http", "www", "com", "net", "org"}:
                continue
            domains.add(f"{token.lower()}.com")
    return domains
def extract_domains(valid_url) -> set[str]:
    """Collect every candidate domain from an extractor's URL pattern(s).

    Normalizes ``valid_url`` into a pattern list, harvests domains from
    each pattern via ``extract_from_pattern``, then drops degenerate
    results such as ``com.com`` where the label before the TLD repeats
    the TLD itself.
    """
    harvested: set[str] = set()
    for pattern in normalize_patterns(valid_url):
        harvested.update(extract_from_pattern(pattern))

    def _is_junk(domain: str) -> bool:
        # e.g. "com.com": label immediately before the TLD equals the TLD.
        labels = domain.split('.')
        return len(labels) >= 2 and labels[-2] == labels[-1]

    return {domain for domain in harvested if not _is_junk(domain)}
def _build_supported_domains() -> set[str]:
    """Build (once) and return the set of domains yt-dlp extractors support.

    The result is cached in the module-level ``_SUPPORTED_DOMAINS``;
    subsequent calls return the cached set.  When ``gen_extractors`` is
    unavailable (yt-dlp not importable), an empty set is cached and
    returned.  The "generic" extractor is excluded because it matches
    nearly any URL.  Enumeration is best-effort: any failure leaves the
    set partially filled rather than raising.
    """
    global _SUPPORTED_DOMAINS
    if _SUPPORTED_DOMAINS is not None:
        return _SUPPORTED_DOMAINS
    _SUPPORTED_DOMAINS = set()
    if gen_extractors is None:
        return _SUPPORTED_DOMAINS
    try:
        for e in gen_extractors():
            name = getattr(e, "IE_NAME", "").lower()
            if name == "generic":
                continue
            # _VALID_URL may be a string, a sequence, or absent;
            # extract_domains handles all of these.
            regex = getattr(e, "_VALID_URL", None)
            domains = extract_domains(regex)
            _SUPPORTED_DOMAINS.update(domains)
    except Exception:
        # Deliberate best-effort: keep whatever was collected so far.
        pass
    return _SUPPORTED_DOMAINS
def _get_nested(config: Dict[str, Any], *path: str) -> Any:
cur: Any = config
@@ -122,16 +212,13 @@ def is_url_supported_by_ytdlp(url: str) -> bool:
return False
try:
for extractor in _get_extractors():
try:
if not extractor.suitable(url):
continue
except Exception:
continue
name = getattr(extractor, "IE_NAME", "").lower()
if name == "generic":
continue
parsed = urlparse(url)
domain = parsed.netloc.lower()
if not domain:
return False
supported = _build_supported_domains()
for base in supported:
if domain == base or domain.endswith("." + base):
return True
except Exception:
return False
@@ -593,16 +680,22 @@ class YtDlpTool:
# Defaulting to 'chrome' as the most common path.
base_options["cookiesfrombrowser"] = "chrome"
# Special handling for format keywords
if opts.ytdl_format == "audio":
opts = opts._replace(mode="audio", ytdl_format=None)
elif opts.ytdl_format == "video":
opts = opts._replace(mode="video", ytdl_format=None)
if opts.no_playlist:
base_options["noplaylist"] = True
fmt = opts.ytdl_format or self.default_format(opts.mode)
base_options["format"] = fmt
# if opts.mode == "audio":
# base_options["postprocessors"] = [{
# "key": "FFmpegExtractAudio"
# }]
if opts.mode == "audio":
base_options["postprocessors"] = [{
"key": "FFmpegExtractAudio"
}]
if opts.mode != "audio":
format_sort = self.defaults.format_sort or [