f
This commit is contained in:
379
SYS/metadata.py
379
SYS/metadata.py
@@ -17,6 +17,14 @@ try: # Optional; used for IMDb lookup without API key
|
||||
from imdbinfo.services import search_title # type: ignore
|
||||
except Exception: # pragma: no cover - optional dependency
|
||||
search_title = None # type: ignore[assignment]
|
||||
try:
|
||||
import mutagen
|
||||
except ImportError:
|
||||
mutagen = None
|
||||
try:
|
||||
import musicbrainzngs
|
||||
except ImportError:
|
||||
musicbrainzngs = None
|
||||
|
||||
|
||||
def value_normalize(value: Any) -> str:
|
||||
@@ -93,6 +101,52 @@ def _sanitize_url(value: Optional[str]) -> Optional[str]:
|
||||
return cleaned
|
||||
|
||||
|
||||
def sanitize_metadata_value(value: Any) -> str:
    """Convert an arbitrary metadata value into a single-line string.

    ``None`` becomes the empty string. Lists and tuples are flattened by
    joining the ``str()`` of every truthy element with ``", "``. Anything
    else goes through ``str()``. The text is then stripped of surrounding
    whitespace and every remaining newline or carriage return is replaced
    by a space.
    """
    if value is None:
        return ""

    if isinstance(value, (list, tuple)):
        # Falsy elements (None, "", 0, ...) are left out of the joined text.
        text = ", ".join([str(part) for part in value if part])
    else:
        text = str(value)

    # Strip first, then map each line-break character to one space.
    line_breaks_to_spaces = str.maketrans({"\n": " ", "\r": " "})
    return text.strip().translate(line_breaks_to_spaces)
|
||||
|
||||
|
||||
def unique_preserve_order(items: Iterable[Any]) -> list[Any]:
    """Return the distinct elements of ``items`` in first-seen order.

    Hashable elements are tracked in a set. Elements that cannot be hashed
    (lists, dicts, ...) previously made the function raise ``TypeError``;
    they are now de-duplicated by equality with a linear scan, so any
    iterable of comparable values is accepted.

    Args:
        items: Any iterable; it is consumed exactly once.

    Returns:
        A new list holding the first occurrence of every distinct element.
    """
    seen_hashable: set[Any] = set()
    seen_unhashable: list[Any] = []
    result: list[Any] = []
    for item in items:
        try:
            if item in seen_hashable:
                continue
            seen_hashable.add(item)
        except TypeError:
            # Unhashable element: fall back to an equality scan over the
            # unhashable values seen so far.
            if item in seen_unhashable:
                continue
            seen_unhashable.append(item)
        result.append(item)
    return result
|
||||
|
||||
|
||||
def fetch_musicbrainz_tags(mbid: str, entity: str = "release") -> Dict[str, Any]:
    """Fetch the tag names MusicBrainz lists for one entity.

    Args:
        mbid: MusicBrainz identifier passed to the lookup call.
        entity: ``"release"``, ``"recording"`` or ``"artist"``; any other
            value yields an empty result without a lookup.

    Returns:
        ``{"tag": [...]}`` holding the ``name`` of each entry in the
        response's ``tag-list``. The list is empty when ``musicbrainzngs``
        is unavailable or the entity kind is unknown. A failed lookup is
        reported through ``debug`` and whatever names were collected before
        the failure are returned.
    """
    if not musicbrainzngs:
        return {"tag": []}

    musicbrainzngs.set_useragent("Medeia-Macina", "0.1")

    # Resolve the entity kind by equality against the supported kinds.
    kind = next(
        (known for known in ("release", "recording", "artist") if entity == known),
        None,
    )
    if kind is None:
        return {"tag": []}

    names: list[str] = []
    try:
        # The lookup function and the response key both follow the kind,
        # e.g. get_release_by_id(...) -> {"release": {"tag-list": [...]}}.
        lookup = getattr(musicbrainzngs, f"get_{kind}_by_id")
        response = lookup(mbid, includes=["tags"])
        for raw in response.get(kind, {}).get("tag-list", []):
            if isinstance(raw, dict) and "name" in raw:
                names.append(raw["name"])
    except Exception as exc:
        debug(f"MusicBrainz lookup failed: {exc}")

    return {"tag": names}
|
||||
|
||||
|
||||
def _clean_existing_tags(existing: Any) -> List[str]:
|
||||
tags: List[str] = []
|
||||
seen: Set[str] = set()
|
||||
@@ -601,7 +655,7 @@ def write_tags(
|
||||
|
||||
# Write via consolidated function
|
||||
try:
|
||||
lines = []
|
||||
lines: List[str] = []
|
||||
lines.extend(str(tag).strip().lower() for tag in tag_list if str(tag).strip())
|
||||
|
||||
if lines:
|
||||
@@ -2415,11 +2469,6 @@ def scrape_url_metadata(
|
||||
try:
|
||||
import json as json_module
|
||||
|
||||
try:
|
||||
from SYS.metadata import extract_ytdlp_tags
|
||||
except ImportError:
|
||||
extract_ytdlp_tags = None
|
||||
|
||||
# Build yt-dlp command with playlist support
|
||||
# IMPORTANT: Do NOT use --flat-playlist! It strips metadata like artist, album, uploader, genre
|
||||
# Without it, yt-dlp gives us full metadata in an 'entries' array within a single JSON object
|
||||
@@ -2462,14 +2511,13 @@ def scrape_url_metadata(
|
||||
# is_playlist = 'entries' in data and isinstance(data.get('entries'), list)
|
||||
|
||||
# Extract tags and playlist items
|
||||
tags = []
|
||||
playlist_items = []
|
||||
tags: List[str] = []
|
||||
playlist_items: List[Dict[str, Any]] = []
|
||||
|
||||
# IMPORTANT: Extract album/playlist-level tags FIRST (before processing entries)
|
||||
# This ensures we get metadata about the collection, not just individual tracks
|
||||
if extract_ytdlp_tags:
|
||||
album_tags = extract_ytdlp_tags(data)
|
||||
tags.extend(album_tags)
|
||||
album_tags = extract_ytdlp_tags(data)
|
||||
tags.extend(album_tags)
|
||||
|
||||
# Case 1: Entries are nested in the main object (standard playlist structure)
|
||||
if "entries" in data and isinstance(data.get("entries"), list):
|
||||
@@ -2493,41 +2541,40 @@ def scrape_url_metadata(
|
||||
|
||||
# Extract tags from each entry and merge (but don't duplicate album-level tags)
|
||||
# Only merge entry tags that are multi-value prefixes (not single-value like title:, artist:, etc.)
|
||||
if extract_ytdlp_tags:
|
||||
entry_tags = extract_ytdlp_tags(entry)
|
||||
entry_tags = extract_ytdlp_tags(entry)
|
||||
|
||||
# Single-value namespaces that should not be duplicated from entries
|
||||
single_value_namespaces = {
|
||||
"title",
|
||||
"artist",
|
||||
"album",
|
||||
"creator",
|
||||
"channel",
|
||||
"release_date",
|
||||
"upload_date",
|
||||
"license",
|
||||
"location",
|
||||
}
|
||||
# Single-value namespaces that should not be duplicated from entries
|
||||
single_value_namespaces = {
|
||||
"title",
|
||||
"artist",
|
||||
"album",
|
||||
"creator",
|
||||
"channel",
|
||||
"release_date",
|
||||
"upload_date",
|
||||
"license",
|
||||
"location",
|
||||
}
|
||||
|
||||
for tag in entry_tags:
|
||||
# Extract the namespace (part before the colon)
|
||||
tag_namespace = tag.split(":",
|
||||
1)[0].lower(
|
||||
) if ":" in tag else None
|
||||
for tag in entry_tags:
|
||||
# Extract the namespace (part before the colon)
|
||||
tag_namespace = tag.split(":",
|
||||
1)[0].lower(
|
||||
) if ":" in tag else None
|
||||
|
||||
# Skip if this namespace already exists in tags (from album level)
|
||||
if tag_namespace and tag_namespace in single_value_namespaces:
|
||||
# Check if any tag with this namespace already exists in tags
|
||||
already_has_namespace = any(
|
||||
t.split(":",
|
||||
1)[0].lower() == tag_namespace for t in tags
|
||||
if ":" in t
|
||||
)
|
||||
if already_has_namespace:
|
||||
continue # Skip this tag, keep the album-level one
|
||||
# Skip if this namespace already exists in tags (from album level)
|
||||
if tag_namespace and tag_namespace in single_value_namespaces:
|
||||
# Check if any tag with this namespace already exists in tags
|
||||
already_has_namespace = any(
|
||||
t.split(":",
|
||||
1)[0].lower() == tag_namespace for t in tags
|
||||
if ":" in t
|
||||
)
|
||||
if already_has_namespace:
|
||||
continue # Skip this tag, keep the album-level one
|
||||
|
||||
if tag not in tags: # Avoid exact duplicates
|
||||
tags.append(tag)
|
||||
if tag not in tags: # Avoid exact duplicates
|
||||
tags.append(tag)
|
||||
|
||||
# Case 2: Playlist detected by playlist_count field (BandCamp albums, etc.)
|
||||
# These need a separate call with --flat-playlist to get the actual entries
|
||||
@@ -2586,7 +2633,7 @@ def scrape_url_metadata(
|
||||
pass # Silently ignore if we can't get playlist entries
|
||||
|
||||
# Fallback: if still no tags detected, get from first item
|
||||
if not tags and extract_ytdlp_tags:
|
||||
if not tags:
|
||||
tags = extract_ytdlp_tags(data)
|
||||
|
||||
# Extract formats from the main data object
|
||||
@@ -2595,11 +2642,7 @@ def scrape_url_metadata(
|
||||
formats = extract_url_formats(data.get("formats", []))
|
||||
|
||||
# Deduplicate tags by namespace to prevent duplicate title:, artist:, etc.
|
||||
try:
|
||||
if dedup_tags_by_namespace:
|
||||
tags = dedup_tags_by_namespace(tags, keep_first=True)
|
||||
except Exception:
|
||||
pass # If dedup fails, return tags as-is
|
||||
tags = dedup_tags_by_namespace(tags, keep_first=True)
|
||||
|
||||
return title, tags, formats, playlist_items
|
||||
|
||||
@@ -2617,8 +2660,8 @@ def extract_url_formats(formats: list) -> List[Tuple[str, str]]:
|
||||
Returns list of (display_label, format_id) tuples.
|
||||
"""
|
||||
try:
|
||||
video_formats = {} # {resolution: format_data}
|
||||
audio_formats = {} # {quality_label: format_data}
|
||||
video_formats: Dict[str, Dict[str, Any]] = {} # {resolution: format_data}
|
||||
audio_formats: Dict[str, Dict[str, Any]] = {} # {quality_label: format_data}
|
||||
|
||||
for fmt in formats:
|
||||
vcodec = fmt.get("vcodec", "none")
|
||||
@@ -2655,7 +2698,7 @@ def extract_url_formats(formats: list) -> List[Tuple[str, str]]:
|
||||
"abr": abr,
|
||||
}
|
||||
|
||||
result = []
|
||||
result: List[Tuple[str, str]] = []
|
||||
|
||||
# Add video formats in descending resolution order
|
||||
for res in sorted(video_formats.keys(),
|
||||
@@ -2674,3 +2717,237 @@ def extract_url_formats(formats: list) -> List[Tuple[str, str]]:
|
||||
except Exception as e:
|
||||
log(f"Error extracting formats: {e}", file=sys.stderr)
|
||||
return []
|
||||
|
||||
def prepare_ffmpeg_metadata(payload: Optional[dict[str, Any]]) -> dict[str, str]:
    """Flatten a metadata payload into string fields for FFmpeg.

    Direct payload keys (``title``, ``artist``, ``album``, ``year``/``date``,
    ``comment``) are used first. Entries of the ``tags`` list then fill the
    gaps: ``namespace:value`` tags are routed by namespace, and tags without
    a namespace/value pair are treated as genres. Every stored value passes
    through ``sanitize_metadata_value`` and is truncated to a per-field
    length limit; values that sanitize to the empty string are not stored.

    Args:
        payload: Source mapping. Anything that is not a ``dict`` yields ``{}``.

    Returns:
        Mapping with any of the keys ``title``, ``artist``, ``album``,
        ``date``, ``genre``, ``keywords``, ``comment`` and ``description``.
    """
    if not isinstance(payload, dict):
        return {}

    fields: dict[str, str] = {}

    def put(key: str, raw: Any, limit: int = 2000) -> None:
        # Store the sanitized text cut to ``limit`` characters; skip empties.
        text = sanitize_metadata_value(raw)
        if text:
            fields[key] = text[:limit]

    put("title", payload.get("title"))
    put("artist", payload.get("artist"), 512)
    put("album", payload.get("album"), 512)
    explicit_date = payload.get("year") or payload.get("date")
    put("date", explicit_date, 20)

    comment = payload.get("comment")
    raw_tags = payload.get("tags")

    all_tags: list[str] = []
    artists: list[str] = []
    albums: list[str] = []
    genres: list[str] = []

    for entry in raw_tags if isinstance(raw_tags, list) else ():
        if entry is None:
            continue
        tag = (entry if isinstance(entry, str) else str(entry)).strip()
        if not tag:
            continue
        all_tags.append(tag)

        prefix, colon, remainder = tag.partition(":")
        if not (colon and remainder):
            # No "namespace:value" shape: the whole tag counts as a genre.
            # NOTE(review): the source's indentation was lost; this pairing
            # of the fallback with the namespace test should be confirmed.
            genres.append(tag)
            continue

        ns = prefix.strip().lower()
        remainder = remainder.strip()
        if ns in {"artist", "creator", "author", "performer"}:
            artists.append(remainder)
        elif ns in {"album", "series", "collection", "group"}:
            albums.append(remainder)
        elif ns in {"genre", "rating"}:
            genres.append(remainder)
        elif ns in {"comment", "description"} and not comment:
            comment = remainder
        elif ns in {"year", "date"} and not explicit_date:
            # Tag dates only apply when the payload carried no year/date.
            put("date", remainder, 20)

    # Tag-derived artist/album only fill in when no direct value was stored.
    if artists and "artist" not in fields:
        put("artist", ", ".join(unique_preserve_order(artists)[:3]), 512)
    if albums and "album" not in fields:
        put("album", unique_preserve_order(albums)[0], 512)
    if genres:
        put("genre", ", ".join(unique_preserve_order(genres)[:5]), 256)

    if all_tags:
        keywords = ", ".join(all_tags[:50])
        put("keywords", keywords, 2000)
        if not comment:
            # With no comment available, the keyword list doubles as one.
            comment = keywords

    if comment:
        comment_text = str(comment)
        put("comment", comment_text, 2000)
        put("description", comment_text, 2000)

    return fields
|
||||
|
||||
|
||||
def apply_mutagen_metadata(path: Path, metadata: dict[str, str], fmt: str) -> None:
    """Write metadata fields into a file's tags via mutagen, best effort.

    Nothing happens unless ``fmt`` is ``"audio"``, ``metadata`` is non-empty
    and the optional ``mutagen`` module was imported. Load and save failures
    are logged to stderr; per-field assignment failures are ignored. The
    file is saved only if at least one field was assigned.

    Args:
        path: File handed to ``mutagen.File(..., easy=True)``.
        metadata: Source fields; ``title``, ``artist``, ``album``, ``genre``,
            ``comment``, ``description`` and ``date`` are considered.
        fmt: Output format name; only ``"audio"`` is processed.
    """
    if fmt != "audio" or not metadata or mutagen is None:
        return

    try:
        audio = mutagen.File(path, easy=True)  # type: ignore[attr-defined]
    except Exception as exc:  # pragma: no cover - best effort only
        log(f"mutagen load failed: {exc}", file=sys.stderr)
        return
    if audio is None:
        return

    # (metadata key, tag key) pairs; "description" is written to the
    # "comment" tag after "comment" itself.
    key_pairs = (
        ("title", "title"),
        ("artist", "artist"),
        ("album", "album"),
        ("genre", "genre"),
        ("comment", "comment"),
        ("description", "comment"),
        ("date", "date"),
    )

    wrote_any = False
    for source_key, target_key in key_pairs:
        text = metadata.get(source_key)
        if not text:
            continue
        try:
            audio[target_key] = [text]
        except Exception:  # pragma: no cover - best effort only
            pass
        else:
            wrote_any = True

    if wrote_any:
        try:
            audio.save()
        except Exception as exc:  # pragma: no cover - best effort only
            log(f"mutagen save failed: {exc}", file=sys.stderr)
|
||||
|
||||
|
||||
def build_ffmpeg_command(
    ffmpeg_path: str,
    input_path: Path,
    output_path: Path,
    fmt: str,
    max_width: int,
    metadata: Optional[dict[str, str]] = None,
) -> list[str]:
    """Assemble the FFmpeg argument list for converting one file.

    The command is ``ffmpeg -y -i <input>``, followed by an optional width
    cap (``mp4``/``webm`` only, when ``max_width`` is positive), one
    ``-metadata key=value`` pair per metadata entry, the codec/container
    arguments for ``fmt``, and finally the output path.

    Args:
        ffmpeg_path: Executable placed first in the command.
        input_path: Source file.
        output_path: Destination file.
        fmt: One of ``mp4``, ``webm``, ``mp3``, ``flac``, ``wav``, ``aac``,
            ``m4a``, ``ogg``, ``opus``, ``audio`` (legacy name for mp3) or
            ``copy`` (no codec arguments are added).
        max_width: Maximum video width; ignored when falsy or non-positive.
        metadata: Optional fields emitted as ``-metadata`` arguments.

    Returns:
        The argument list, ready to pass to a process launcher.

    Raises:
        ValueError: If ``fmt`` is not one of the supported names.
    """
    mp3_args = ["-vn", "-c:a", "libmp3lame", "-b:a", "192k", "-f", "mp3"]

    # Codec/container arguments per output format.
    format_args: dict[str, list[str]] = {
        # Video formats
        "mp4": [
            "-c:v", "libx265",
            "-preset", "medium",
            "-crf", "26",
            "-tag:v", "hvc1",
            "-pix_fmt", "yuv420p",
            "-c:a", "aac",
            "-b:a", "192k",
            "-movflags", "+faststart",
        ],
        "webm": [
            "-c:v", "libvpx-vp9",
            "-b:v", "0",
            "-crf", "32",
            "-c:a", "libopus",
            "-b:a", "160k",
            "-f", "webm",
        ],
        # Audio formats
        "mp3": mp3_args,
        "flac": ["-vn", "-c:a", "flac", "-f", "flac"],
        "wav": ["-vn", "-c:a", "pcm_s16le", "-f", "wav"],
        "aac": ["-vn", "-c:a", "aac", "-b:a", "192k", "-f", "adts"],
        "m4a": ["-vn", "-c:a", "aac", "-b:a", "192k", "-f", "ipod"],
        "ogg": ["-vn", "-c:a", "libvorbis", "-b:a", "192k", "-f", "ogg"],
        "opus": ["-vn", "-c:a", "libopus", "-b:a", "192k", "-f", "opus"],
        # Legacy format name for mp3
        "audio": mp3_args,
        # "copy" contributes no codec arguments
        "copy": [],
    }

    command = [ffmpeg_path, "-y", "-i", str(input_path)]

    wants_scaling = fmt in {"mp4", "webm"} and max_width and max_width > 0
    if wants_scaling:
        command += ["-vf", f"scale='min({max_width},iw)':-2"]

    for key, value in (metadata or {}).items():
        command += ["-metadata", f"{key}={value}"]

    if fmt not in format_args:
        raise ValueError(f"Unsupported format: {fmt}")
    command += format_args[fmt]

    command.append(str(output_path))
    return command
|
||||
|
||||
|
||||
Reference in New Issue
Block a user