import json
import re
import subprocess
import sys
import shutil
from SYS.logger import log, debug
from urllib.parse import urlsplit, urlunsplit, unquote
from collections import deque
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple

from models import FileRelationshipTracker

try:
    import musicbrainzngs  # type: ignore
except ImportError:  # pragma: no cover
    musicbrainzngs = None
from imdbinfo.services import get_movie  # type: ignore
try:
    import yt_dlp  # type: ignore
except ImportError:  # pragma: no cover
    yt_dlp = None
try:
    from SYS.config import load_config, resolve_output_dir  # type: ignore
except ImportError:  # pragma: no cover
    load_config = None  # type: ignore[assignment]
    resolve_output_dir = None  # type: ignore[assignment]
try:
    from SYS.utils import sha256_file
except ImportError:  # pragma: no cover
    sha256_file = None  # type: ignore[assignment]
try:  # Optional metadata helper for audio files
    import mutagen  # type: ignore
except ImportError:  # pragma: no cover - best effort
    mutagen = None  # type: ignore
from SYS.utils import sanitize_metadata_value, unique_preserve_order
try:
    from helpers.hydrus import HydrusClient, HydrusRequestError, HydrusRequestSpec  # type: ignore
except ImportError:  # pragma: no cover
    HydrusClient = None  # type: ignore[assignment]
    HydrusRequestError = RuntimeError  # type: ignore[assignment]
    HydrusRequestSpec = None  # type: ignore[assignment]

if musicbrainzngs:  # pragma: no branch
    musicbrainzngs.set_useragent("DownlowScript", "0.1", "admin@example.com")
    MusicBrainzRequestError = getattr(
        musicbrainzngs, "MusicBrainzRequestError", Exception
    )
else:  # pragma: no cover
    MusicBrainzRequestError = Exception

# Global relationship tracker for the current session
_CURRENT_RELATIONSHIP_TRACKER = FileRelationshipTracker()


def prepare_ffmpeg_metadata(payload: Optional[Dict[str, Any]]) -> Dict[str, str]:
    """Build an ffmpeg/mutagen metadata map from a payload."""
    if not isinstance(payload, dict):
        return {}
    metadata: Dict[str, str] = {}

    def set_field(key: str, raw: Any, limit: int = 2000) -> None:
        sanitized = sanitize_metadata_value(raw)
        if not sanitized:
            return
        if len(sanitized) > limit:
            sanitized = sanitized[:limit]
        metadata[key] = sanitized

    set_field("title", payload.get("title"))
    set_field("artist", payload.get("artist"), 512)
    set_field("album", payload.get("album"), 512)
    set_field("date", payload.get("year"), 20)
    comment = payload.get("comment")
    tags_value = payload.get("tag")
    tag_strings: List[str] = []
    artists_from_tags: List[str] = []
    albums_from_tags: List[str] = []
    genres_from_tags: List[str] = []
    if isinstance(tags_value, list):
        for raw_tag in tags_value:
            if raw_tag is None:
                continue
            if not isinstance(raw_tag, str):
                raw_tag = str(raw_tag)
            tag = raw_tag.strip()
            if not tag:
                continue
            tag_strings.append(tag)
            namespace, sep, value = tag.partition(":")
            if sep and value:
                ns = namespace.strip().lower()
                value = value.strip()
                if ns in {"artist", "creator", "author", "performer"}:
                    artists_from_tags.append(value)
                elif ns in {"album", "series", "collection", "group"}:
                    albums_from_tags.append(value)
                elif ns in {"genre", "rating"}:
                    genres_from_tags.append(value)
                elif ns in {"comment", "description"} and not comment:
                    comment = value
                elif ns in {"year", "date"} and not payload.get("year"):
                    set_field("date", value, 20)
            else:
                genres_from_tags.append(tag)
    if "artist" not in metadata and artists_from_tags:
        set_field(
            "artist",
            ", ".join(unique_preserve_order(artists_from_tags)[:3]),
            512,
        )
    if "album" not in metadata and albums_from_tags:
        set_field("album", unique_preserve_order(albums_from_tags)[0], 512)
    if genres_from_tags:
        set_field("genre", ", ".join(unique_preserve_order(genres_from_tags)[:5]), 256)
    if tag_strings:
        joined_tags = ", ".join(tag_strings[:50])
        set_field("keywords", joined_tags, 2000)
        if not comment:
            comment = joined_tags
    if comment:
        set_field("comment", comment, 2000)
        set_field("description", comment, 2000)
    return metadata
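
# Illustrative sketch of the mapping above (assumed inputs, not executed here):
#   prepare_ffmpeg_metadata({"title": "Song", "tag": ["artist:Foo", "genre:Jazz"]})
#   -> {"title": "Song", "artist": "Foo", "genre": "Jazz",
#       "keywords": "artist:Foo, genre:Jazz",
#       "comment": "artist:Foo, genre:Jazz",
#       "description": "artist:Foo, genre:Jazz"}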


def apply_mutagen_metadata(path: Path, metadata: Dict[str, str], fmt: str) -> None:
    """Best-effort metadata writing for audio containers."""
    if fmt != "audio":
        return
    if not metadata:
        return
    if mutagen is None:
        return
    try:
        audio = mutagen.File(path, easy=True)  # type: ignore[attr-defined]
    except Exception as exc:  # pragma: no cover - best effort only
        log(f"mutagen load failed: {exc}", file=sys.stderr)
        return
    if audio is None:
        return
    field_map = {
        "title": "title",
        "artist": "artist",
        "album": "album",
        "genre": "genre",
        "comment": "comment",
        "description": "comment",
        "date": "date",
    }
    changed = False
    for source_key, target_key in field_map.items():
        value = metadata.get(source_key)
        if not value:
            continue
        try:
            audio[target_key] = [value]
            changed = True
        except Exception:  # pragma: no cover
            continue
    if not changed:
        return
    try:
        audio.save()
    except Exception as exc:  # pragma: no cover
        log(f"mutagen save failed: {exc}", file=sys.stderr)


def build_ffmpeg_command(
    ffmpeg_path: str,
    input_path: Path,
    output_path: Path,
    fmt: str,
    max_width: int,
    metadata: Optional[Dict[str, str]] = None,
) -> List[str]:
    """Build an ffmpeg command line for common export formats."""
    cmd: List[str] = [ffmpeg_path, "-y", "-i", str(input_path)]
    if fmt in {"mp4", "webm"} and max_width and max_width > 0:
        cmd.extend(["-vf", f"scale='min({max_width},iw)':-2"])
    if metadata:
        for key, value in metadata.items():
            cmd.extend(["-metadata", f"{key}={value}"])
    # Video formats
    if fmt == "mp4":
        cmd.extend(
            [
                "-c:v", "libx265",
                "-preset", "medium",
                "-crf", "26",
                "-tag:v", "hvc1",
                "-pix_fmt", "yuv420p",
                "-c:a", "aac",
                "-b:a", "192k",
                "-movflags", "+faststart",
            ]
        )
    elif fmt == "webm":
        cmd.extend(
            [
                "-c:v", "libvpx-vp9",
                "-b:v", "0",
                "-crf", "32",
                "-c:a", "libopus",
                "-b:a", "160k",
            ]
        )
        cmd.extend(["-f", "webm"])
    # Audio formats
    elif fmt == "mp3":
        cmd.extend(["-vn", "-c:a", "libmp3lame", "-b:a", "192k"])
        cmd.extend(["-f", "mp3"])
    elif fmt == "flac":
        cmd.extend(["-vn", "-c:a", "flac"])
        cmd.extend(["-f", "flac"])
    elif fmt == "wav":
        cmd.extend(["-vn", "-c:a", "pcm_s16le"])
        cmd.extend(["-f", "wav"])
    elif fmt == "aac":
        cmd.extend(["-vn", "-c:a", "aac", "-b:a", "192k"])
        cmd.extend(["-f", "adts"])
    elif fmt == "m4a":
        cmd.extend(["-vn", "-c:a", "aac", "-b:a", "192k"])
        cmd.extend(["-f", "ipod"])
    elif fmt == "ogg":
        cmd.extend(["-vn", "-c:a", "libvorbis", "-b:a", "192k"])
        cmd.extend(["-f", "ogg"])
    elif fmt == "opus":
        cmd.extend(["-vn", "-c:a", "libopus", "-b:a", "192k"])
        cmd.extend(["-f", "opus"])
    elif fmt == "audio":
        # Legacy format name for mp3
        cmd.extend(["-vn", "-c:a", "libmp3lame", "-b:a", "192k"])
        cmd.extend(["-f", "mp3"])
    elif fmt != "copy":
        raise ValueError(f"Unsupported format: {fmt}")
    cmd.append(str(output_path))
    return cmd
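
# Illustrative sketch (assumed paths; mirrors the mp4 branch above):
#   build_ffmpeg_command("ffmpeg", Path("in.mkv"), Path("out.mp4"), "mp4", 1280)
#   -> ["ffmpeg", "-y", "-i", "in.mkv", "-vf", "scale='min(1280,iw)':-2",
#       "-c:v", "libx265", "-preset", "medium", "-crf", "26", "-tag:v", "hvc1",
#       "-pix_fmt", "yuv420p", "-c:a", "aac", "-b:a", "192k",
#       "-movflags", "+faststart", "out.mp4"]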


def field(obj: Any, name: str, value: Any = None) -> Any:
    """Get or set a field on a dict or object.

    Note: a value of None always reads; None cannot be assigned through this
    helper.
    """
    if value is None:
        if isinstance(obj, dict):
            return obj.get(name)
        return getattr(obj, name, None)
    if isinstance(obj, dict):
        obj[name] = value
    else:
        setattr(obj, name, value)
    return value


def _generate_hydrus_url_variants(url: str) -> List[str]:
    seen: Set[str] = set()
    variants: List[str] = []

    def push(candidate: Optional[str]) -> None:
        if not candidate:
            return
        text = candidate.strip()
        if not text or text in seen:
            return
        seen.add(text)
        variants.append(text)

    push(url)
    try:
        parsed = urlsplit(url)
    except Exception:
        return variants
    if parsed.scheme in {"http", "https"}:
        alternate_scheme = "https" if parsed.scheme == "http" else "http"
        push(
            urlunsplit(
                (alternate_scheme, parsed.netloc, parsed.path, parsed.query, parsed.fragment)
            )
        )
    normalised_netloc = parsed.netloc.lower()
    if normalised_netloc and normalised_netloc != parsed.netloc:
        push(
            urlunsplit(
                (parsed.scheme, normalised_netloc, parsed.path, parsed.query, parsed.fragment)
            )
        )
    if parsed.path:
        trimmed_path = parsed.path.rstrip("/")
        if trimmed_path != parsed.path:
            push(
                urlunsplit(
                    (parsed.scheme, parsed.netloc, trimmed_path, parsed.query, parsed.fragment)
                )
            )
        else:
            push(
                urlunsplit(
                    (parsed.scheme, parsed.netloc, parsed.path + "/", parsed.query, parsed.fragment)
                )
            )
        unquoted_path = unquote(parsed.path)
        if unquoted_path != parsed.path:
            push(
                urlunsplit(
                    (parsed.scheme, parsed.netloc, unquoted_path, parsed.query, parsed.fragment)
                )
            )
    if parsed.query or parsed.fragment:
        push(urlunsplit((parsed.scheme, parsed.netloc, parsed.path, "", "")))
        if parsed.path:
            unquoted_path = unquote(parsed.path)
            push(urlunsplit((parsed.scheme, parsed.netloc, unquoted_path, "", "")))
    return variants
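
# Illustrative sketch (hypothetical URL); variants preserve order and dedupe:
#   _generate_hydrus_url_variants("http://Example.com/path/")
#   -> ["http://Example.com/path/",   # original
#       "https://Example.com/path/",  # alternate scheme
#       "http://example.com/path/",   # lowercased host
#       "http://Example.com/path"]    # trailing slash trimmed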


def normalize_urls(value: Any) -> List[str]:
    """Normalize a URL field into a stable, deduplicated list.

    Accepts:
      - None
      - a single URL string (optionally containing multiple URLs)
      - a list/tuple/set of URL strings

    This helper is used by cmdlets/stores/pipeline to keep `url` consistent.
    """

    def _iter_raw_urls(raw: Any) -> Iterable[str]:
        if raw is None:
            return
        if isinstance(raw, str):
            text = raw.strip()
            if not text:
                return
            # Support legacy prefixes like "url:https://...".
            if text.lower().startswith("url:"):
                text = text.split(":", 1)[1].strip()
            # Prefer extracting obvious URLs to avoid splitting inside query strings.
            matches = re.findall(r"https?://[^\s,]+", text, flags=re.IGNORECASE)
            if matches:
                for m in matches:
                    yield m
                return
            # Fallback: split on commas/whitespace.
            for token in text.replace("\n", " ").replace("\r", " ").replace(",", " ").split():
                if token:
                    yield token
            return
        if isinstance(raw, (list, tuple, set)):
            for item in raw:
                if item is None:
                    continue
                if isinstance(item, str):
                    if item.strip():
                        yield item
                else:
                    text = str(item).strip()
                    if text:
                        yield text
            return
        # Last resort: string-coerce.
        text = str(raw).strip()
        if text:
            yield text

    def _canonicalize(url_text: str) -> Optional[str]:
        u = str(url_text or "").strip()
        if not u:
            return None
        # Trim common wrappers and trailing punctuation.
        u = u.strip("<>\"' ")
        u = u.rstrip(')].,;"')
        if not u:
            return None
        # IMPORTANT: URLs can be case-sensitive in the path/query on some hosts
        # (e.g., https://0x0.st/PzGY.webp). Do not lowercase or otherwise rewrite
        # the URL here; preserve exact casing and percent-encoding.
        return u

    seen: Set[str] = set()
    out: List[str] = []
    for raw_url in _iter_raw_urls(value):
        canonical = _canonicalize(raw_url)
        if not canonical:
            continue
        if canonical in seen:
            continue
        seen.add(canonical)
        out.append(canonical)
    return out
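
# Illustrative sketch (hypothetical URLs):
#   normalize_urls("url:https://a.example/x, https://b.example/y https://a.example/x")
#   -> ["https://a.example/x", "https://b.example/y"]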


def value_normalize(value: str) -> str:
    """Normalize whitespace: collapse internal spaces, strip, remove newlines."""
    value = value.replace("\n", " ").replace("\r", " ")
    value = re.sub(r"\s+", " ", value).strip()
    return value


def import_pending_sidecars(db_root: Path, db: Any) -> None:
    """Import pending sidecars (.tag/.metadata/.notes) into the database."""
    try:
        sidecar_patterns = ["**/*.tag", "**/*.metadata", "**/*.notes"]
        for pattern in sidecar_patterns:
            for sidecar_path in db_root.glob(pattern):
                if ".downlow" in sidecar_path.parts:
                    continue
                try:
                    base_path = sidecar_path.with_suffix("")
                except Exception:
                    continue
                if not base_path.exists():
                    continue
                # Ensure a file entry exists for the sidecar's media file.
                file_id: Optional[int] = None
                try:
                    cursor = db.connection.cursor() if db.connection else None
                    if cursor:
                        cursor.execute(
                            "SELECT id FROM files WHERE file_path = ?",
                            (str(base_path),),
                        )
                        result = cursor.fetchone()
                        file_id = result[0] if result else None
                except Exception:
                    file_id = None
                if not file_id:
                    try:
                        cursor = db.connection.cursor() if db.connection else None
                        if cursor:
                            cursor.execute(
                                'INSERT INTO files (file_path, indexed_at, updated_at) VALUES (?, datetime("now"), datetime("now"))',
                                (str(base_path),),
                            )
                            db.connection.commit()
                            file_id = cursor.lastrowid
                    except Exception:
                        continue
                if not file_id:
                    continue
                if sidecar_path.suffix == ".tag":
                    try:
                        content = sidecar_path.read_text(encoding="utf-8")
                    except Exception:
                        continue
                    tags = [
                        line.strip() for line in content.splitlines() if line.strip()
                    ]
                    if tags:
                        try:
                            cursor = db.connection.cursor() if db.connection else None
                            if cursor:
                                file_hash_value: Optional[str] = None
                                if hasattr(db, "get_file_hash"):
                                    try:
                                        file_hash_value = db.get_file_hash(file_id)
                                    except Exception:
                                        file_hash_value = None
                                for tag in tags:
                                    cursor.execute(
                                        "INSERT OR IGNORE INTO tags (hash, tag) VALUES (?, ?)",
                                        (file_hash_value, tag),
                                    )
                                db.connection.commit()
                        except Exception:
                            pass
                elif sidecar_path.suffix == ".metadata":
                    url: List[str] = []
                    relationships: List[str] = []
                    hash_value: Optional[str] = None
                    try:
                        content = sidecar_path.read_text(encoding="utf-8")
                    except Exception:
                        content = ""
                    for raw_line in content.splitlines():
                        line = raw_line.strip()
                        if not line or line.startswith("#"):
                            continue
                        lower = line.lower()
                        if lower.startswith("hash:"):
                            hash_value = line.split(":", 1)[1].strip() or None
                        elif lower.startswith("url:"):
                            url_part = line.split(":", 1)[1].strip()
                            if url_part:
                                for url_segment in url_part.replace(",", " ").split():
                                    clean = url_segment.strip()
                                    if clean and clean not in url:
                                        url.append(clean)
                        elif lower.startswith("relationship:"):
                            rel_value = line.split(":", 1)[1].strip()
                            if rel_value:
                                relationships.append(rel_value)
                    if sha256_file and base_path.exists():
                        try:
                            hash_value = sha256_file(base_path)
                        except Exception:
                            pass
                    try:
                        cursor = db.connection.cursor() if db.connection else None
                        if cursor:
                            cursor.execute(
                                'INSERT OR REPLACE INTO metadata (file_id, hash, url, relationships, time_imported, time_modified) VALUES (?, ?, ?, ?, datetime("now"), datetime("now"))',
                                (
                                    file_id,
                                    hash_value,
                                    json.dumps(url),
                                    json.dumps(relationships),
                                ),
                            )
                            db.connection.commit()
                    except Exception:
                        pass
                elif sidecar_path.suffix == ".notes":
                    try:
                        content = sidecar_path.read_text(encoding="utf-8").strip()
                    except Exception:
                        content = ""
                    if content:
                        try:
                            cursor = db.connection.cursor() if db.connection else None
                            if cursor:
                                cursor.execute(
                                    'INSERT INTO notes (file_id, note, created_at, updated_at) VALUES (?, ?, datetime("now"), datetime("now")) ON CONFLICT(file_id) DO UPDATE SET note = excluded.note, updated_at = datetime("now")',
                                    (file_id, content),
                                )
                                db.connection.commit()
                        except Exception:
                            pass
    except Exception:
        pass


def _extract_from_sequence(values: Sequence) -> Iterable[str]:
    """Extract string values from a sequence of mixed types (dicts, strings, etc.)."""
    seen = set()
    for item in values:
        candidate = None
        if isinstance(item, dict):
            candidate = (
                item.get("name") or item.get("title") or item.get("value")
                or item.get("text") or item.get("id") or item.get("imdb_id")
            )
        else:
            candidate = str(item)
        if candidate:
            normalized = value_normalize(str(candidate))
            if normalized and normalized not in seen:
                seen.add(normalized)
                yield normalized


def _add_tag(tags: List[str], namespace: str, value: Optional[str]) -> None:
    """Add a single namespaced tag (e.g., 'artist:Beatles')."""
    if not value:
        return
    value = value_normalize(str(value))
    if not value:
        return
    tags.append(f"{namespace}:{value}")


def _extend_tags(tags: List[str], namespace: str, values) -> None:
    """Extend tags from a single value or a sequence, applying the namespace."""
    if not values:
        return
    if isinstance(values, set):
        values = list(values)
    if isinstance(values, (list, tuple)):
        for candidate in _extract_from_sequence(values):
            _add_tag(tags, namespace, candidate)
    else:
        _add_tag(tags, namespace, values)


def imdb_tag(imdb_id: str) -> Dict[str, object]:
    movie = get_movie(imdb_id)
    if movie is None:
        raise ValueError(f"IMDb title not found: {imdb_id}")
    if hasattr(movie, "model_dump"):
        info = movie.model_dump()
    elif hasattr(movie, "dict"):
        info = movie.dict()
    else:
        info = {}
    tags: List[str] = []
    canonical_id = getattr(movie, "imdb_id", None) or info.get("imdb_id") or imdb_id
    if canonical_id:
        canonical_id = str(canonical_id).strip().lower()
        if not canonical_id.startswith("tt"):
            canonical_id = f"tt{canonical_id}"
    else:
        canonical_id = imdb_id.lower()
        if not canonical_id.startswith("tt"):
            canonical_id = f"tt{canonical_id}"
    _add_tag(tags, "imdb", canonical_id)
    _add_tag(tags, "title", info.get("title") or getattr(movie, "title", None))
    _add_tag(
        tags,
        "year",
        info.get("year") or info.get("start_year") or getattr(movie, "year", None),
    )
    _add_tag(tags, "rating", info.get("rating"))
    runtime_value = None
    if isinstance(info.get("runtime"), (str, int)):
        runtime_value = info["runtime"]
    elif isinstance(info.get("runtimes"), (list, tuple)) and info["runtimes"]:
        runtime_value = info["runtimes"][0]
    elif info.get("duration"):
        runtime_value = info["duration"]
    _add_tag(tags, "runtime", runtime_value)
    kind = None
    if hasattr(movie, "is_series") and movie.is_series():
        kind = "series"
    elif hasattr(movie, "is_episode") and movie.is_episode():
        kind = "episode"
    else:
        kind = info.get("kind") or "movie"
    _add_tag(tags, "kind", kind)
    _extend_tags(tags, "genre", info.get("genres") or info.get("genre"))
    _extend_tags(tags, "language", info.get("languages"))
    _extend_tags(tags, "country", info.get("countries"))
    creators = (
        info.get("directors") or info.get("director") or info.get("producers")
        or info.get("writers")
    )
    if creators:
        _extend_tags(tags, "creator", creators)
    info_episode = getattr(movie, "info_episode", None)
    series_title = None
    season = info.get("season") or info.get("series_season")
    episode = info.get("episode") or info.get("series_episode")
    if info_episode:
        if hasattr(info_episode, "model_dump"):
            episode_meta = info_episode.model_dump()
        elif hasattr(info_episode, "dict"):
            episode_meta = info_episode.dict()
        else:
            episode_meta = getattr(info_episode, "__dict__", {}) or {}
        season = season or episode_meta.get("season") or episode_meta.get("season_n")
        episode = episode or episode_meta.get("episode") or episode_meta.get("episode_n")
        series_title = episode_meta.get("series_title")
        if not series_title:
            series_title = getattr(getattr(movie, "series_info", None), "title", None)
    if kind == "episode" and not season:
        season = getattr(getattr(movie, "series_info", None), "season", None)
    if season:
        _add_tag(tags, "season", season)
    if episode:
        _add_tag(tags, "episode", episode)
    series_title = (
        series_title or info.get("series_title") or info.get("series")
        or getattr(getattr(movie, "series_info", None), "title", None)
    )
    if series_title:
        _add_tag(tags, "series", series_title)
    summary = info.get("plot outline") or info.get("plot_outline") or info.get("plot")
    if isinstance(summary, (list, tuple)):
        summary = summary[0] if summary else None
    if not summary and hasattr(movie, "plot_outline"):
        summary = getattr(movie, "plot_outline")
    if not summary:
        summaries = info.get("summaries")
        if isinstance(summaries, (list, tuple)) and summaries:
            summary = summaries[0]
    if summary:
        _add_tag(tags, "summary", summary)
    cast_sources = (
        info.get("cast") or info.get("actors") or info.get("cast_members")
        or info.get("stars")
    )
    cast_names: List[str] = []
    if cast_sources:
        for name in _extract_from_sequence(cast_sources):
            if name:
                cast_names.append(name)
            if len(cast_names) >= 10:
                break
    if cast_names:
        _extend_tags(tags, "cast", cast_names)
    return {
        "source": "imdb",
        "id": canonical_id,
        "tag": tags,
    }
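
# Shape sketch (values depend on the imdbinfo lookup; id shown is the canonical form):
#   imdb_tag("tt0111161")
#   -> {"source": "imdb", "id": "tt0111161",
#       "tag": ["imdb:tt0111161", "title:...", "year:...", ...]}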


def fetch_musicbrainz_tags(mbid: str, entity: str) -> Dict[str, object]:
    if not musicbrainzngs:
        raise RuntimeError("musicbrainzngs package is not available")
    entity = entity.lower()
    if entity not in {"release", "recording", "artist"}:
        raise ValueError(f"Unsupported MusicBrainz entity: {entity}")

    def _fetch_with_fallback(getter, key: str, includes: List[str]):
        try:
            return getter(mbid, includes=includes)[key]
        except MusicBrainzRequestError as exc:
            if "Bad includes" in str(exc) and "genres" in includes:
                fallback = [inc for inc in includes if inc != "genres"]
                return getter(mbid, includes=fallback)[key]
            raise

    include = ["tags", "genres"]
    match entity:
        case "release":
            include.extend(["artist-credits", "release-groups"])
            data = _fetch_with_fallback(
                musicbrainzngs.get_release_by_id, "release", include
            )
        case "recording":
            include.extend(["artists", "releases"])
            data = _fetch_with_fallback(
                musicbrainzngs.get_recording_by_id, "recording", include
            )
        case _:
            include.extend(["release-groups", "aliases"])
            data = _fetch_with_fallback(
                musicbrainzngs.get_artist_by_id, "artist", include
            )
    tags: List[str] = []
    _add_tag(tags, "musicbrainz", mbid)
    _add_tag(tags, "entity", entity)
    _add_tag(tags, "title", data.get("title"))
    if entity != "artist":
        date = data.get("date") or data.get("first-release-date")
        if date:
            _add_tag(tags, "date", date)
            _add_tag(tags, "year", date[:4])
        if data.get("country"):
            _add_tag(tags, "country", data["country"])
        if data.get("status"):
            _add_tag(tags, "status", data["status"])
    artist_credit = data.get("artist-credit") or data.get("artists")
    if artist_credit:
        names = []
        for item in artist_credit:
            if isinstance(item, dict):
                name = item.get("name") or item.get("artist", {}).get("name")
                if name:
                    names.append(name)
        _extend_tags(tags, "artist", names)
    tag_list = data.get("tag-list") or data.get("tags") or []
    for tag in tag_list:
        if isinstance(tag, dict) and tag.get("name"):
            _add_tag(tags, "tag", tag["name"])
    genre_list = data.get("genre-list") or data.get("genres") or []
    for genre in genre_list:
        if isinstance(genre, dict) and genre.get("name"):
            _add_tag(tags, "genre", genre["name"])
    return {
        "source": "musicbrainz",
        "id": mbid,
        "tag": tags,
        "entity": entity,
    }


def _append_unique(target: List[str], seen: Set[str], value: Optional[str]) -> None:
    """Append a single value if not already in the seen set (deduplication)."""
    if value is None:
        return
    normalized = value_normalize(str(value))
    if not normalized or normalized in seen:
        return
    seen.add(normalized)
    target.append(normalized)


def _extend_namespaced(
    target: List[str],
    seen: Set[str],
    namespace: str,
    values: Iterable[Optional[str]],
) -> None:
    """Append namespaced values if not already in the seen set."""
    for val in values:
        if val:
            _append_unique(target, seen, f"{namespace}:{val}")


def _coerce_duration(metadata: Dict[str, Any]) -> Optional[float]:
    for key in ("duration", "duration_seconds", "length", "duration_sec"):
        value = metadata.get(key)
        if value is None:
            continue
        if isinstance(value, (int, float)):
            if value > 0:
                return float(value)
        elif isinstance(value, str):
            try:
                candidate = float(value.strip())
            except ValueError:
                continue
            if candidate > 0:
                return candidate
    return None
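
# Illustrative sketch: the first positive duration-like key wins.
#   _coerce_duration({"length": "125.5"})  -> 125.5
#   _coerce_duration({"duration": 0})      -> None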


def _sanitize_url(value: Optional[str]) -> Optional[str]:
    """Sanitize a URL: normalize whitespace and strip any ytdl:// prefix."""
    if value is None:
        return None
    cleaned = value_normalize(str(value))
    if not cleaned:
        return None
    if cleaned.lower().startswith("ytdl://"):
        cleaned = cleaned[7:]
    return cleaned


def _clean_existing_tags(existing: Any) -> List[str]:
    tags: List[str] = []
    seen: Set[str] = set()
    if isinstance(existing, (list, tuple, set)):
        iterable = existing
    elif existing is None:
        iterable = []
    else:
        iterable = [existing]
    for tag in iterable:
        _append_unique(tags, seen, tag)
    return tags


def _should_fetch_url(url: Optional[str]) -> bool:
    if not url or not isinstance(url, str):
        return False
    return url.lower().startswith(("http://", "https://"))


def fetch_remote_metadata(
    url: str, options: Dict[str, Any]
) -> Tuple[Optional[Dict[str, Any]], List[str]]:
    warnings: List[str] = []
    info: Optional[Dict[str, Any]] = None
    if yt_dlp is not None:
        try:  # pragma: no cover - depends on runtime availability
            ydl_opts = {
                "quiet": True,
                "no_warnings": True,
                "skip_download": True,
                "noplaylist": True,
            }
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:  # type: ignore[attr-defined]
                info_dict = ydl.extract_info(url, download=False)
                if info_dict is not None:
                    info = dict(info_dict)
        except Exception as exc:  # pragma: no cover - best effort
            warnings.append(f"yt_dlp extract failed: {exc}")
    if info is None:
        executable = str(options.get("ytdlp_path") or "yt-dlp")
        extra_args = options.get("ytdlp_args") or []
        if isinstance(extra_args, (str, bytes)):
            extra_args = [extra_args]
        cmd = [
            executable,
            "--dump-single-json",
            "--no-playlist",
            "--skip-download",
            "--no-warnings",
        ]
        cmd.extend(str(arg) for arg in extra_args)
        cmd.append(url)
        timeout = float(options.get("timeout") or 45.0)
        try:
            completed = subprocess.run(
                cmd, capture_output=True, text=True, check=False, timeout=timeout
            )
        except Exception as exc:  # pragma: no cover - subprocess failure
            warnings.append(f"yt-dlp invocation failed: {exc}")
            return None, warnings
        if completed.returncode != 0:
            message = (
                completed.stderr.strip() or completed.stdout.strip()
                or f"status {completed.returncode}"
            )
            warnings.append(message)
            return None, warnings
        try:
            info = json.loads(completed.stdout)
        except json.JSONDecodeError as exc:  # pragma: no cover - parse failure
            warnings.append(f"invalid JSON from yt-dlp: {exc}")
            return None, warnings
    if isinstance(info, dict) and "entries" in info:
        entries = info.get("entries")
        if isinstance(entries, list) and entries:
            info = entries[0]
    if isinstance(info, dict):
        info.setdefault("source_url", url)
    return info if isinstance(info, dict) else None, warnings
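
# Options sketch (keys read above; the values here are assumptions):
#   info, warnings = fetch_remote_metadata(
#       "https://example.com/watch?v=abc",
#       {"ytdlp_path": "yt-dlp", "ytdlp_args": [], "timeout": 30},
#   )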


def resolve_remote_metadata(payload: Dict[str, Any]) -> Dict[str, Any]:
    options_raw = payload.get("options")
    options: Dict[str, Any] = options_raw if isinstance(options_raw, dict) else {}
    source_url = payload.get("source_url")
    sanitized = _sanitize_url(source_url) or source_url
    existing_tags = _clean_existing_tags(payload.get("existing_tags"))
    metadata_sources: List[Dict[str, Any]] = []
    for key in ("metadata", "mpv_metadata", "remote_metadata", "info"):
        candidate = payload.get(key)
        if isinstance(candidate, dict):
            metadata_sources.append(candidate)
    remote_info: Optional[Dict[str, Any]] = None
    warnings: List[str] = []
    if not options.get("no_fetch"):
        fetch_url = sanitized
        if _should_fetch_url(fetch_url):
            remote_info, fetch_warnings = fetch_remote_metadata(fetch_url or "", options)
            warnings.extend(fetch_warnings)
            if remote_info:
                metadata_sources.append(remote_info)
    combined_metadata = {}
    for source in metadata_sources:
        if isinstance(source, dict):
            combined_metadata.update(source)
    context = {"source_url": sanitized}
    bundle = build_remote_bundle(combined_metadata, existing_tags, context)
    merged_metadata = {**combined_metadata, **(bundle.get("metadata") or {})}
    bundle["metadata"] = merged_metadata
    if not bundle.get("source_url"):
        bundle["source_url"] = sanitized
    mpv_meta_candidate = payload.get("mpv_metadata")
    mpv_metadata = mpv_meta_candidate if isinstance(mpv_meta_candidate, dict) else None
    result_tags = bundle.get("tags") or existing_tags
    result = {
        "source": "remote-metadata",
        "id": sanitized or "unknown",
        "tags": result_tags,
        "title": bundle.get("title"),
        "source_url": bundle.get("source_url") or sanitized,
        "duration": bundle.get("duration"),
        "metadata": merged_metadata,
        "remote_metadata": remote_info,
        "warnings": warnings,
        "mpv_metadata": mpv_metadata,
    }
    return result


def _ensure_hydrus_client() -> None:
    if HydrusClient is None or HydrusRequestSpec is None:  # pragma: no cover - depends on optional module
        raise RuntimeError("Hydrus helpers are unavailable")


def _normalize_hash(value: Any) -> str:
    candidate = str(value or "").strip().lower()
    if not candidate:
        raise ValueError("Hydrus hash is required")
    if len(candidate) != 64 or any(ch not in "0123456789abcdef" for ch in candidate):
        raise ValueError("Hydrus hash must be a 64-character hex string")
    return candidate


def _normalize_tag(tag: Any) -> Optional[str]:
    if tag is None:
        return None
    if isinstance(tag, str):
        candidate = tag.strip()
    else:
        candidate = str(tag).strip()
    return candidate or None
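
# Illustrative sketch:
#   _normalize_hash("F" * 64)  -> "f" * 64 (lowercased)
#   _normalize_hash("abc")     raises ValueError (not 64 hex chars)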


def _extract_tag_services(entry: Dict[str, Any]) -> List[Dict[str, Any]]:
    tags_section = entry.get("tags")
    services: List[Dict[str, Any]] = []
    if not isinstance(tags_section, dict):
        return services
    names_map = tags_section.get("service_keys_to_names")
    if not isinstance(names_map, dict):
        names_map = {}

    def get_record(service_key: Optional[str], service_name: Optional[str]) -> Dict[str, Any]:
        key_lower = service_key.lower() if isinstance(service_key, str) else None
        name_lower = service_name.lower() if isinstance(service_name, str) else None
        for record in services:
            existing_key = record.get("service_key")
            if key_lower and isinstance(existing_key, str) and existing_key.lower() == key_lower:
                if service_name and not record.get("service_name"):
                    record["service_name"] = service_name
                return record
            existing_name = record.get("service_name")
            if name_lower and isinstance(existing_name, str) and existing_name.lower() == name_lower:
                if service_key and not record.get("service_key"):
                    record["service_key"] = service_key
                return record
        record = {
            "service_key": service_key,
            "service_name": service_name,
            "tags": [],
        }
        services.append(record)
        return record

    def _iter_current_status_lists(container: Any) -> Iterable[List[Any]]:
        if isinstance(container, dict):
            for status_key, tags_list in container.items():
                if str(status_key) != "0":
                    continue
                if isinstance(tags_list, list):
                    yield tags_list
        elif isinstance(container, list):
            yield container

    statuses_map = tags_section.get("service_keys_to_statuses_to_tags")
    if isinstance(statuses_map, dict):
        for service_key, status_map in statuses_map.items():
            record = get_record(
                service_key if isinstance(service_key, str) else None,
                names_map.get(service_key),
            )
            for tags_list in _iter_current_status_lists(status_map):
                for tag in tags_list:
                    normalized = _normalize_tag(tag)
                    if normalized:
                        record["tags"].append(normalized)
    ignored_keys = {
        "service_keys_to_statuses_to_tags",
        "service_keys_to_statuses_to_display_tags",
        "service_keys_to_display_friendly_tags",
        "service_keys_to_names",
        "tag_display_types_to_namespaces",
        "namespace_display_string_lookup",
        "tag_display_decoration_colour_lookup",
    }
    for key, service in tags_section.items():
        if key in ignored_keys:
            continue
        if isinstance(service, dict):
            service_key = service.get("service_key") or (key if isinstance(key, str) else None)
            service_name = (
                service.get("service_name") or service.get("name")
                or names_map.get(service_key)
            )
            record = get_record(
                service_key if isinstance(service_key, str) else None,
                service_name,
            )
            storage = (
                service.get("storage_tags") or service.get("statuses_to_tags")
                or service.get("tags")
            )
            if isinstance(storage, dict):
                for tags_list in _iter_current_status_lists(storage):
                    for tag in tags_list:
                        normalized = _normalize_tag(tag)
                        if normalized:
                            record["tags"].append(normalized)
            elif isinstance(storage, list):
                for tag in storage:
                    normalized = _normalize_tag(tag)
                    if normalized:
                        record["tags"].append(normalized)
    # Use the canonical dedup function.
    for record in services:
        record["tags"] = dedup_tags_by_namespace(record["tags"], keep_first=True)
    return services


def _select_primary_tags(
    services: List[Dict[str, Any]],
    aggregated: List[str],
    prefer_service: Optional[str],
) -> Tuple[Optional[str], List[str]]:
    prefer_lower = prefer_service.lower() if isinstance(prefer_service, str) else None
    if prefer_lower:
        for record in services:
            name = record.get("service_name")
            if isinstance(name, str) and name.lower() == prefer_lower and record["tags"]:
                return record.get("service_key"), record["tags"]
    for record in services:
        if record["tags"]:
            return record.get("service_key"), record["tags"]
    return None, aggregated


def _derive_title(
    tags_primary: List[str],
    tags_aggregated: List[str],
    entry: Dict[str, Any],
) -> Optional[str]:
    for source in (tags_primary, tags_aggregated):
        for tag in source:
            namespace, sep, value = tag.partition(":")
            if sep and namespace and namespace.lower() == "title":
                cleaned = value.strip()
                if cleaned:
                    return cleaned
    for key in (
        "title",
        "display_name",
        "pretty_name",
        "original_display_filename",
        "original_filename",
    ):
        value = entry.get(key)
        if isinstance(value, str):
            cleaned = value.strip()
            if cleaned:
                return cleaned
    return None


def _derive_clip_time(
    tags_primary: List[str],
    tags_aggregated: List[str],
    entry: Dict[str, Any],
) -> Optional[str]:
    namespaces = {"clip", "clip_time", "cliptime"}
    for source in (tags_primary, tags_aggregated):
        for tag in source:
            namespace, sep, value = tag.partition(":")
            if sep and namespace and namespace.lower() in namespaces:
                cleaned = value.strip()
                if cleaned:
                    return cleaned
    clip_value = entry.get("clip_time")
    if isinstance(clip_value, str):
        cleaned_clip = clip_value.strip()
        if cleaned_clip:
            return cleaned_clip
    return None


def _summarize_hydrus_entry(
    entry: Dict[str, Any],
    prefer_service: Optional[str],
) -> Tuple[Dict[str, Any], List[str], Optional[str], Optional[str], Optional[str]]:
    services = _extract_tag_services(entry)
    aggregated: List[str] = []
    seen: Set[str] = set()
    for record in services:
        for tag in record["tags"]:
            if tag not in seen:
                seen.add(tag)
                aggregated.append(tag)
    service_key, primary_tags = _select_primary_tags(services, aggregated, prefer_service)
    title = _derive_title(primary_tags, aggregated, entry)
    clip_time = _derive_clip_time(primary_tags, aggregated, entry)
    summary = dict(entry)
    if title and not summary.get("title"):
        summary["title"] = title
    if clip_time and not summary.get("clip_time"):
        summary["clip_time"] = clip_time
    summary["tag_service_key"] = service_key
    summary["has_current_file_service"] = _has_current_file_service(entry)
    if "is_local" not in summary:
        summary["is_local"] = bool(entry.get("is_local"))
    return summary, primary_tags, service_key, title, clip_time


def _looks_like_hash(value: Any) -> bool:
    if not isinstance(value, str):
        return False
    candidate = value.strip().lower()
    return len(candidate) == 64 and all(ch in "0123456789abcdef" for ch in candidate)


def _collect_relationship_hashes(payload: Any, accumulator: Set[str]) -> None:
    if isinstance(payload, dict):
        for value in payload.values():
            _collect_relationship_hashes(value, accumulator)
    elif isinstance(payload, (list, tuple, set)):
        for value in payload:
            _collect_relationship_hashes(value, accumulator)
    elif isinstance(payload, str) and _looks_like_hash(payload):
        accumulator.add(payload)


def _build_hydrus_query(
    hashes: Optional[Sequence[str]],
    file_ids: Optional[Sequence[int]],
    include_relationships: bool,
    minimal: bool,
) -> Dict[str, str]:
    query: Dict[str, str] = {}
    if hashes:
        query["hashes"] = json.dumps(list(hashes))
    if file_ids:
        query["file_ids"] = json.dumps([int(value) for value in file_ids])
    if not query:
        raise ValueError("hashes or file_ids must be provided")
    query["include_service_keys_to_tags"] = json.dumps(True)
    query["include_tag_services"] = json.dumps(True)
    query["include_file_services"] = json.dumps(True)
    if include_relationships:
        query["include_file_relationships"] = json.dumps(True)
    if not minimal:
        extras = (
            "include_url",
            "include_size",
            "include_width",
            "include_height",
            "include_duration",
            "include_mime",
            "include_has_audio",
            "include_is_trashed",
        )
        for key in extras:
            query[key] = json.dumps(True)
    return query
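
# Illustrative sketch (hypothetical digest): a minimal query for one hash.
#   _build_hydrus_query(["a" * 64], None, include_relationships=False, minimal=True)
#   -> {"hashes": json.dumps(["a" * 64]),
#       "include_service_keys_to_tags": "true",
#       "include_tag_services": "true",
#       "include_file_services": "true"}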


def _fetch_hydrus_entries(
    client: Any,
    hashes: Optional[Sequence[str]],
    file_ids: Optional[Sequence[int]],
    include_relationships: bool,
    minimal: bool,
) -> List[Dict[str, Any]]:
    if not hashes and not file_ids:
        return []
    assert HydrusRequestSpec is not None
    spec = HydrusRequestSpec(
        method="GET",
        endpoint="/get_files/file_metadata",
        query=_build_hydrus_query(hashes, file_ids, include_relationships, minimal),
    )
    response = client._perform_request(spec)  # type: ignore[attr-defined]
    metadata = response.get("metadata") if isinstance(response, dict) else None
    if isinstance(metadata, list):
        return [entry for entry in metadata if isinstance(entry, dict)]
    return []


def _has_current_file_service(entry: Dict[str, Any]) -> bool:
    services = entry.get("file_services")
    if not isinstance(services, dict):
        return False
    current = services.get("current")
    if isinstance(current, dict):
        for value in current.values():
            if value:
                return True
        return False
    if isinstance(current, list):
        return len(current) > 0
    return False


def _compute_file_flags(entry: Dict[str, Any]) -> Tuple[bool, bool, bool]:
    mime = entry.get("mime")
    mime_lower = mime.lower() if isinstance(mime, str) else ""
    is_video = mime_lower.startswith("video/")
    is_audio = mime_lower.startswith("audio/")
    is_deleted = False
    if entry.get("is_trashed"):
        is_deleted = True
    file_services = entry.get("file_services")
    if not is_deleted and isinstance(file_services, dict):
        deleted = file_services.get("deleted")
        if isinstance(deleted, dict) and deleted:
            is_deleted = True
    return is_video, is_audio, is_deleted


def fetch_hydrus_metadata(payload: Dict[str, Any]) -> Dict[str, Any]:
    _ensure_hydrus_client()
    assert HydrusClient is not None
    hash_hex = None
    raw_hash_value = payload.get("hash")
    if raw_hash_value is not None:
        hash_hex = _normalize_hash(raw_hash_value)
    file_ids: List[int] = []
    raw_file_ids = payload.get("file_ids")
    if isinstance(raw_file_ids, (list, tuple, set)):
        for value in raw_file_ids:
            try:
                file_ids.append(int(value))
            except (TypeError, ValueError):
                continue
    elif raw_file_ids is not None:
        try:
            file_ids.append(int(raw_file_ids))
        except (TypeError, ValueError):
            file_ids = []
    raw_file_id = payload.get("file_id")
    if raw_file_id is not None:
        try:
            coerced = int(raw_file_id)
        except (TypeError, ValueError):
            coerced = None
        if coerced is not None and coerced not in file_ids:
            file_ids.append(coerced)
    base_url = str(payload.get("api_url") or "").strip()
    if not base_url:
        raise ValueError("Hydrus api_url is required")
    access_key = str(payload.get("access_key") or "").strip()
    options_raw = payload.get("options")
    options = options_raw if isinstance(options_raw, dict) else {}
    prefer_service = options.get("prefer_service_name")
    if isinstance(prefer_service, str):
        prefer_service = prefer_service.strip()
    else:
        prefer_service = None
    include_relationships = bool(options.get("include_relationships"))
    minimal = bool(options.get("minimal"))
    timeout = float(options.get("timeout") or 60.0)
    client = HydrusClient(base_url, access_key, timeout)
    hashes: Optional[List[str]] = None
    if hash_hex:
        hashes = [hash_hex]
    if not hashes and not file_ids:
        raise ValueError("Hydrus hash or file id is required")
    try:
        entries = _fetch_hydrus_entries(
            client, hashes, file_ids or None, include_relationships, minimal
        )
    except HydrusRequestError as exc:  # type: ignore[misc]
        raise RuntimeError(str(exc))
    if not entries:
        response: Dict[str, Any] = {
            "hash": hash_hex,
            "metadata": {},
            "tags": [],
            "warnings": [f"No Hydrus metadata for {hash_hex or file_ids}"],
            "error": "not_found",
        }
        if file_ids:
            response["file_id"] = file_ids[0]
        return response
    entry = entries[0]
    if not hash_hex:
        entry_hash = entry.get("hash")
        if isinstance(entry_hash, str) and entry_hash:
            hash_hex = entry_hash
            hashes = [hash_hex]
    summary, primary_tags, service_key, title, clip_time = _summarize_hydrus_entry(
        entry, prefer_service
    )
    is_video, is_audio, is_deleted = _compute_file_flags(entry)
    has_current_file_service = _has_current_file_service(entry)
    is_local = bool(entry.get("is_local"))
    size_bytes = entry.get("size") or entry.get("file_size")
    filesize_mb = None
    if isinstance(size_bytes, (int, float)) and size_bytes > 0:
        filesize_mb = float(size_bytes) / (1024.0 * 1024.0)
    duration = entry.get("duration")
    if duration is None and isinstance(entry.get("duration_ms"), (int, float)):
        duration = float(entry["duration_ms"]) / 1000.0
    warnings: List[str] = []
    if not primary_tags:
        warnings.append("No tags returned for preferred service")
    relationships = None
    relationship_metadata: Dict[str, Dict[str, Any]] = {}
    if include_relationships and hash_hex:
        try:
            assert HydrusRequestSpec is not None
            rel_spec = HydrusRequestSpec(
                method="GET",
                endpoint="/manage_file_relationships/get_file_relationships",
                query={"hash": hash_hex},
            )
            relationships = client._perform_request(rel_spec)  # type: ignore[attr-defined]
        except HydrusRequestError as exc:  # type: ignore[misc]
            warnings.append(f"Relationship lookup failed: {exc}")
            relationships = None
    if isinstance(relationships, dict):
        related_hashes: Set[str] = set()
        _collect_relationship_hashes(relationships, related_hashes)
        related_hashes.discard(hash_hex)
        if related_hashes:
            try:
                related_entries = _fetch_hydrus_entries(
                    client, sorted(related_hashes), None, False, True
                )
            except HydrusRequestError as exc:  # type: ignore[misc]
                warnings.append(f"Relationship metadata fetch failed: {exc}")
            else:
                for rel_entry in related_entries:
                    rel_hash = rel_entry.get("hash")
                    if not isinstance(rel_hash, str):
                        continue
                    rel_summary, rel_tags, _, rel_title, rel_clip = _summarize_hydrus_entry(
                        rel_entry, prefer_service
                    )
                    rel_summary["tags"] = rel_tags
                    if rel_title:
                        rel_summary["title"] = rel_title
                    if rel_clip:
                        rel_summary["clip_time"] = rel_clip
                    relationship_metadata[rel_hash] = rel_summary
    result: Dict[str, Any] = {
        "hash": entry.get("hash") or hash_hex,
        "metadata": summary,
        "tags": primary_tags,
        "tag_service_key": service_key,
        "title": title,
        "clip_time": clip_time,
        "duration": duration,
        "filesize_mb": filesize_mb,
        "is_video": is_video,
        "is_audio": is_audio,
        "is_deleted": is_deleted,
        "is_local": is_local,
        "has_current_file_service": has_current_file_service,
        "matched_hash": entry.get("hash") or hash_hex,
        "swap_recommended": False,
    }
    file_id_value = entry.get("file_id")
    if isinstance(file_id_value, (int, float)):
        result["file_id"] = int(file_id_value)
    if relationships is not None:
        result["relationships"] = relationships
    if relationship_metadata:
        result["relationship_metadata"] = relationship_metadata
    if warnings:
        result["warnings"] = warnings
    return result
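
# Payload sketch (illustrative values; the option keys mirror those read above).
# The api_url and access_key shown are assumptions, not project defaults:
#   fetch_hydrus_metadata({
#       "hash": "a" * 64,
#       "api_url": "http://127.0.0.1:45869",
#       "access_key": "<access key>",
#       "options": {"prefer_service_name": "my tags", "include_relationships": True},
#   })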


def fetch_hydrus_metadata_by_url(payload: Dict[str, Any]) -> Dict[str, Any]:
    _ensure_hydrus_client()
    assert HydrusClient is not None
    raw_url = payload.get("url") or payload.get("source_url")
    url = str(raw_url or "").strip()
    if not url:
        raise ValueError("URL is required to fetch Hydrus metadata by URL")
    base_url = str(payload.get("api_url") or "").strip()
    if not base_url:
        raise ValueError("Hydrus api_url is required")
    access_key = str(payload.get("access_key") or "").strip()
    options_raw = payload.get("options")
    options = options_raw if isinstance(options_raw, dict) else {}
    timeout = float(options.get("timeout") or 60.0)
    client = HydrusClient(base_url, access_key, timeout)
    hashes: Optional[List[str]] = None
    file_ids: Optional[List[int]] = None
    matched_url = None
    normalised_reported = None
    seen: Set[str] = set()
    queue = deque()
    for variant in _generate_hydrus_url_variants(url):
        queue.append(variant)
    if not queue:
        queue.append(url)
    tried_variants: List[str] = []
    while queue:
        candidate = queue.popleft()
        candidate = str(candidate or "").strip()
        if not candidate or candidate in seen:
            continue
        seen.add(candidate)
        tried_variants.append(candidate)
        assert HydrusRequestSpec is not None
        spec = HydrusRequestSpec(
            method="GET",
            endpoint="/add_urls/get_url_files",
            query={"url": candidate},
        )
        try:
            response = client._perform_request(spec)  # type: ignore[attr-defined]
        except HydrusRequestError as exc:  # type: ignore[misc]
            raise RuntimeError(str(exc))
        response_hashes_list: List[str] = []
        response_file_ids_list: List[int] = []
        if isinstance(response, dict):
            normalised_value = response.get("normalised_url")
            if isinstance(normalised_value, str):
                trimmed = normalised_value.strip()
                if trimmed:
                    normalised_reported = normalised_reported or trimmed
                    if trimmed not in seen:
                        queue.append(trimmed)
            for redirect_key in ("redirect_url", "url"):
                redirect_value = response.get(redirect_key)
                if isinstance(redirect_value, str):
                    redirect_trimmed = redirect_value.strip()
                    if redirect_trimmed and redirect_trimmed not in seen:
                        queue.append(redirect_trimmed)
            raw_hashes = response.get("hashes") or response.get("file_hashes")
            if isinstance(raw_hashes, list):
                for item in raw_hashes:
                    try:
                        normalized = _normalize_hash(item)
                    except ValueError:
                        continue
                    if normalized:
                        response_hashes_list.append(normalized)
            raw_ids = response.get("file_ids") or response.get("file_id")
            if isinstance(raw_ids, list):
                for item in raw_ids:
                    try:
                        response_file_ids_list.append(int(item))
                    except (TypeError, ValueError):
                        continue
            elif raw_ids is not None:
                try:
                    response_file_ids_list.append(int(raw_ids))
                except (TypeError, ValueError):
                    pass
            statuses = response.get("url_file_statuses")
            if isinstance(statuses, list):
                for entry in statuses:
                    if not isinstance(entry, dict):
                        continue
                    status_hash = entry.get("hash") or entry.get("file_hash")
                    if status_hash:
                        try:
                            normalized = _normalize_hash(status_hash)
                        except ValueError:
                            normalized = None
                        if normalized:
                            response_hashes_list.append(normalized)
                    status_id = entry.get("file_id") or entry.get("fileid")
                    if status_id is not None:
                        try:
                            response_file_ids_list.append(int(status_id))
                        except (TypeError, ValueError):
                            continue
        if response_hashes_list:
            hashes = response_hashes_list
        if response_file_ids_list:
            file_ids = response_file_ids_list
        if hashes or file_ids:
            matched_url = candidate
            break
    if not hashes and not file_ids:
        result = {
            "found": False,
            "url": url,
            "variants": tried_variants,
            "metadata": {},
            "tags": [],
            "warnings": [f"No Hydrus file found for {url}"],
            "error": "not_found",
        }
        if normalised_reported:
            result["normalised_url"] = normalised_reported
        return result
    hash_value = str(hashes[0]) if hashes else None
    followup_payload: Dict[str, Any] = {
        "api_url": base_url,
        "access_key": access_key,
        "options": options,
    }
    if hash_value:
        followup_payload["hash"] = hash_value
    if file_ids:
        followup_payload["file_id"] = file_ids[0]
    result = fetch_hydrus_metadata(followup_payload)
    result["found"] = True
    result["url"] = url
    if matched_url and matched_url != url:
        result["matched_url"] = matched_url
    if file_ids:
        result["file_id"] = file_ids[0]
    if normalised_reported:
        result["normalised_url"] = normalised_reported
    result["variants"] = tried_variants
    return result


def _normalise_string_list(values: Optional[Iterable[Any]]) -> List[str]:
    if not values:
        return []
    seen: Set[str] = set()
    items: List[str] = []
    for value in values:
        if value is None:
            continue
        text = str(value).strip().lower()
        if not text:
            continue
        if text in seen:
            continue
        seen.add(text)
        items.append(text)
    return items


def _derive_sidecar_path(media_path: Path) -> Path:
    """Return the sidecar path (.tag)."""
    try:
        preferred = media_path.parent / (media_path.name + ".tag")
    except ValueError:
        preferred = media_path.with_name(media_path.name + ".tag")
    return preferred
def _read_sidecar_metadata(
sidecar_path: Path,
) -> tuple[Optional[str],
List[str],
List[str]]: # pyright: ignore[reportUnusedFunction]
2025-12-11 23:21:45 -08:00
"""Read hash, tags, and url from sidecar file.
2025-12-29 17:05:03 -08:00
2025-12-11 12:47:30 -08:00
Consolidated with read_tags_from_file - this extracts extra metadata (hash, url).
2025-11-25 20:09:33 -08:00
"""
if not sidecar_path.exists():
return None, [], []
try:
raw = sidecar_path.read_text(encoding="utf-8")
except OSError:
return None, [], []
hash_value: Optional[str] = None
tags: List[str] = []
urls: List[str] = []
for raw_line in raw.splitlines():
line = raw_line.strip()
if not line or line.startswith("#"):
continue
lower = line.lower()
if lower.startswith("hash:"):
hash_value = line.split(":", 1)[1].strip() if ":" in line else ""
elif lower.startswith("url:") or lower.startswith("url:"):
# Parse url (handle legacy 'url:' format)
url_part = line.split(":", 1)[1].strip() if ":" in line else ""
if url_part:
for url_segment in url_part.split(","):
for url_token in url_segment.split():
url_clean = url_token.strip()
if url_clean and url_clean not in urls:
urls.append(url_clean)
else:
# Everything else is a tag (including relationship: lines)
tags.append(line.lower())
return hash_value, tags, urls
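# Illustrative sketch of the sidecar format this parser accepts (hypothetical
# file content; comma- and space-separated URLs are both tolerated):
#
#     hash:0123abcd
#     url:https://example.com/a, https://example.com/b
#     title:my song
#
# would yield ("0123abcd", ["title:my song"],
# ["https://example.com/a", "https://example.com/b"]).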
def rename(file_path: Path, tags: Iterable[str]) -> Optional[Path]:
"""Rename a file based on a title: tag.
If a title: tag is present, renames the file and any .tag/.metadata sidecars.
"""
new_title: Optional[str] = None
for tag in tags:
if isinstance(tag, str) and tag.lower().startswith("title:"):
new_title = tag.split(":", 1)[1].strip()
break
if not new_title or not file_path.exists():
return None
old_name = file_path.name
old_suffix = file_path.suffix
new_name = f"{new_title}{old_suffix}"
new_path = file_path.with_name(new_name)
if new_path == file_path:
return None
def _rename_sidecar(ext: str) -> None:
old_sidecar = file_path.parent / (old_name + ext)
if not old_sidecar.exists():
return
new_sidecar = file_path.parent / (new_name + ext)
if new_sidecar.exists():
try:
new_sidecar.unlink()
except Exception as exc:
debug(
f"Warning: Could not replace target sidecar {new_sidecar.name}: {exc}",
file=sys.stderr,
)
return
old_sidecar.rename(new_sidecar)
debug(
f"Renamed sidecar: {old_sidecar.name} -> {new_sidecar.name}",
file=sys.stderr
)
try:
if new_path.exists():
try:
new_path.unlink()
debug(f"Replaced existing file: {new_name}", file=sys.stderr)
except Exception as exc:
debug(
f"Warning: Could not replace target file {new_name}: {exc}",
file=sys.stderr
)
return None
file_path.rename(new_path)
debug(f"Renamed file: {old_name} -> {new_name}", file=sys.stderr)
_rename_sidecar(".tag")
_rename_sidecar(".metadata")
return new_path
except Exception as exc:
debug(f"Warning: Failed to rename file: {exc}", file=sys.stderr)
return None
def write_tags(
media_path: Path,
tags: Iterable[str],
url: Iterable[str],
hash_value: Optional[str] = None,
db=None,
) -> None:
"""Write tags to database or sidecar file (tags only).
Hash/URL data is no longer written to the tag sidecar; it belongs in metadata.
If db is provided, inserts tags only into LocalLibraryDB. Otherwise, writes .tag sidecar.
"""
if media_path.exists() and media_path.is_dir():
raise ValueError(f"write_tags_sidecar: media_path is a directory: {media_path}")
# Prepare tags lines and convert to list if needed (tags only)
tag_list = list(tags) if not isinstance(tags, list) else tags
tag_list = [str(tag).strip().lower() for tag in tag_list if str(tag).strip()]
# If database provided, insert directly and skip sidecar
if db is not None:
try:
db_tags = [str(tag).strip().lower() for tag in tag_list if str(tag).strip()]
if db_tags:
db.add_tags(media_path, db_tags)
debug(f"Added tags to database for {media_path.name}")
return
except Exception as e:
debug(f"Failed to add tags to database: {e}", file=sys.stderr)
# Fall through to sidecar creation as fallback
# Create sidecar path
try:
sidecar = media_path.parent / (media_path.name + ".tag")
except Exception:
sidecar = media_path.with_name(media_path.name + ".tag")
# Handle edge case: empty/invalid base name
try:
if not sidecar.stem or sidecar.name in {".tag",
"-.tag",
"_.tag"}:
fallback_base = (
media_path.stem
or _sanitize_title_for_filename(extract_title(tag_list) or "")
or "untitled"
)
sidecar = media_path.parent / f"{fallback_base}.tag"
except Exception:
pass
# Write via consolidated function
try:
lines = []
lines.extend(str(tag).strip().lower() for tag in tag_list if str(tag).strip())
if lines:
sidecar.write_text("\n".join(lines) + "\n", encoding="utf-8")
debug(f"Tags: {sidecar}")
else:
try:
sidecar.unlink()
except FileNotFoundError:
pass
except OSError as exc:
debug(f"Failed to write tag sidecar {sidecar}: {exc}", file=sys.stderr)
def write_metadata(
media_path: Path,
hash_value: Optional[str] = None,
url: Optional[Iterable[str]] = None,
relationships: Optional[Iterable[str]] = None,
db=None,
) -> None:
"""Write metadata to database or sidecar file.
If db is provided, inserts into LocalLibraryDB and skips sidecar file creation.
Otherwise, creates .metadata sidecar file with hash, url, and relationships.
Args:
media_path: Path to the media file
hash_value: Optional hash value for the file
url: Optional iterable of known URL strings
relationships: Optional iterable of relationship strings
db: Optional LocalLibraryDB instance. If provided, skips sidecar creation.
"""
if media_path.exists() and media_path.is_dir():
raise ValueError(
f"write_metadata_sidecar: media_path is a directory: {media_path}"
)
# Prepare metadata lines
url_list = list(url) if url else []
rel_list = list(relationships) if relationships else []
# If database provided, insert directly and skip sidecar
if db is not None:
try:
# Build metadata tag list
db_tags = []
if hash_value:
db_tags.append(f"hash:{hash_value}")
            for url_item in url_list:
                clean = str(url_item).strip()
                if clean:
                    db_tags.append(f"url:{clean}")
for rel in rel_list:
if str(rel).strip():
db_tags.append(f"relationship:{str(rel).strip()}")
if db_tags:
db.add_tags(media_path, db_tags)
debug(f"Added metadata to database for {media_path.name}")
return
except Exception as e:
debug(f"Failed to add metadata to database: {e}", file=sys.stderr)
# Fall through to sidecar creation as fallback
# Create sidecar path
try:
sidecar = media_path.parent / (media_path.name + ".metadata")
except Exception:
sidecar = media_path.with_name(media_path.name + ".metadata")
try:
lines = []
# Add hash if available
if hash_value:
lines.append(f"hash:{hash_value}")
# Add known url
        for url_item in url_list:
            clean = str(url_item).strip()
            if clean:
                lines.append(f"url:{clean}")
# Add relationships
for rel in rel_list:
if str(rel).strip():
lines.append(f"relationship:{str(rel).strip()}")
# Write metadata file
if lines:
sidecar.write_text("\n".join(lines) + "\n", encoding="utf-8")
debug(f"Wrote metadata to {sidecar}")
else:
# Remove if no content
try:
sidecar.unlink()
except FileNotFoundError:
pass
except OSError as exc:
debug(f"Failed to write metadata sidecar {sidecar}: {exc}", file=sys.stderr)
def extract_title(tags: Iterable[str]) -> Optional[str]:
"""
Extracts a title from a list of tags (looks for 'title:...').
"""
for tag in tags:
tag = tag.strip()
if tag.lower().startswith("title:"):
title_tag = tag.split(":", 1)[1].strip()
if title_tag:
return title_tag
return None
def _sanitize_title_for_filename(title: str) -> str:
# Allow alnum, hyphen, underscore, and space; replace other chars with space
temp = []
for ch in title:
if ch.isalnum() or ch in {"-",
"_",
" "}:
temp.append(ch)
else:
temp.append(" ")
# Collapse whitespace and trim hyphens/underscores around words
rough = "".join(temp)
tokens = []
for seg in rough.split():
cleaned = seg.strip("-_ ")
if cleaned:
tokens.append(cleaned)
sanitized = "_".join(tokens)
sanitized = sanitized.strip("-_")
return sanitized or "untitled"
def apply_title_to_path(media_path: Path, tags: Iterable[str]) -> Path:
"""
If a title tag is present, returns a new Path with the title as filename; else returns original path.
"""
title = extract_title(tags)
if not title:
return media_path
parent = media_path.parent
sanitized = _sanitize_title_for_filename(title)
destination = parent / f"{sanitized}{media_path.suffix}"
return destination
def sync_sidecar(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Synchronise a .tag sidecar with the tags supplied in the payload."""
path_value = payload.get("path")
if not path_value:
raise ValueError("path is required to synchronise sidecar")
candidate = Path(str(path_value)).expanduser()
if candidate.suffix.lower() == ".tag":
sidecar_path = candidate
else:
sidecar_path = _derive_sidecar_path(candidate)
tags = _normalise_string_list(payload.get("tag"))
if not tags and sidecar_path.exists():
tags = read_tags_from_file(sidecar_path)
sidecar_path.parent.mkdir(parents=True, exist_ok=True)
if tags:
sidecar_path.write_text("\n".join(tags) + "\n", encoding="utf-8")
return {
"path": str(sidecar_path),
"tag": tags,
}
try:
sidecar_path.unlink()
except FileNotFoundError:
pass
return {
"path": str(sidecar_path),
"tag": [],
"deleted": True,
}
def _build_hydrus_context(
    payload: Dict[str, Any]
) -> Tuple[Any, str, str, float, Optional[str]]:
_ensure_hydrus_client()
assert HydrusClient is not None
base_url = str(payload.get("api_url") or "").strip()
if not base_url:
raise ValueError("Hydrus api_url is required")
access_key = str(payload.get("access_key") or "").strip()
options_raw = payload.get("options")
    options = options_raw if isinstance(options_raw, dict) else {}
timeout = float(options.get("timeout") or payload.get("timeout") or 60.0)
prefer_service = payload.get("prefer_service_name"
) or options.get("prefer_service_name")
if isinstance(prefer_service, str):
prefer_service = prefer_service.strip() or None
else:
prefer_service = None
client = HydrusClient(base_url, access_key, timeout)
return client, base_url, access_key, timeout, prefer_service
def _refetch_hydrus_summary(
    base_url: str,
    access_key: str,
    hash_hex: str,
    timeout: float,
    prefer_service: Optional[str],
) -> Dict[str, Any]:
    payload: Dict[str, Any] = {
        "hash": hash_hex,
        "api_url": base_url,
        "access_key": access_key,
        "options": {
            "minimal": True,
            "include_relationships": False,
            "timeout": timeout,
        },
    }
if prefer_service:
payload["options"]["prefer_service_name"] = prefer_service
return fetch_hydrus_metadata(payload)
def _apply_hydrus_tag_mutation(
    payload: Dict[str, Any],
    add: Iterable[Any],
    remove: Iterable[Any],
) -> Dict[str, Any]:
client, base_url, access_key, timeout, prefer_service = _build_hydrus_context(payload)
hash_hex = _normalize_hash(payload.get("hash"))
add_list = [_normalize_tag(tag) for tag in add if _normalize_tag(tag)]
remove_list = [_normalize_tag(tag) for tag in remove if _normalize_tag(tag)]
if not add_list and not remove_list:
raise ValueError("No tag changes supplied")
service_key = payload.get("service_key") or payload.get("tag_service_key")
summary = None
if not service_key:
summary = _refetch_hydrus_summary(
base_url,
access_key,
hash_hex,
timeout,
prefer_service
)
service_key = summary.get("tag_service_key")
if not isinstance(service_key, str) or not service_key:
raise RuntimeError("Unable to determine Hydrus tag service key")
    actions: Dict[str, List[str]] = {}
if add_list:
actions["0"] = [tag for tag in add_list if tag]
if remove_list:
actions["1"] = [tag for tag in remove_list if tag]
if not actions:
raise ValueError("Tag mutation produced no actionable changes")
request_payload = {
"hashes": [hash_hex],
"service_keys_to_actions_to_tags": {
service_key: actions,
},
}
try:
assert HydrusRequestSpec is not None
tag_spec = HydrusRequestSpec(
method="POST",
endpoint="/add_tags/add_tags",
data=request_payload,
)
client._perform_request(tag_spec)
except HydrusRequestError as exc: # type: ignore[misc]
raise RuntimeError(str(exc))
summary_after = _refetch_hydrus_summary(
base_url,
access_key,
hash_hex,
timeout,
prefer_service
)
result = dict(summary_after)
result["added_tags"] = actions.get("0", [])
result["removed_tags"] = actions.get("1", [])
result["tag_service_key"] = summary_after.get("tag_service_key")
return result
def apply_tag_mutation(payload: Dict[str, Any], operation: str = "add") -> Dict[str, Any]:
"""Unified tag mutation for add and update operations (Hydrus and local).
Consolidates: add_tag, update_tag, _add_local_tag, _update_local_tag
Args:
payload: Mutation payload with type, tags, old_tag, new_tag
operation: 'add' or 'update'
Returns:
Dict with tags and operation result
"""
file_type = str(payload.get("type", "local")).lower()
if file_type == "hydrus":
if operation == "add":
new_tag = _normalize_tag(payload.get("new_tag"))
if not new_tag:
raise ValueError("new_tag is required")
result = _apply_hydrus_tag_mutation(payload, [new_tag], [])
result["added"] = True
return result
else: # update
old_tag = _normalize_tag(payload.get("old_tag"))
new_tag = _normalize_tag(payload.get("new_tag"))
result = _apply_hydrus_tag_mutation(
payload,
[new_tag] if new_tag else [],
[old_tag] if old_tag else []
)
result["updated"] = True
return result
else: # local
tag = _clean_existing_tags(payload.get("tag"))
if operation == "add":
new_tag = _normalize_tag(payload.get("new_tag"))
if not new_tag:
raise ValueError("new_tag is required")
added = new_tag not in tag
if added:
tag.append(new_tag)
return {
"tag": tag,
"added": added
}
else: # update
old_tag = _normalize_tag(payload.get("old_tag"))
new_tag = _normalize_tag(payload.get("new_tag"))
if not old_tag:
raise ValueError("old_tag is required")
remaining = []
removed_count = 0
for item in tag:
if item == old_tag:
removed_count += 1
else:
remaining.append(item)
if new_tag and removed_count > 0:
remaining.extend([new_tag] * removed_count)
updated = removed_count > 0 or (bool(new_tag) and new_tag not in tag)
return {
"tag": remaining,
"updated": updated,
"removed_count": removed_count
}
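# Illustrative sketch (hypothetical payload; assumes _normalize_tag and
# _clean_existing_tags simply strip/normalise the tag text): a local "update"
# removes every occurrence of old_tag and re-adds new_tag the same number of
# times, e.g.
#
#     apply_tag_mutation(
#         {"type": "local", "tag": ["artist:x", "genre:rock"],
#          "old_tag": "genre:rock", "new_tag": "genre:jazz"},
#         operation="update",
#     )
#     # -> {"tag": ["artist:x", "genre:jazz"], "updated": True, "removed_count": 1}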
def extract_ytdlp_tags(entry: Dict[str, Any]) -> List[str]:
""" """
tags: List[str] = []
seen_namespaces: Set[str] = set()
# Meaningful yt-dlp fields that should become tags
# This mapping excludes technical fields: filesize, duration, format_id, vcodec, acodec, ext, etc.
field_to_namespace = {
"artist": "artist",
"album": "album",
"creator": "creator",
"uploader": "creator", # Map uploader to creator (deduplicate)
"uploader_id": "creator",
"channel": "channel",
"genre": "genre",
"track": "track",
"track_number": "track_number",
"release_date": "release_date",
"upload_date": "upload_date",
"title": "title",
"license": "license",
"location": "location",
}
# Extract simple field mappings
for yt_field, namespace in field_to_namespace.items():
value = entry.get(yt_field)
if value is not None:
value_str = value_normalize(str(value))
if value_str:
# Prevent duplicate creator tags (only use first creator)
if namespace == "creator":
if "creator" in seen_namespaces:
continue
seen_namespaces.add("creator")
_add_tag(tags, namespace, value_str)
# Handle tags field specially (could be list, dict, or string)
# For list/sequence tags, capture as freeform (no namespace prefix)
tags_field = entry.get("tags")
if tags_field is not None:
if isinstance(tags_field, list):
# Tags is list: ["tag1", "tag2", ...] → capture as freeform tags (no "tag:" prefix)
# These are typically genre/category tags from the source (BandCamp genres, etc.)
for tag_value in tags_field:
if tag_value:
normalized = value_normalize(str(tag_value))
if normalized and normalized not in tags:
tags.append(normalized)
elif isinstance(tags_field, dict):
# Tags is dict: {"key": "val"} → tag:key:val
for key, val in tags_field.items():
if key and val:
key_normalized = value_normalize(str(key))
val_normalized = value_normalize(str(val))
if key_normalized and val_normalized:
_add_tag(tags, f"tag:{key_normalized}", val_normalized)
else:
# Tags is string or other: add as freeform
if tags_field:
normalized = value_normalize(str(tags_field))
if normalized and normalized not in tags:
tags.append(normalized)
return tags
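# Illustrative sketch (hypothetical entry; exact casing depends on
# value_normalize): simple fields map to namespaces, while a list-valued
# "tags" field is kept as freeform tags, e.g.
#
#     extract_ytdlp_tags({"title": "Song", "uploader": "Chan", "tags": ["rock"]})
#     # -> roughly ["creator:Chan", "title:Song", "rock"]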
def dedup_tags_by_namespace(tags: List[str], keep_first: bool = True) -> List[str]:
"""Deduplicate tags by namespace, keeping consistent order.
This is the UNIFIED API for tag deduplication used across all cmdlet.
Replaces custom deduplication logic in merge_file.py and other modules.
Groups tags by namespace (e.g., "artist", "album", "tag") and keeps
either the first or last occurrence of each namespace, then preserves
order based on first appearance.
Args:
tags: List of tags (with or without namespace prefixes)
keep_first: If True, keep first occurrence per namespace (default).
If False, keep last occurrence per namespace.
Returns:
Deduplicated tag list with consistent order
Example:
>>> tags = [
... 'artist:Beatles', 'album:Abbey Road',
... 'artist:Beatles', 'tag:rock',
... 'album:Abbey Road', 'artist:Beatles'
... ]
>>> dedup = dedup_tags_by_namespace(tags)
>>> debug(dedup)
['artist:Beatles', 'album:Abbey Road', 'tag:rock']
"""
if not tags:
return []
# Group tags by namespace
    # namespace -> [(index, full_tag), ...]
    namespace_to_tags: Dict[Optional[str], List[Tuple[int, str]]] = {}
    first_appearance: Dict[Optional[str], int] = {}  # namespace -> first_index
for idx, tag in enumerate(tags):
# Extract namespace (part before ':')
if ":" in tag:
namespace: Optional[str] = tag.split(":", 1)[0]
else:
namespace = None # No namespace
# Track first appearance
if namespace not in first_appearance:
first_appearance[namespace] = idx
# Store tag with its index
if namespace not in namespace_to_tags:
namespace_to_tags[namespace] = []
namespace_to_tags[namespace].append((idx, tag))
# Build result: keep first or last occurrence per namespace
result: List[Tuple[int, str]] = [] # (first_appearance_index, tag)
for namespace, tag_list in namespace_to_tags.items():
if keep_first:
chosen_tag = tag_list[0][1] # First occurrence
else:
chosen_tag = tag_list[-1][1] # Last occurrence
result.append((first_appearance[namespace], chosen_tag))
# Sort by first appearance order, then extract tags
result.sort(key=lambda x: x[0])
return [tag for _, tag in result]
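# Caveat worth knowing: all freeform (namespace-less) tags share the single
# namespace None, so only one freeform tag survives deduplication, e.g.
#
#     dedup_tags_by_namespace(["rock", "jazz", "artist:x"])
#     # -> ["rock", "artist:x"]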
def merge_multiple_tag_lists(sources: List[List[str]], strategy: str = "first") -> List[str]:
"""Intelligently merge multiple tag lists with smart deduplication.
This is the UNIFIED API for merging tags from multiple sources
(e.g., when merging multiple files or combining metadata sources).
Strategies:
- 'first': Keep first occurrence of each namespace (default)
- 'all': Keep all different values (different artists possible)
- 'combine': For non-namespace tags, combine all unique values
Args:
sources: List of tag lists to merge
strategy: Merge strategy - 'first', 'all', or 'combine'
Returns:
Merged and deduplicated tag list
Example:
>>> list1 = ['artist:Beatles', 'album:Abbey Road']
>>> list2 = ['artist:Beatles', 'album:Abbey Road', 'tag:rock']
>>> merged = merge_multiple_tag_lists([list1, list2])
>>> debug(merged)
['artist:Beatles', 'album:Abbey Road', 'tag:rock']
"""
if not sources:
return []
if strategy == "first":
# Concatenate all lists and deduplicate by namespace
all_tags = []
for tag_list in sources:
all_tags.extend(tag_list or [])
return dedup_tags_by_namespace(all_tags, keep_first=True)
elif strategy == "all":
# Keep all different values per namespace
        namespace_to_values: Dict[Optional[str], Set[str]] = {}
order: List[Tuple[int, str, str]] = [] # (first_index, namespace, value)
global_index = 0
for source in sources:
if not source:
continue
for tag in source:
if ":" in tag:
namespace: Optional[str] = tag.split(":", 1)[0]
value = tag.split(":", 1)[1]
else:
namespace = None
value = tag
                if namespace not in namespace_to_values:
                    # Record the first value too, so later duplicates are skipped
                    namespace_to_values[namespace] = {value}
                    order.append((global_index, namespace or "", tag))
                elif value not in namespace_to_values[namespace]:
                    order.append((global_index, namespace or "", tag))
                    namespace_to_values[namespace].add(value)
global_index += 1
# Sort by order of first appearance and extract
order.sort(key=lambda x: x[0])
return [tag for _, _, tag in order]
elif strategy == "combine":
# Combine all unique plain (non-namespace) tags
all_tags = []
        namespaced: Dict[str, str] = {}  # namespace -> tag (first occurrence)
for source in sources:
if not source:
continue
for tag in source:
if ":" in tag:
namespace = tag.split(":", 1)[0]
if namespace not in namespaced:
namespaced[namespace] = tag
all_tags.append(tag)
else:
if tag not in all_tags:
all_tags.append(tag)
return all_tags
else:
raise ValueError(f"Unknown merge strategy: {strategy}")
def read_tags_from_file(file_path: Path) -> List[str]:
"""Read and normalize tags from .tag sidecar file.
This is the UNIFIED API for reading .tag files across all cmdlet.
Handles normalization, deduplication, and format validation.
Args:
file_path: Path to .tag sidecar file
Returns:
List of normalized tag strings
Raises:
FileNotFoundError: If file doesn't exist
Example:
>>> tags = read_tags_from_file(Path('file.txt.tag'))
>>> debug(tags)
['artist:Beatles', 'album:Abbey Road']
"""
file_path = Path(file_path)
if not file_path.exists():
raise FileNotFoundError(f"Tag file not found: {file_path}")
tags: List[str] = []
seen: Set[str] = set()
try:
with open(file_path, "r", encoding="utf-8") as f:
for line in f:
# Strip whitespace and skip empty lines
line = line.strip()
if not line:
continue
# Skip comment lines
if line.startswith("#"):
continue
# Normalize the tag
normalized = value_normalize(line).lower()
if normalized and normalized not in seen:
seen.add(normalized)
tags.append(normalized)
except Exception as exc:
raise ValueError(f"Error reading tag file {file_path}: {exc}")
return tags
def embed_metadata_in_file(
file_path: Path,
tags: List[str],
file_kind: str = ""
) -> bool:
""" """
if not tags:
return True
file_path = Path(file_path)
# Tag namespace to FFmpeg metadata key mapping
tag_map = {
"title": "title",
"artist": "artist",
"album": "album",
"track": "track",
"track_number": "track",
"date": "date",
"year": "date",
"genre": "genre",
"composer": "composer",
"comment": "comment",
"url": "comment", # Embed known url in comment field
"creator": "artist", # Map creator to artist
"channel": "album_artist", # Map channel to album_artist
2025-11-25 20:09:33 -08:00
}
# Extract metadata from tags
metadata = {}
comments = [] # Collect comments (including url)
for tag in tags:
tag_str = str(tag).strip()
if ":" in tag_str:
namespace, value = tag_str.split(":", 1)
namespace = namespace.lower().strip()
value = value.strip()
if namespace in tag_map and value:
ffmpeg_key = tag_map[namespace]
if namespace == "url":
# Collect url as comments
comments.append(f"URL: {value}")
elif ffmpeg_key == "comment":
# Collect other comment-type tags
comments.append(value)
elif ffmpeg_key not in metadata:
# Don't overwrite if already set from earlier tag
metadata[ffmpeg_key] = value
# Add collected comments to metadata
if comments:
if "comment" in metadata:
metadata["comment"] = metadata["comment"] + " | " + " | ".join(comments)
else:
metadata["comment"] = " | ".join(comments)
# Apply sensible defaults for audio files
if file_kind == "audio" or (not file_kind and file_path.suffix.lower() in {".mp3",
".flac",
".wav",
".m4a",
".aac",
".ogg",
".opus",
".mka"}):
# If no album, use title as album
if "album" not in metadata and "title" in metadata:
metadata["album"] = metadata["title"]
# If no track, default to 1
if "track" not in metadata:
metadata["track"] = "1"
# If no album_artist, use artist
if "artist" in metadata:
metadata["album_artist"] = metadata["artist"]
if not metadata:
return True
# Check if FFmpeg is available
2025-12-29 17:05:03 -08:00
ffmpeg_path = shutil.which("ffmpeg")
if not ffmpeg_path:
debug(
f"⚠️ FFmpeg not found; cannot embed metadata in {file_path.name}",
file=sys.stderr
)
return False
# Create temporary file for output
temp_file = file_path.parent / f"{file_path.stem}.ffmpeg_tmp{file_path.suffix}"
try:
2025-12-29 17:05:03 -08:00
cmd = [ffmpeg_path, "-y", "-i", str(file_path)]
for key, value in metadata.items():
cmd.extend(["-metadata", f"{key}={value}"])
cmd.extend(["-c", "copy", str(temp_file)])
# Run ffmpeg with error handling for non-UTF8 output
result = subprocess.run(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=False, # Don't decode as text - ffmpeg may output binary data
timeout=30,
)
if result.returncode == 0 and temp_file.exists():
# Replace original with temp file
file_path.unlink()
temp_file.rename(file_path)
2025-12-11 12:47:30 -08:00
debug(f"Embedded metadata in file: {file_path.name}", file=sys.stderr)
return True
else:
# Clean up temp file if it exists
if temp_file.exists():
temp_file.unlink()
debug(
f"❌ FFmpeg metadata embedding failed for {file_path.name}",
file=sys.stderr
)
if result.stderr:
# Safely decode stderr, ignoring invalid UTF-8 bytes
try:
stderr_text = result.stderr.decode("utf-8", errors="replace")[:200]
debug(f"FFmpeg stderr: {stderr_text}", file=sys.stderr)
except Exception:
pass
return False
except Exception as exc:
if temp_file.exists():
try:
temp_file.unlink()
except Exception:
pass
2025-11-27 10:59:01 -08:00
debug(f"❌ Error embedding metadata: {exc}", file=sys.stderr)
return False
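# For reference, the command assembled above has this shape (paths and values
# are illustrative):
#
#     ffmpeg -y -i song.mp3 -metadata title=Song -metadata artist=Beatles \
#         -c copy song.ffmpeg_tmp.mp3
#
# i.e. streams are copied untouched and only the container metadata changes.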
def write_tags_to_file(
file_path: Path,
tags: List[str],
source_hashes: Optional[List[str]] = None,
url: Optional[List[str]] = None,
append: bool = False,
) -> bool:
"""Write tags to .tag sidecar file.
This is the UNIFIED API for writing .tag files across all cmdlet.
Uses consistent format and handles file creation/overwriting.
Args:
file_path: Path to .tag file (will be created if doesn't exist)
tags: List of tags to write
source_hashes: Optional source file hashes (written as source:hash1,hash2)
url: Optional known url (each written on separate line as url:url)
append: If True, append to existing file; if False, overwrite (default)
Returns:
True if successful
Raises:
Exception: If file write fails
Example:
>>> tags = ['artist:Beatles', 'album:Abbey Road']
>>> write_tags_to_file(Path('file.txt.tag'), tags)
True
"""
file_path = Path(file_path)
try:
# Prepare content
content_lines: List[str] = []
# Add source hashes if provided
if source_hashes:
content_lines.append(f"source:{','.join(source_hashes)}")
# Add known url if provided - each on separate line to prevent corruption
if url:
for url_item in url:
content_lines.append(f"url:{url_item}")
# Add tags
if tags:
content_lines.extend(
[str(t).strip().lower() for t in tags if str(t).strip()]
)
# Write to file
mode = "a" if (append and file_path.exists()) else "w"
with open(file_path, mode, encoding="utf-8") as f:
for line in content_lines:
f.write(line + "\n")
return True
except Exception as exc:
raise ValueError(f"Error writing tag file {file_path}: {exc}")
def normalize_tags_from_source(source_data: Any, source_type: str = "auto") -> List[str]:
"""Normalize tags from any source format.
Universal function to normalize tags from different sources:
- yt-dlp entry dicts
- Raw tag lists
- .tag file content strings
- Metadata dictionaries
Args:
source_data: Source data (type determined by source_type or auto-detected)
source_type: One of 'auto', 'ytdlp', 'list', 'text', 'dict'
'auto' attempts to auto-detect the type
Returns:
Normalized, deduplicated tag list
Example:
>>> entry = {'artist': 'Beatles', 'album': 'Abbey Road'}
>>> tags = normalize_tags_from_source(entry, 'ytdlp')
>>> debug(tags)
['artist:Beatles', 'album:Abbey Road']
"""
if source_type == "auto":
# Auto-detect source type
if isinstance(source_data, dict):
# Check if it looks like a yt-dlp entry (has id, title, url, etc.)
if "id" in source_data or "title" in source_data or "uploader" in source_data:
source_type = "ytdlp"
else:
source_type = "dict"
elif isinstance(source_data, list):
source_type = "list"
elif isinstance(source_data, str):
source_type = "text"
else:
source_type = "dict"
# Process based on detected/specified type
if source_type == "ytdlp":
if not isinstance(source_data, dict):
raise ValueError("ytdlp source must be a dict")
return extract_ytdlp_tags(source_data)
elif source_type == "list":
if not isinstance(source_data, (list, tuple)):
raise ValueError("list source must be a list or tuple")
# Normalize each tag in the list
result = []
for tag in source_data:
normalized = value_normalize(str(tag))
if normalized:
result.append(normalized)
return result
elif source_type == "text":
if not isinstance(source_data, str):
raise ValueError("text source must be a string")
# Split by lines and normalize
lines = source_data.split("\n")
result = []
seen = set()
for line in lines:
line = line.strip()
if line and not line.startswith("#"):
normalized = value_normalize(line)
if normalized and normalized not in seen:
seen.add(normalized)
result.append(normalized)
return result
elif source_type == "dict":
if not isinstance(source_data, dict):
raise ValueError("dict source must be a dict")
# Extract as generic metadata (similar to yt-dlp but from any dict)
return extract_ytdlp_tags(source_data)
else:
raise ValueError(f"Unknown source type: {source_type}")
def detect_metadata_request(tag: str) -> Optional[Dict[str, str]]:
trimmed = value_normalize(tag)
if not trimmed:
return None
lower = trimmed.lower()
imdb_match = re.match(r"^imdb:\s*(tt[\w]+)$", lower)
if imdb_match:
imdb_id = imdb_match.group(1)
return {
"source": "imdb",
"id": imdb_id,
"base": f"imdb:{imdb_id}",
}
remainder = re.match(r"^musicbrainz:\s*(.+)$", lower)
if remainder:
raw = remainder.group(1)
entity = "release"
identifier = raw
specific = re.match(r"^(?P<entity>[a-zA-Z]+)\s*:\s*(?P<id>[\w-]+)$", raw)
if specific:
entity = specific.group("entity")
identifier = specific.group("id")
identifier = identifier.replace(" ", "")
if identifier:
return {
"source": "musicbrainz",
"entity": entity.lower(),
"id": identifier,
"base": f"musicbrainz:{identifier}",
}
return None
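# Illustrative sketch (hypothetical identifiers):
#
#     detect_metadata_request("imdb:tt0111161")
#     # -> {"source": "imdb", "id": "tt0111161", "base": "imdb:tt0111161"}
#     detect_metadata_request("musicbrainz:release:1234-abcd")
#     # -> {"source": "musicbrainz", "entity": "release", "id": "1234-abcd",
#     #     "base": "musicbrainz:1234-abcd"}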
def expand_metadata_tag(payload: Dict[str, Any]) -> Dict[str, Any]:
tag = payload.get("tag")
if not isinstance(tag, str):
return {
"tag": []
}
trimmed = value_normalize(tag)
if not trimmed:
return {
"tag": []
}
request = detect_metadata_request(trimmed)
tags: List[str] = []
seen: Set[str] = set()
    if request:
        _append_unique(tags, seen, request["base"])
    else:
        # No recognised metadata reference: return the plain tag unchanged.
        _append_unique(tags, seen, trimmed)
        return {
            "tag": tags
        }
try:
if request["source"] == "imdb":
data = imdb_tag(request["id"])
else:
data = fetch_musicbrainz_tags(request["id"], request["entity"])
except Exception as exc: # pragma: no cover - network/service errors
return {
"tag": tags,
"error": str(exc)
}
# Add tags from fetched data (no namespace, just unique append)
raw_tags = data.get("tag") if isinstance(data, dict) else None
if isinstance(raw_tags, str):
tag_iter: Iterable[str] = [raw_tags]
elif isinstance(raw_tags, (list, tuple, set)):
tag_iter = [t for t in raw_tags if isinstance(t, str)]
else:
tag_iter = []
for tag_value in tag_iter:
_append_unique(tags, seen, tag_value)
result = {
"tag": tags,
"source": request["source"],
"id": request["id"],
}
if request["source"] == "musicbrainz":
result["entity"] = request["entity"]
return result
def build_remote_bundle(
    metadata: Optional[Dict[str, Any]],
    existing: Optional[Sequence[str]] = None,
    context: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
metadata = metadata or {}
context = context or {}
tags: List[str] = []
seen: Set[str] = set()
if existing:
for tag in existing:
_append_unique(tags, seen, tag)
# Add tags from various sources
for tag in metadata.get("tag") or []:
_append_unique(tags, seen, tag)
for tag in metadata.get("categories") or []:
_append_unique(tags, seen, tag)
# Extract and namespace genres
raw_genres = metadata.get("genres")
keywords = metadata.get("keywords")
if isinstance(keywords, str):
for token in keywords.split(","):
_append_unique(tags, seen, token)
if raw_genres:
        genre_list = raw_genres if isinstance(raw_genres, (list, tuple)) else [raw_genres]
        for genre in genre_list:
if genre:
_append_unique(tags, seen, f"genre:{genre}")
# Extract creators/artists
artists = metadata.get("artists") or metadata.get("artist")
if artists:
artist_list = artists if isinstance(artists, (list, tuple)) else [artists]
for artist in artist_list:
if artist:
_append_unique(tags, seen, f"creator:{artist}")
    creator = (
        metadata.get("uploader")
        or metadata.get("channel")
        or metadata.get("artist")
        or metadata.get("creator")
    )
if creator:
_append_unique(tags, seen, f"creator:{creator}")
# Extract title
title_value = metadata.get("title")
if title_value:
_extend_namespaced(tags, seen, "title", [title_value])
    source_url = (
        context.get("source_url")
        or metadata.get("original_url")
        or metadata.get("webpage_url")
        or metadata.get("url")
    )
clean_title = value_normalize(str(title_value)) if title_value is not None else None
result = {
2025-12-11 23:21:45 -08:00
"tag": tags,
2025-11-25 20:09:33 -08:00
"title": clean_title,
"source_url": _sanitize_url(source_url),
"duration": _coerce_duration(metadata),
"metadata": metadata,
}
return result
def _load_payload(value: Optional[str]) -> Dict[str, Any]:
text = value
if text is None:
text = sys.stdin.read()
if text is None or text.strip() == "":
raise ValueError("Expected JSON payload")
data = json.loads(text)
if not isinstance(data, dict):
raise ValueError("Payload must be a JSON object")
return data
import typer
app = typer.Typer(help="Fetch metadata tags for known services")
@app.command(help="Lookup an IMDb title")
def imdb(imdb_id: str = typer.Argument(..., help="IMDb identifier (ttXXXXXXX)")):
"""Lookup an IMDb title."""
try:
result = imdb_tag(imdb_id)
debug(json.dumps(result, ensure_ascii=False), flush=True)
except Exception as exc:
error_payload = {
"error": str(exc)
}
debug(json.dumps(error_payload, ensure_ascii=False), flush=True)
raise typer.Exit(code=1)
@app.command(help="Lookup a MusicBrainz entity")
def musicbrainz(
    mbid: str = typer.Argument(..., help="MusicBrainz identifier (UUID)"),
entity: str = typer.Option(
"release",
help="Entity type (release, recording, artist)"
),
):
"""Lookup a MusicBrainz entity."""
try:
result = fetch_musicbrainz_tags(mbid, entity)
debug(json.dumps(result, ensure_ascii=False), flush=True)
except Exception as exc:
error_payload = {
"error": str(exc)
}
debug(json.dumps(error_payload, ensure_ascii=False), flush=True)
raise typer.Exit(code=1)
@app.command(name="remote-tags", help="Normalize a remote metadata payload")
def remote_tags(
payload: Optional[str] = typer.Option(
None,
"--payload",
help="JSON payload; reads stdin if omitted"
)
):
"""Normalize a remote metadata payload."""
try:
payload_data = _load_payload(payload)
metadata = payload_data.get("metadata") or {}
existing = payload_data.get("existing_tags") or []
context = payload_data.get("context") or {}
if not isinstance(existing, list):
raise ValueError("existing_tags must be a list")
if context and not isinstance(context, dict):
raise ValueError("context must be an object")
result = build_remote_bundle(metadata, existing, context)
debug(json.dumps(result, ensure_ascii=False), flush=True)
except Exception as exc:
error_payload = {
"error": str(exc)
}
debug(json.dumps(error_payload, ensure_ascii=False), flush=True)
raise typer.Exit(code=1)
@app.command(name="remote-fetch", help="Resolve remote metadata bundle")
def remote_fetch(
payload: Optional[str] = typer.Option(
None,
"--payload",
help="JSON payload; reads stdin if omitted"
)
):
"""Resolve remote metadata bundle."""
try:
payload_data = _load_payload(payload)
result = resolve_remote_metadata(payload_data)
debug(json.dumps(result, ensure_ascii=False), flush=True)
except Exception as exc:
error_payload = {
"error": str(exc)
}
debug(json.dumps(error_payload, ensure_ascii=False), flush=True)
raise typer.Exit(code=1)
@app.command(name="expand-tag", help="Expand metadata references into tags")
def expand_tag(
payload: Optional[str] = typer.Option(
None,
"--payload",
help="JSON payload; reads stdin if omitted"
)
):
"""Expand metadata references into tags."""
try:
payload_data = _load_payload(payload)
result = expand_metadata_tag(payload_data)
debug(json.dumps(result, ensure_ascii=False), flush=True)
except Exception as exc:
error_payload = {
"error": str(exc)
}
debug(json.dumps(error_payload, ensure_ascii=False), flush=True)
raise typer.Exit(code=1)
@app.command(name="hydrus-fetch", help="Fetch Hydrus metadata for a file")
def hydrus_fetch(
payload: Optional[str] = typer.Option(
None,
"--payload",
help="JSON payload; reads stdin if omitted"
)
):
"""Fetch Hydrus metadata for a file."""
try:
payload_data = _load_payload(payload)
result = fetch_hydrus_metadata(payload_data)
debug(json.dumps(result, ensure_ascii=False), flush=True)
except Exception as exc:
error_payload = {
"error": str(exc)
}
debug(json.dumps(error_payload, ensure_ascii=False), flush=True)
raise typer.Exit(code=1)
@app.command(name="hydrus-fetch-url", help="Fetch Hydrus metadata using a source URL")
def hydrus_fetch_url(
payload: Optional[str] = typer.Option(
None,
"--payload",
help="JSON payload; reads stdin if omitted"
)
):
"""Fetch Hydrus metadata using a source URL."""
try:
payload_data = _load_payload(payload)
result = fetch_hydrus_metadata_by_url(payload_data)
debug(json.dumps(result, ensure_ascii=False), flush=True)
except Exception as exc:
error_payload = {
"error": str(exc)
}
debug(json.dumps(error_payload, ensure_ascii=False), flush=True)
raise typer.Exit(code=1)
@app.command(name="sync-sidecar", help="Synchronise .tag sidecar with supplied data")
def sync_sidecar_cmd(
payload: Optional[str] = typer.Option(
None,
"--payload",
help="JSON payload; reads stdin if omitted"
)
):
"""Synchronise .tag sidecar with supplied data."""
try:
payload_data = _load_payload(payload)
result = sync_sidecar(payload_data)
debug(json.dumps(result, ensure_ascii=False), flush=True)
except Exception as exc:
error_payload = {
"error": str(exc)
}
debug(json.dumps(error_payload, ensure_ascii=False), flush=True)
raise typer.Exit(code=1)
@app.command(name="update-tag", help="Update or rename a tag")
def update_tag_cmd(
payload: Optional[str] = typer.Option(
None,
"--payload",
help="JSON payload; reads stdin if omitted"
)
):
"""Update or rename a tag."""
try:
payload_data = _load_payload(payload)
result = apply_tag_mutation(payload_data, "update")
debug(json.dumps(result, ensure_ascii=False), flush=True)
except Exception as exc:
error_payload = {
"error": str(exc)
}
debug(json.dumps(error_payload, ensure_ascii=False), flush=True)
raise typer.Exit(code=1)
def main(argv: Optional[List[str]] = None) -> int:
"""Main entry point using Typer."""
try:
app(argv, standalone_mode=False)
return 0
except SystemExit as e:
return e.code if isinstance(e.code, int) else 1
# ============================================================================
# TAG OPERATIONS - Consolidated from tag_operations.py and tag_helpers.py
# ============================================================================
def sort_tags(tags: List[str]) -> List[str]:
"""
Sort tags into namespace tags and freeform tags, then alphabetically.
Args:
tags: List of tag strings
Returns:
Sorted list with namespace tags first, then freeform tags
"""
if not tags:
return []
namespace_tags = []
freeform_tags = []
for tag in tags:
if isinstance(tag, str):
if ":" in tag:
namespace_tags.append(tag)
else:
freeform_tags.append(tag)
namespace_tags.sort()
freeform_tags.sort()
return namespace_tags + freeform_tags
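# Illustrative sketch (hypothetical tags): namespaced tags sort first, then
# freeform tags, each group alphabetically:
#
#     sort_tags(["zebra", "title:a", "artist:b"])
#     # -> ["artist:b", "title:a", "zebra"]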
def format_tags_display(tags: List[str], namespace_filter: Optional[str] = None) -> List[str]:
"""
Format tags for display, optionally filtered by namespace.
Args:
tags: List of tags
namespace_filter: Optional namespace to filter by (e.g., "creator:")
Returns:
Formatted list of tags
"""
if not tags:
return []
if namespace_filter:
filtered = [t for t in tags if t.startswith(namespace_filter)]
return sort_tags(filtered)
return sort_tags(tags)
def split_tag(tag: str) -> tuple[str, str]:
"""
Split a tag into namespace and value.
Args:
tag: Tag string (e.g., "creator:Author Name" or "freeform tag")
Returns:
Tuple of (namespace, value). For freeform tags, namespace is empty string.
"""
if ":" in tag:
parts = tag.split(":", 1)
return parts[0], parts[1]
return "", tag
def filter_tags_by_namespace(tags: List[str], namespace: str) -> List[str]:
"""
Get all tags in a specific namespace.
Args:
tags: List of tags
namespace: Namespace to filter by
Returns:
List of values in that namespace
"""
2025-12-29 17:05:03 -08:00
prefix = namespace + ":"
2025-11-25 20:09:33 -08:00
return [split_tag(t)[1] for t in tags if t.startswith(prefix)]
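
# Example (illustrative): the prefix match is case-sensitive.
#   >>> filter_tags_by_namespace(["creator:A", "title:T", "creator:B"], "creator")
#   ['A', 'B']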


def ensure_title_tag(tags: List[str], title: str) -> List[str]:
    """
    Ensure there's a title: tag with the given title.

    Args:
        tags: List of existing tags
        title: Title to ensure exists

    Returns:
        Updated tag list
    """
    if not title:
        return tags

    # Remove any existing title tags
    filtered = [t for t in tags if not t.startswith("title:")]
    # Add new title tag
    new_tags = filtered + [f"title:{title}"]
    return sort_tags(new_tags)
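
# Example (illustrative): the old title tag is replaced, not appended to.
#   >>> ensure_title_tag(["title:Old", "creator:X"], "New")
#   ['creator:X', 'title:New']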


def remove_title_tags(tags: List[str]) -> List[str]:
    """Remove all title: tags."""
    return [t for t in tags if not t.startswith("title:")]


def is_namespace_tag(tag: str) -> bool:
    """Check if a tag is a namespace tag (contains :)."""
    return ":" in tag if isinstance(tag, str) else False
2025-11-25 20:09:33 -08:00
def validate_tag(tag: str) -> bool:
"""
Validate that a tag is properly formatted.
2025-12-29 17:05:03 -08:00
2025-11-25 20:09:33 -08:00
Args:
tag: Tag to validate
2025-12-29 17:05:03 -08:00
2025-11-25 20:09:33 -08:00
Returns:
True if tag is valid
"""
if not isinstance(tag, str) or not tag.strip():
return False
2025-12-29 17:05:03 -08:00
2025-11-25 20:09:33 -08:00
# Tag shouldn't have leading/trailing whitespace
if tag != tag.strip():
return False
2025-12-29 17:05:03 -08:00
2025-11-25 20:09:33 -08:00
# Tag shouldn't be empty
if not tag:
return False
2025-12-29 17:05:03 -08:00
2025-11-25 20:09:33 -08:00
return True


def normalize_tags(tags: List[Any]) -> List[str]:
    """
    Normalize a tag list by filtering and cleaning.

    Args:
        tags: List of tags (may contain invalid entries)

    Returns:
        Cleaned list of valid tags
    """
    if not tags:
        return []

    normalized = []
    for tag in tags:
        if isinstance(tag, str):
            trimmed = tag.strip()
            if trimmed and validate_tag(trimmed):
                normalized.append(trimmed)

    return sort_tags(normalized)
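
# Example (illustrative): non-strings and blanks are dropped, the rest sorted.
#   >>> normalize_tags(["  rock ", "", None, "title:X"])
#   ['title:X', 'rock']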


def compute_namespaced_tag_overwrite(
    existing_tags: Sequence[Any],
    incoming_tags: Sequence[Any],
) -> Tuple[List[str], List[str], List[str]]:
    """Compute a tag mutation with namespace overwrite semantics.

    Rules:
    - Incoming namespaced tags ("ns:value") overwrite any existing tags in that namespace.
    - Overwrite is based on namespace match (case-insensitive).
    - Additions are deduped case-insensitively against kept existing tags and within the incoming list.
    - If an existing tag matches an incoming tag exactly, it is kept (no remove/add).

    Returns:
        (tags_to_remove, tags_to_add, merged_tags)

    Notes:
        All tags are normalized to lowercase by the cleaning step below, so the
        returned lists are lowercase. This is intentionally store-agnostic:
        stores decide how to persist/apply the returned mutation (DB merge
        write, Hydrus delete/add, etc.).
    """

    def _clean(values: Sequence[Any]) -> List[str]:
        out: List[str] = []
        for v in values or []:
            if not isinstance(v, str):
                continue
            t = v.strip()
            if t:
                out.append(t.lower())
        return out

    def _ns_of(tag: str) -> str:
        if ":" not in tag:
            return ""
        return tag.split(":", 1)[0].strip().lower()

    existing = _clean(existing_tags)
    incoming = _clean(incoming_tags)
    if not incoming:
        return [], [], existing

    namespaces_to_replace: Set[str] = set()
    for t in incoming:
        ns = _ns_of(t)
        if ns:
            namespaces_to_replace.add(ns)

    kept_existing: List[str] = []
    kept_existing_lower: Set[str] = set()
    tags_to_remove: List[str] = []
    for t in existing:
        ns = _ns_of(t)
        if ns and ns in namespaces_to_replace:
            # Keep an exact match; otherwise remove it so the incoming
            # namespaced tag replaces it.
            if t in incoming:
                kept_existing.append(t)
                kept_existing_lower.add(t.lower())
            else:
                tags_to_remove.append(t)
            continue
        kept_existing.append(t)
        kept_existing_lower.add(t.lower())

    tags_to_add: List[str] = []
    added_lower: Set[str] = set()
    for t in incoming:
        tl = t.lower()
        if tl in kept_existing_lower:
            continue
        if tl in added_lower:
            continue
        tags_to_add.append(t)
        added_lower.add(tl)

    merged = kept_existing + tags_to_add
    return tags_to_remove, tags_to_add, merged
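
# Worked example (illustrative; note that _clean lowercases everything):
#   >>> compute_namespaced_tag_overwrite(
#   ...     ["creator:Old Name", "book"], ["creator:New Name"]
#   ... )
#   (['creator:old name'], ['creator:new name'], ['book', 'creator:new name'])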


def merge_tag_lists(*tag_lists: List[str]) -> List[str]:
    """
    Merge multiple tag lists, removing duplicates.

    Args:
        *tag_lists: Variable number of tag lists

    Returns:
        Merged, deduplicated, sorted list
    """
    merged = set()
    for tag_list in tag_lists:
        if isinstance(tag_list, list):
            merged.update(tag_list)

    return sort_tags(list(merged))
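
# Example (illustrative):
#   >>> merge_tag_lists(["a", "title:x"], ["a", "b"])
#   ['title:x', 'a', 'b']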


def tag_diff(old_tags: List[str], new_tags: List[str]) -> Dict[str, List[str]]:
    """
    Calculate the difference between two tag lists.

    Args:
        old_tags: Original tags
        new_tags: New tags

    Returns:
        Dict with 'added' and 'removed' keys
    """
    old_set = set(old_tags) if old_tags else set()
    new_set = set(new_tags) if new_tags else set()

    return {
        "added": sorted(new_set - old_set),
        "removed": sorted(old_set - new_set)
    }
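
# Example (illustrative):
#   >>> tag_diff(["a", "b"], ["b", "c"])
#   {'added': ['c'], 'removed': ['a']}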


def expand_tag_lists(tags_set: Set[str]) -> Set[str]:
    """Expand tag list references like {psychology} to actual tags from adjective.json.

    Removes the reference after expansion (e.g., {psychology} is deleted, psychology tags added).

    Args:
        tags_set: Set of tag strings that may include {list_name} references

    Returns:
        Set of expanded tags with all {list_name} references replaced with actual tags
    """
    # Load adjective.json from the directory containing this module
    adjective_path = Path(__file__).parent / "adjective.json"
    if not adjective_path.exists():
        debug(f"adjective.json not found at {adjective_path}")
        return tags_set

    try:
        with open(adjective_path, "r") as f:
            adjective_lists = json.load(f)
    except Exception as e:
        debug(f"Error loading adjective.json: {e}")
        return tags_set

    expanded_tags = set()
    for tag in tags_set:
        # Check if tag is a list reference like {psychology}
        if tag.startswith("{") and tag.endswith("}"):
            list_name = tag[1:-1].lower()  # Extract name, make lowercase

            # Find matching list (case-insensitive)
            matched_list = None
            for key in adjective_lists.keys():
                if key.lower() == list_name:
                    matched_list = adjective_lists[key]
                    break

            if matched_list:
                # Add all tags from the list
                expanded_tags.update(matched_list)
                debug(f"Expanded {tag} to {len(matched_list)} tags")
            else:
                # List not found; log but don't keep the unresolved reference
                debug(f"Tag list '{list_name}' not found in adjective.json")
        else:
            # Regular tag, keep as is
            expanded_tags.add(tag)

    return expanded_tags


def process_tags_from_string(tags_str: str, expand_lists: bool = False) -> Set[str]:
    """Process a tag string into a set of tags.

    Handles:
    - Multiple formats: comma-separated, newline-separated, space-separated
    - Tag list expansion: {psychology} -> psychology tags (if expand_lists=True)
    - Whitespace trimming

    Args:
        tags_str: Raw tag string
        expand_lists: If True, expand {list_name} references using adjective.json

    Returns:
        Set of processed tags
    """
    if not tags_str:
        return set()

    # Detect the delimiter: prefer newlines, then commas, then spaces
    if "\n" in tags_str:
        delimiter = "\n"
    elif "," in tags_str:
        delimiter = ","
    else:
        delimiter = " "

    # Split and clean tags
    tags_set = set()
    for tag in tags_str.split(delimiter):
        tag = tag.strip()
        if tag:
            tags_set.add(tag)

    # Expand list references if requested
    if expand_lists:
        tags_set = expand_tag_lists(tags_set)

    return tags_set
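
# Example (illustrative): a comma wins over spaces, so multi-word tags survive.
#   >>> sorted(process_tags_from_string("rock, indie rock, live"))
#   ['indie rock', 'live', 'rock']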


def build_book_tags(
    *,
    title: Optional[str] = None,
    author: Optional[str] = None,
    isbn: Optional[str] = None,
    year: Optional[str] = None,
    source: Optional[str] = None,
    extra: Optional[Sequence[str]] = None,
) -> List[str]:
    """Build consistent book tags for downloads (LibGen, OpenLibrary, etc.)."""
    tags: List[str] = ["book"]

    def _add(tag: Optional[str]) -> None:
        if tag and isinstance(tag, str) and tag.strip():
            tags.append(tag.strip())

    _add(source)
    if title:
        _add(f"title:{title}")
    if author:
        _add(f"author:{author}")
    if isbn:
        _add(f"isbn:{isbn}")
    if year:
        _add(f"year:{year}")
    if extra:
        for tag in extra:
            _add(tag)

    # Deduplicate while preserving order
    deduped = list(dict.fromkeys(tags))
    return deduped
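
# Example (illustrative): "book" always leads, then source, then namespaced fields.
#   >>> build_book_tags(title="Dune", author="Frank Herbert", source="libgen")
#   ['book', 'libgen', 'title:Dune', 'author:Frank Herbert']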


def enrich_playlist_entries(entries: list, extractor: str) -> list:
    """Enrich playlist entries with full metadata by fetching individual entry info.

    When extract_flat is used, entries contain minimal info (title, id, url).
    This function fetches full metadata for each entry.

    Args:
        entries: List of entry dicts from probe_url
        extractor: Extractor name

    Returns:
        List of enriched entry dicts
    """
    # Import here to avoid circular dependency
    from SYS.download import is_url_supported_by_ytdlp

    if not entries:
        return entries

    enriched = []
    for entry in entries:
        # If entry has a direct URL, fetch its full metadata
        entry_url = entry.get("url")
        if entry_url and is_url_supported_by_ytdlp(entry_url):
            try:
                import yt_dlp

                ydl_opts: Any = {
                    "quiet": True,
                    "no_warnings": True,
                    "skip_download": True,
                    "noprogress": True,
                    "socket_timeout": 5,
                    "retries": 1,
                }
                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                    full_info = ydl.extract_info(entry_url, download=False)
                    if full_info:
                        enriched.append(full_info)
                        continue
            except Exception:
                pass

        # Fall back to the original entry if the fetch failed
        enriched.append(entry)

    return enriched


def format_playlist_entry(entry: Dict[str, Any], index: int, extractor: str) -> Dict[str, Any]:
    """Format a playlist entry for display in result table.

    Args:
        entry: Single playlist entry from yt-dlp (fully enriched if possible)
        index: 1-based track number
        extractor: Extractor name (youtube, bandcamp, spotify, etc.)

    Returns:
        Dict with displayable fields for result table
    """
    result = {
        "index": index,
        "title": entry.get("title", "Unknown"),
        "duration": entry.get("duration") or entry.get("length") or 0,
        "uploader": entry.get("uploader") or entry.get("creator") or "",
        "artist": entry.get("artist") or entry.get("uploader") or entry.get("creator") or "",
        "album": entry.get("album") or "",
        "track_number": entry.get("track_number") or index,
    }

    # Normalize extractor for comparison
    ext_lower = extractor.lower().replace(":", "").replace(" ", "")

    # Add site-specific fields
    if "youtube" in ext_lower:
        result["video_id"] = entry.get("id", "")
        result["channel"] = entry.get("uploader") or entry.get("channel", "")
        result["views"] = entry.get("view_count", 0)

    elif "bandcamp" in ext_lower:
        result["track_number"] = entry.get("track_number") or index
        # For Bandcamp album entries, track info may be in different fields
        result["artist"] = entry.get("artist") or entry.get("uploader", "")
        result["album"] = entry.get("album") or ""

    elif "spotify" in ext_lower:
        result["artists"] = entry.get("creator") or entry.get("uploader", "")
        result["album"] = entry.get("album", "")
        result["release_date"] = entry.get("release_date", "")

    return result
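
# Example (illustrative): YouTube entries gain a video_id field.
#   >>> format_playlist_entry({"title": "Intro", "duration": 61, "id": "abc"}, 1, "youtube")["video_id"]
#   'abc'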


# ============================================================================
# Metadata helper functions for tag processing and scraping
# ============================================================================


def extract_title_from_tags(tags_list: List[str]) -> Optional[str]:
    """Extract title from tags list."""
    try:
        extracted = extract_title(tags_list)
        if extracted:
            return extracted
    except Exception:
        pass

    for t in tags_list:
        if isinstance(t, str) and t.lower().startswith("title:"):
            val = t.split(":", 1)[1].strip()
            if val:
                return val
    return None


def summarize_tags(tags_list: List[str], limit: int = 8) -> str:
"""Create a summary of tags for display."""
shown = [t for t in tags_list[:limit] if t]
summary = ", ".join(shown)
remaining = max(0, len(tags_list) - len(shown))
if remaining > 0:
summary = f"{summary} (+{remaining} more)" if summary else f"(+{remaining} more)"
if len(summary) > 200:
summary = summary[:197] + "..."
return summary
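
# Example (illustrative): tags beyond the limit are collapsed into a count.
#   >>> summarize_tags(["a", "b", "c"], limit=2)
#   'a, b (+1 more)'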


def extract_scrapable_identifiers(tags_list: List[str]) -> Dict[str, str]:
    """Extract scrapable identifiers from tags."""
    identifiers = {}
    scrapable_prefixes = {
        "openlibrary",
        "isbn",
        "isbn_10",
        "isbn_13",
        "musicbrainz",
        "musicbrainzalbum",
        "imdb",
        "tmdb",
        "tvdb",
    }

    for tag in tags_list:
        if not isinstance(tag, str) or ":" not in tag:
            continue
        parts = tag.split(":", 1)
        if len(parts) != 2:
            continue

        key_raw = parts[0].strip().lower()
        key = key_raw.replace("-", "_")
        if key == "isbn10":
            key = "isbn_10"
        elif key == "isbn13":
            key = "isbn_13"
        value = parts[1].strip()

        # Normalize ISBN values by removing hyphens for API friendliness
        if key.startswith("isbn"):
            value = value.replace("-", "")
        if key in scrapable_prefixes and value:
            identifiers[key] = value

    return identifiers
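
# Example (illustrative): hyphenated namespaces and ISBN values are normalized.
#   >>> extract_scrapable_identifiers(["isbn-13:978-0-441-01359-3", "title:Dune"])
#   {'isbn_13': '9780441013593'}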


def extract_tag_value(tags_list: List[str], namespace: str) -> Optional[str]:
    """Get first tag value for a namespace (e.g., artist:, title:)."""
    ns = namespace.lower()
    for tag in tags_list:
        if not isinstance(tag, str) or ":" not in tag:
            continue
        prefix, _, value = tag.partition(":")
        if prefix.strip().lower() != ns:
            continue
        candidate = value.strip()
        if candidate:
            return candidate
    return None


def scrape_url_metadata(
    url: str,
) -> Tuple[Optional[str], List[str], List[Tuple[str, str]], List[Dict[str, Any]]]:
    """Scrape metadata from a URL using yt-dlp.

    Returns:
        (title, tags, formats, playlist_items) tuple where:
        - title: Video/content title
        - tags: List of extracted tags (both namespaced and freeform)
        - formats: List of (display_label, format_id) tuples
        - playlist_items: List of playlist entry dicts (empty if not a playlist)
    """
    try:
        import json as json_module

        try:
            from metadata import extract_ytdlp_tags
        except ImportError:
            extract_ytdlp_tags = None

        # Build yt-dlp command with playlist support.
        # IMPORTANT: Do NOT use --flat-playlist here! It strips metadata like
        # artist, album, uploader, and genre. Without it, yt-dlp returns full
        # metadata in an 'entries' array within a single JSON object, so we get
        # album-level metadata from sources like Bandcamp, YouTube Music, etc.
        cmd = [
            "yt-dlp",
            "-j",  # Output JSON
            "--no-warnings",
            "--playlist-items",
            "1-10",  # Get first 10 items if it's a playlist (provides entries)
            "-f",
            "best",
            url,
        ]

        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)

        if result.returncode != 0:
            log(f"yt-dlp error: {result.stderr}", file=sys.stderr)
            return None, [], [], []

        # Parse JSON output. Without --flat-playlist we get ONE JSON object
        # with an 'entries' array rather than one flat object per line.
        lines = result.stdout.strip().split("\n")
        if not lines or not lines[0]:
            log("yt-dlp returned empty output", file=sys.stderr)
            return None, [], [], []

        # Parse the single JSON object
        try:
            data = json_module.loads(lines[0])
        except json_module.JSONDecodeError as e:
            log(f"Failed to parse yt-dlp JSON: {e}", file=sys.stderr)
            return None, [], [], []

        # Extract title - use the main title
        title = data.get("title", "Unknown")

        # Extract tags and playlist items
        tags = []
        playlist_items = []

        # IMPORTANT: Extract album/playlist-level tags FIRST (before processing entries)
        # This ensures we get metadata about the collection, not just individual tracks
        if extract_ytdlp_tags:
            album_tags = extract_ytdlp_tags(data)
            tags.extend(album_tags)

        # Case 1: Entries are nested in the main object (standard playlist structure)
        if "entries" in data and isinstance(data.get("entries"), list):
            entries = data["entries"]
            # Build playlist items with title and duration
            for idx, entry in enumerate(entries, 1):
                if isinstance(entry, dict):
                    item_title = entry.get("title", entry.get("id", f"Track {idx}"))
                    item_duration = entry.get("duration", 0)
                    playlist_items.append(
                        {
                            "index": idx,
                            "id": entry.get("id", f"track_{idx}"),
                            "title": item_title,
                            "duration": item_duration,
                            "url": entry.get("url") or entry.get("webpage_url", ""),
                        }
                    )

                    # Extract tags from each entry and merge, without duplicating
                    # album-level tags: single-value namespaces (title:, artist:,
                    # etc.) keep the album-level value.
                    if extract_ytdlp_tags:
                        entry_tags = extract_ytdlp_tags(entry)

                        # Single-value namespaces that should not be duplicated from entries
                        single_value_namespaces = {
                            "title",
                            "artist",
                            "album",
                            "creator",
                            "channel",
                            "release_date",
                            "upload_date",
                            "license",
                            "location",
                        }
                        for tag in entry_tags:
                            # Extract the namespace (part before the colon)
                            tag_namespace = tag.split(":", 1)[0].lower() if ":" in tag else None

                            # Skip if this namespace already exists in tags (from album level)
                            if tag_namespace and tag_namespace in single_value_namespaces:
                                already_has_namespace = any(
                                    t.split(":", 1)[0].lower() == tag_namespace
                                    for t in tags
                                    if ":" in t
                                )
                                if already_has_namespace:
                                    continue  # Skip this tag, keep the album-level one

                            if tag not in tags:  # Avoid exact duplicates
                                tags.append(tag)
        # Case 2: Playlist detected by playlist_count field (Bandcamp albums, etc.)
        # These need a separate call with --flat-playlist to get the actual entries
        elif (data.get("playlist_count") or 0) > 0 and "entries" not in data:
            try:
                # Make a second call with --flat-playlist to get the actual tracks
                flat_cmd = [
                    "yt-dlp",
                    "-j",
                    "--no-warnings",
                    "--flat-playlist",
                    "-f",
                    "best",
                    url
                ]
                flat_result = subprocess.run(
                    flat_cmd,
                    capture_output=True,
                    text=True,
                    timeout=30
                )
                if flat_result.returncode == 0:
                    flat_lines = flat_result.stdout.strip().split("\n")
                    # With --flat-playlist, each line is a separate track JSON object
                    # (not nested in a playlist container), so process ALL lines
                    for idx, line in enumerate(flat_lines, 1):
                        if line.strip().startswith("{"):
                            try:
                                entry = json_module.loads(line)
                                item_title = entry.get("title", entry.get("id", f"Track {idx}"))
                                item_duration = entry.get("duration", 0)
                                playlist_items.append(
                                    {
                                        "index": idx,
                                        "id": entry.get("id", f"track_{idx}"),
                                        "title": item_title,
                                        "duration": item_duration,
                                        "url": entry.get("url") or entry.get("webpage_url", ""),
                                    }
                                )
                            except json_module.JSONDecodeError:
                                pass
            except Exception:
                pass  # Silently ignore if we can't get playlist entries

        # Fallback: if still no tags detected, get from first item
        if not tags and extract_ytdlp_tags:
            tags = extract_ytdlp_tags(data)
        # Extract formats from the main data object
        formats = []
        if "formats" in data:
            formats = extract_url_formats(data.get("formats", []))

        # Deduplicate tags by namespace to prevent duplicate title:, artist:, etc.
        try:
            if dedup_tags_by_namespace:
                tags = dedup_tags_by_namespace(tags, keep_first=True)
        except Exception:
            pass  # If dedup fails, return tags as-is

        return title, tags, formats, playlist_items
    except subprocess.TimeoutExpired:
        log("yt-dlp timeout (>30s)", file=sys.stderr)
        return None, [], [], []
    except Exception as e:
        log(f"URL scraping error: {e}", file=sys.stderr)
        return None, [], [], []
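
# Illustrative usage (assumes yt-dlp is on PATH; the URL is hypothetical):
#   title, tags, formats, items = scrape_url_metadata("https://example.com/watch?v=xyz")
#   if title is None:
#       ...  # scraping failed; all lists come back empty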


def extract_url_formats(formats: list) -> List[Tuple[str, str]]:
    """Extract best formats from yt-dlp formats list.

    Returns list of (display_label, format_id) tuples.
    """
    try:
        video_formats = {}  # {resolution: format_data}
        audio_formats = {}  # {quality_label: format_data}

        for fmt in formats:
            vcodec = fmt.get("vcodec", "none")
            acodec = fmt.get("acodec", "none")
            height = fmt.get("height")
            ext = fmt.get("ext", "unknown")
            format_id = fmt.get("format_id", "")
            tbr = fmt.get("tbr", 0)
            abr = fmt.get("abr", 0)

            # Video format: keep the highest-bitrate entry per resolution, 480p and up
            if vcodec and vcodec != "none" and height:
                if height < 480:
                    continue
                res_key = f"{height}p"
                if res_key not in video_formats or tbr > video_formats[res_key].get("tbr", 0):
                    video_formats[res_key] = {
                        "label": f"{height}p ({ext})",
                        "format_id": format_id,
                        "tbr": tbr,
                    }

            # Audio-only format
            elif acodec and acodec != "none" and (not vcodec or vcodec == "none"):
                audio_key = f"audio_{abr}"
                if audio_key not in audio_formats or abr > audio_formats[audio_key].get("abr", 0):
                    audio_formats[audio_key] = {
                        "label": f"audio ({ext})",
                        "format_id": format_id,
                        "abr": abr,
                    }

        result = []

        # Add video formats in descending resolution order
        for res in sorted(video_formats.keys(), key=lambda x: int(x.replace("p", "")), reverse=True):
            fmt = video_formats[res]
            result.append((fmt["label"], fmt["format_id"]))

        # Add best audio format
        if audio_formats:
            best_audio = max(audio_formats.values(), key=lambda x: x.get("abr", 0))
            result.append((best_audio["label"], best_audio["format_id"]))
        return result

    except Exception as e:
        log(f"Error extracting formats: {e}", file=sys.stderr)
        return []
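
# Example (illustrative): one 720p video stream plus the best audio-only stream.
#   >>> extract_url_formats([
#   ...     {"vcodec": "avc1", "acodec": "none", "height": 720,
#   ...      "ext": "mp4", "format_id": "136", "tbr": 1200},
#   ...     {"vcodec": "none", "acodec": "opus",
#   ...      "ext": "webm", "format_id": "251", "abr": 160},
#   ... ])
#   [('720p (mp4)', '136'), ('audio (webm)', '251')]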