Files
Medios-Macina/cmdlet/merge_file.py

1050 lines
39 KiB
Python
Raw Normal View History

2025-11-25 20:09:33 -08:00
"""Merge multiple files into a single output file."""
2025-12-29 17:05:03 -08:00
2025-11-25 20:09:33 -08:00
from __future__ import annotations
from typing import Any, Dict, Optional, Sequence, List
from pathlib import Path
import sys
2025-12-11 19:04:02 -08:00
from SYS.logger import log
2025-11-25 20:09:33 -08:00
import subprocess as _subprocess
import shutil as _shutil
2025-12-13 00:18:30 -08:00
import re as _re
from config import resolve_output_dir
2025-12-12 21:55:38 -08:00
2025-12-16 23:23:43 -08:00
from . import _shared as sh
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
2025-12-27 14:50:59 -08:00
SharedArgs = sh.SharedArgs
2025-12-16 23:23:43 -08:00
create_pipe_object_result = sh.create_pipe_object_result
get_field = sh.get_field
get_pipe_object_hash = sh.get_pipe_object_hash
get_pipe_object_path = sh.get_pipe_object_path
normalize_result_input = sh.normalize_result_input
parse_cmdlet_args = sh.parse_cmdlet_args
should_show_help = sh.should_show_help
2025-12-12 21:55:38 -08:00
import pipeline as ctx
2025-11-25 20:09:33 -08:00
# Optional dependency: pypdf is needed only for PDF merging.  When it is not
# installed we keep stub names bound so the module still imports, and the PDF
# merge path can detect the absence via HAS_PYPDF.
try:
    from pypdf import PdfWriter, PdfReader

    HAS_PYPDF = True
except ImportError:
    HAS_PYPDF = False
    PdfWriter = None
    PdfReader = None
# Optional dependency: project-local metadata helpers.  On ImportError we
# install no-op fallbacks so tag handling degrades gracefully instead of
# crashing at call sites.
# NOTE(review): the try-branch imports only read_tags_from_file and
# merge_multiple_tag_lists, while the except-branch also defines
# write_tags_to_file / dedup_tags_by_namespace / write_metadata.  When the
# import succeeds those extra names are never bound — confirm nothing in this
# module calls them with HAS_METADATA_API True.
try:
    from metadata import (
        read_tags_from_file,
        merge_multiple_tag_lists,
    )

    HAS_METADATA_API = True
except ImportError:
    HAS_METADATA_API = False

    def read_tags_from_file(file_path: Path) -> List[str]:
        # Fallback: no sidecar-tag support without the metadata module.
        return []

    def write_tags_to_file(
        file_path: Path,
        tags: List[str],
        source_hashes: Optional[List[str]] = None,
        url: Optional[List[str]] = None,
        append: bool = False,
    ) -> bool:
        # Fallback: report failure so callers skip sidecar writing.
        return False

    def dedup_tags_by_namespace(tags: List[str]) -> List[str]:
        # Fallback: pass tags through unchanged.
        return tags

    def merge_multiple_tag_lists(sources: List[List[str]], strategy: str = "first") -> List[str]:
        # Fallback: order-preserving de-duplication across all source lists.
        # The `strategy` argument is accepted for signature compatibility but
        # ignored here.
        out: List[str] = []
        seen: set[str] = set()
        for src in sources:
            for t in src or []:
                s = str(t)
                if s and s not in seen:
                    out.append(s)
                    seen.add(s)
        return out

    def write_metadata(*_args: Any, **_kwargs: Any) -> None:
        # Fallback: embedding metadata is a no-op.
        return None
2025-11-25 20:09:33 -08:00
def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
    """Merge multiple files into one.

    Pipeline behavior:
      * no piped items  -> error (returns 1)
      * one piped item  -> emitted unchanged (merge needs 2+ inputs)
      * 2+ piped items  -> merged into a single output file; the result is
        emitted as a PipeObject-compatible dict (store="PATH") so it can be
        piped to the next cmdlet.

    URL-only items (e.g. playlist selections) are downloaded first via
    Add_File before the path-based merge runs.  Returns 0 on success.
    """
    # Parse help
    if should_show_help(args):
        log(f"Cmdlet: {CMDLET.name}\nSummary: {CMDLET.summary}\nUsage: {CMDLET.usage}")
        return 0

    # Parse arguments
    parsed = parse_cmdlet_args(args, CMDLET)
    delete_after = parsed.get("delete", False)

    # -path may be a directory (output name is derived) or a full file path.
    output_override: Optional[Path] = None
    output_arg = parsed.get("path")
    if output_arg:
        try:
            output_override = Path(str(output_arg)).expanduser()
        except Exception:
            output_override = None

    format_spec = parsed.get("format")
    if format_spec:
        format_spec = str(format_spec).lower().strip()

    # Collect files from piped results
    # Use normalize_result_input to handle both single items and lists
    files_to_merge: List[Dict[str, Any]] = normalize_result_input(result)

    if not files_to_merge:
        log("No files provided to merge", file=sys.stderr)
        return 1

    if len(files_to_merge) < 2:
        # Only 1 file - pass it through unchanged
        # (merge only happens when multiple files are collected)
        item = files_to_merge[0]
        ctx.emit(item)
        return 0

    def _resolve_existing_path(item: Dict[str, Any]) -> Optional[Path]:
        # Resolve a piped item to an existing local file, or None if the item
        # has no usable path (e.g. a URL-only selection).
        raw_path = get_pipe_object_path(item)
        target_path: Optional[Path] = None
        if isinstance(raw_path, Path):
            target_path = raw_path
        elif isinstance(raw_path, str) and raw_path.strip():
            candidate = Path(raw_path).expanduser()
            if candidate.exists():
                target_path = candidate
        if target_path and target_path.exists():
            return target_path
        return None

    def _extract_url(item: Dict[str, Any]) -> Optional[str]:
        # Return the item's http(s) URL (checked in "url" then "target"),
        # or None when neither field holds a web URL.
        u = get_field(item, "url") or get_field(item, "target")
        if isinstance(u, str):
            s = u.strip()
            if s.lower().startswith(("http://", "https://")):
                return s
        return None

    # If the user piped URL-only playlist selections (no local paths yet), download first.
    # This keeps the pipeline order intuitive:
    #   @* | merge-file | add-file -store ...
    urls_to_download: List[str] = []
    for it in files_to_merge:
        if _resolve_existing_path(it) is not None:
            continue
        u = _extract_url(it)
        if u:
            urls_to_download.append(u)
    if urls_to_download and len(urls_to_download) >= 2:
        try:
            # Compute a batch hint (audio vs video + single-format id) once.
            mode_hint: Optional[str] = None
            forced_format: Optional[str] = None
            try:
                from cmdlet.download_media import list_formats
                from tool.ytdlp import YtDlpTool

                # Probe only the first URL; the rest of the batch is assumed
                # to offer the same formats.
                sample_url = urls_to_download[0]
                cookiefile = None
                try:
                    cookie_path = YtDlpTool(config).resolve_cookiefile()
                    if cookie_path is not None and cookie_path.is_file():
                        cookiefile = str(cookie_path)
                except Exception:
                    cookiefile = None

                fmts = list_formats(
                    sample_url, no_playlist=False, playlist_items=None, cookiefile=cookiefile
                )
                if isinstance(fmts, list) and fmts:
                    # Any format with a real vcodec means this is video content.
                    has_video = False
                    for f in fmts:
                        if not isinstance(f, dict):
                            continue
                        vcodec = str(f.get("vcodec", "none") or "none").strip().lower()
                        if vcodec and vcodec != "none":
                            has_video = True
                            break
                    mode_hint = "video" if has_video else "audio"
                    # Exactly one available format -> pin it for the whole batch.
                    if len(fmts) == 1 and isinstance(fmts[0], dict):
                        fid = str(fmts[0].get("format_id") or "").strip()
                        if fid:
                            forced_format = fid
            except Exception:
                mode_hint = None
                forced_format = None

            from cmdlet.add_file import Add_File

            # Replace each URL-only item with the pipe objects produced by
            # downloading it; items with local paths pass through untouched.
            expanded: List[Dict[str, Any]] = []
            downloaded_any = False
            for it in files_to_merge:
                if _resolve_existing_path(it) is not None:
                    expanded.append(it)
                    continue
                u = _extract_url(it)
                if not u:
                    expanded.append(it)
                    continue
                downloaded = Add_File._download_streaming_url_as_pipe_objects(
                    u,
                    config,
                    mode_hint=mode_hint,
                    ytdl_format_hint=forced_format,
                )
                if downloaded:
                    expanded.extend(downloaded)
                    downloaded_any = True
                else:
                    expanded.append(it)
            if downloaded_any:
                files_to_merge = expanded
        except Exception:
            # If downloads fail, we fall back to the existing path-based merge behavior.
            pass

    # Extract file paths and metadata from result objects
    source_files: List[Path] = []
    source_hashes: List[str] = []
    source_url: List[str] = []
    source_tags: List[str] = []  # tags read from .tag sidecars
    source_item_tag_lists: List[List[str]] = []  # tags carried in-memory on piped items
    for item in files_to_merge:
        target_path = _resolve_existing_path(item)

        if target_path and target_path.exists():
            source_files.append(target_path)

            # Track tags carried in the piped items (e.g. add-tag stage) so they survive merge.
            try:
                raw_tags = get_field(item, "tag", [])
                if isinstance(raw_tags, str) and raw_tags.strip():
                    source_item_tag_lists.append([raw_tags.strip()])
                elif isinstance(raw_tags, list):
                    source_item_tag_lists.append(
                        [str(t) for t in raw_tags if t is not None and str(t).strip()]
                    )
            except Exception:
                pass

            # Track tags from the .tag sidecar for this source (if present)
            tags_file = target_path.with_suffix(target_path.suffix + ".tag")
            if tags_file.exists() and HAS_METADATA_API:
                try:
                    source_tags.extend(read_tags_from_file(tags_file) or [])
                except Exception:
                    pass

            # Extract hash if available in item (as fallback)
            hash_value = get_pipe_object_hash(item)
            if hash_value and hash_value not in source_hashes:
                source_hashes.append(str(hash_value))

            # Extract known url if available
            url = get_field(item, "url", [])
            if isinstance(url, str):
                source_url.append(url)
            elif isinstance(url, list):
                source_url.extend(url)
        else:
            title = get_field(item, "title", "unknown") or get_field(item, "id", "unknown")
            log(f"Warning: Could not locate file for item: {title}", file=sys.stderr)

    if len(source_files) < 2:
        log("At least 2 valid files required to merge", file=sys.stderr)
        return 1

    # Detect file types
    file_types = set()
    for f in source_files:
        suffix = f.suffix.lower()
        if suffix in {".mp3", ".flac", ".wav", ".m4a", ".aac", ".ogg", ".opus", ".mka"}:
            file_types.add("audio")
        elif suffix in {
            ".mp4",
            ".mkv",
            ".webm",
            ".mov",
            ".avi",
            ".flv",
            ".mpg",
            ".mpeg",
            ".ts",
            ".m4v",
            ".wmv",
        }:
            file_types.add("video")
        elif suffix in {".pdf"}:
            file_types.add("pdf")
        elif suffix in {".txt", ".srt", ".vtt", ".md", ".log"}:
            file_types.add("text")
        else:
            file_types.add("other")
    # NOTE(review): a mixed set that includes "other" bypasses this guard,
    # and list(file_types)[0] below then picks an arbitrary set element —
    # confirm that is intentional.
    if len(file_types) > 1 and "other" not in file_types:
        log(f"Mixed file types detected: {', '.join(sorted(file_types))}", file=sys.stderr)
        log(f"Can only merge files of the same type", file=sys.stderr)
        return 1

    file_kind = list(file_types)[0] if file_types else "other"
    # Determine output format
    output_format = format_spec or "auto"
    if output_format == "auto":
        if file_kind == "audio":
            output_format = "mka"  # Default audio codec - mka supports chapters and stream copy
        elif file_kind == "video":
            output_format = "mp4"  # Default video codec
        elif file_kind == "pdf":
            output_format = "pdf"
        else:
            output_format = "txt"

    # Determine output path
    if output_override:
        if output_override.is_dir():
            # Directory override: derive the name from the first item's title.
            base_title = get_field(files_to_merge[0], "title", "merged")
            base_name = _sanitize_name(str(base_title or "merged"))
            output_path = output_override / f"{base_name} (merged).{_ext_for_format(output_format)}"
        else:
            output_path = output_override
    else:
        # No override: place next to the configured output dir (fall back to
        # the first source's directory) and name after the first source file.
        first_file = source_files[0]
        try:
            base_dir = resolve_output_dir(config)
        except Exception:
            base_dir = first_file.parent
        output_path = (
            Path(base_dir) / f"{first_file.stem} (merged).{_ext_for_format(output_format)}"
        )

    # Ensure output directory exists
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Perform merge based on file type
    if file_kind == "audio":
        success = _merge_audio(source_files, output_path, output_format)
    elif file_kind == "video":
        success = _merge_video(source_files, output_path, output_format)
    elif file_kind == "pdf":
        success = _merge_pdf(source_files, output_path)
    elif file_kind == "text":
        success = _merge_text(source_files, output_path)
    else:
        log(f"Unsupported file type: {file_kind}", file=sys.stderr)
        return 1

    if not success:
        log("Merge failed", file=sys.stderr)
        return 1

    log(f"Merged {len(source_files)} files into: {output_path}", file=sys.stderr)

    def _title_value_from_tags(tags: List[str]) -> Optional[str]:
        # Return the value of the first "title:" tag in the list, or None.
        for t in tags:
            try:
                s = str(t)
            except Exception:
                continue
            if s.lower().startswith("title:"):
                val = s.split(":", 1)[1].strip()
                return val or None
        return None

    # Determine best title:
    # - prefer a title tag shared across all inputs (typical when user did add-tag title:...)
    # - otherwise fall back to first title tag encountered
    shared_title: Optional[str] = None
    try:
        if source_item_tag_lists:
            per_item_titles: List[Optional[str]] = [
                _title_value_from_tags(tl) for tl in source_item_tag_lists
            ]
            non_empty = [t for t in per_item_titles if t]
            if non_empty:
                candidate = non_empty[0]
                if candidate and all((t == candidate) for t in non_empty):
                    shared_title = candidate
                else:
                    shared_title = non_empty[0]
    except Exception:
        shared_title = None
    merged_title = shared_title or output_path.stem
    # Merge tags from:
    # - in-memory PipeObject tags (from add-tag etc)
    # - .tag sidecars (if present)
    # Keep all unique plain tags, and keep the first value for namespaced tags.
    merged_tags = merge_multiple_tag_lists(
        source_item_tag_lists + ([source_tags] if source_tags else []), strategy="combine"
    )
    # Ensure we always have a title tag (and make sure it's the chosen title)
    merged_tags = [t for t in merged_tags if not str(t).lower().startswith("title:")]
    merged_tags.insert(0, f"title:{merged_title}")

    # Emit a PipeObject-compatible dict so the merged file can be piped to next command
    try:
        from SYS.utils import sha256_file

        merged_hash = sha256_file(output_path)
        merged_item = create_pipe_object_result(
            source="local",
            identifier=output_path.name,
            file_path=str(output_path),
            cmdlet_name="merge-file",
            title=merged_title,
            hash_value=merged_hash,
            tag=merged_tags,
            url=source_url,
            media_kind=file_kind,
            store="PATH",
        )
        # Clear previous results to ensure only the merged file is passed down
        ctx.clear_last_result()
        ctx.emit(merged_item)
    except Exception as e:
        log(f"Warning: Could not emit pipeline item: {e}", file=sys.stderr)
        # Still emit a string representation for feedback
        ctx.emit(f"Merged: {output_path}")

    # Cleanup
    # - Delete source files only when -delete is set.
    if delete_after:
        for f in source_files:
            try:
                # Delete sidecar tags for the source (if any)
                tag_file = f.with_suffix(f.suffix + ".tag")
                if tag_file.exists():
                    try:
                        tag_file.unlink()
                        log(f"Deleted: {tag_file.name}", file=sys.stderr)
                    except Exception as e:
                        log(f"Warning: Could not delete {tag_file.name}: {e}", file=sys.stderr)
            except Exception:
                pass
            try:
                if f.exists():
                    f.unlink()
                    log(f"Deleted: {f.name}", file=sys.stderr)
            except Exception as e:
                log(f"Warning: Could not delete {f.name}: {e}", file=sys.stderr)

    return 0
def _sanitize_name(text: str) -> str:
"""Sanitize filename."""
allowed = []
for ch in text:
allowed.append(ch if (ch.isalnum() or ch in {"-", "_", " ", "."}) else " ")
return (" ".join("".join(allowed).split()) or "merged").strip()
def _ext_for_format(fmt: str) -> str:
"""Get file extension for format."""
format_map = {
2025-12-29 17:05:03 -08:00
"mp3": "mp3",
"m4a": "m4a",
"m4b": "m4b",
"aac": "aac",
"opus": "opus",
"mka": "mka", # Matroska Audio - EXCELLENT chapter support (recommended)
"mkv": "mkv",
"mp4": "mp4",
"webm": "webm",
"pdf": "pdf",
"txt": "txt",
"auto": "mka", # Default - MKA for chapters
2025-11-25 20:09:33 -08:00
}
2025-12-29 17:05:03 -08:00
return format_map.get(fmt.lower(), "mka")
2025-11-25 20:09:33 -08:00
def _merge_audio(files: List[Path], output: Path, output_format: str) -> bool:
    """Merge audio files with chapters based on file boundaries.

    Steps:
      1. ffprobe each input's duration to compute chapter start offsets.
      2. Write a concat-demuxer list plus an FFMETADATA chapter file.
      3. Run ffmpeg's concat demuxer (with progress parsed from stderr).
      4. Re-mux chapters into the container: mkvmerge/ffmpeg for MKA,
         ffmpeg for M4A/M4B/MP4; other formats get no embedded chapters.

    Returns True on success, False when ffmpeg is missing or any step fails.
    """
    import logging

    logger = logging.getLogger(__name__)
    ffmpeg_path = _shutil.which("ffmpeg")
    if not ffmpeg_path:
        log("ffmpeg not found in PATH", file=sys.stderr)
        return False

    try:
        # Step 1: Get duration of each file to calculate chapter timestamps
        chapters = []
        current_time_ms = 0  # running offset = start time of the next chapter

        log(f"Analyzing {len(files)} files for chapter information...", file=sys.stderr)
        logger.info(f"[merge-file] Analyzing files for chapters")

        for file_path in files:
            # Get duration using ffprobe
            try:
                ffprobe_cmd = [
                    "ffprobe",
                    "-v",
                    "error",
                    "-show_entries",
                    "format=duration",
                    "-print_format",
                    "default=noprint_wrappers=1:nokey=1",
                    str(file_path),
                ]
                probe_result = _subprocess.run(
                    ffprobe_cmd, capture_output=True, text=True, timeout=10
                )
                if probe_result.returncode == 0 and probe_result.stdout.strip():
                    try:
                        duration_sec = float(probe_result.stdout.strip())
                    except ValueError:
                        logger.warning(
                            f"[merge-file] Could not parse duration from ffprobe output: {probe_result.stdout}"
                        )
                        duration_sec = 0
                else:
                    logger.warning(
                        f"[merge-file] ffprobe failed for {file_path.name}: {probe_result.stderr}"
                    )
                    duration_sec = 0
            except Exception as e:
                # Probe failures degrade to a zero-length chapter rather than abort.
                logger.warning(f"[merge-file] Could not get duration for {file_path.name}: {e}")
                duration_sec = 0

            # Create chapter entry - use title: tag from metadata if available
            title = file_path.stem  # Default to filename without extension
            if HAS_METADATA_API:
                try:
                    # Try to read tags from .tag sidecar file
                    tags_file = file_path.with_suffix(file_path.suffix + ".tag")
                    if tags_file.exists():
                        tags = read_tags_from_file(tags_file)
                        if tags:
                            # Look for title: tag
                            for tag in tags:
                                if isinstance(tag, str) and tag.lower().startswith("title:"):
                                    # Extract the title value after the colon
                                    title = tag.split(":", 1)[1].strip()
                                    break
                except Exception as e:
                    logger.debug(f"[merge-file] Could not read metadata for {file_path.name}: {e}")
                    pass  # Fall back to filename

            # Convert seconds to HH:MM:SS.mmm format
            hours = int(current_time_ms // 3600000)
            minutes = int((current_time_ms % 3600000) // 60000)
            seconds = int((current_time_ms % 60000) // 1000)
            millis = int(current_time_ms % 1000)

            chapters.append(
                {
                    "time_ms": current_time_ms,
                    "time_str": f"{hours:02d}:{minutes:02d}:{seconds:02d}.{millis:03d}",
                    "title": title,
                    "duration_sec": duration_sec,
                }
            )
            logger.info(
                f"[merge-file] Chapter: {title} @ {chapters[-1]['time_str']} (duration: {duration_sec:.2f}s)"
            )
            current_time_ms += int(duration_sec * 1000)

        # If these came from a playlist/album, titles often look like:
        #   "Book Name - Chapter"
        # If *all* titles share the same "Book Name" prefix, strip it.
        if len(chapters) >= 2:
            split_re = _re.compile(r"^(?P<prefix>.+?)\s+-\s+(?P<chapter>.+)$")
            prefixes: List[str] = []
            stripped_titles: List[str] = []
            all_match = True
            for ch in chapters:
                raw_title = str(ch.get("title") or "").strip()
                m = split_re.match(raw_title)
                if not m:
                    all_match = False
                    break
                prefix = m.group("prefix").strip()
                chapter_title = m.group("chapter").strip()
                if not prefix or not chapter_title:
                    all_match = False
                    break
                # Compare prefixes case-insensitively via casefold.
                prefixes.append(prefix.casefold())
                stripped_titles.append(chapter_title)
            if all_match and prefixes and len(set(prefixes)) == 1:
                for idx, ch in enumerate(chapters):
                    ch["title"] = stripped_titles[idx]
                logger.info(
                    f"[merge-file] Stripped common title prefix for chapters: {prefixes[0]}"
                )

        # Step 2: Create concat demuxer file
        concat_file = output.parent / f".concat_{output.stem}.txt"
        concat_lines = []
        for f in files:
            # Escape quotes in path
            safe_path = str(f).replace("'", "'\\''")
            concat_lines.append(f"file '{safe_path}'")
        concat_file.write_text("\n".join(concat_lines), encoding="utf-8")

        # Step 3: Create FFmpeg metadata file with chapters
        metadata_file = output.parent / f".metadata_{output.stem}.txt"
        metadata_lines = [";FFMETADATA1"]
        for i, chapter in enumerate(chapters):
            # FFMetadata format for chapters (note: [CHAPTER] not [CHAPTER01])
            metadata_lines.append("[CHAPTER]")
            metadata_lines.append("TIMEBASE=1/1000")
            metadata_lines.append(f'START={chapter["time_ms"]}')
            # Calculate end time (start of next chapter or end of file)
            if i < len(chapters) - 1:
                metadata_lines.append(f'END={chapters[i+1]["time_ms"]}')
            else:
                metadata_lines.append(f"END={current_time_ms}")
            metadata_lines.append(f'title={chapter["title"]}')
        metadata_file.write_text("\n".join(metadata_lines), encoding="utf-8")
        log(f"Created chapters metadata file with {len(chapters)} chapters", file=sys.stderr)
        logger.info(f"[merge-file] Created {len(chapters)} chapters")

        # Step 4: Build FFmpeg command to merge and embed chapters
        # Strategy: First merge audio, then add metadata in separate pass
        cmd = [ffmpeg_path, "-y", "-f", "concat", "-safe", "0", "-i", str(concat_file)]
        # Add threading options for speed
        cmd.extend(["-threads", "0"])  # Use all available threads
        # Audio codec selection for first input
        if output_format == "mp3":
            cmd.extend(["-c:a", "libmp3lame", "-q:a", "2"])
        elif output_format in {"m4a", "m4b"}:
            # Use copy if possible (much faster), otherwise re-encode
            # Check if inputs are already AAC/M4A to avoid re-encoding
            # For now, default to copy if format matches, otherwise re-encode
            # But since we are merging potentially different codecs, re-encoding is safer
            # To speed up re-encoding, we can use a faster preset or hardware accel if available
            cmd.extend(["-c:a", "aac", "-b:a", "256k"])  # M4A with better quality
        elif output_format == "aac":
            cmd.extend(["-c:a", "aac", "-b:a", "192k"])
        elif output_format == "opus":
            cmd.extend(["-c:a", "libopus", "-b:a", "128k"])
        elif output_format == "mka":
            # FLAC is fast to encode but large. Copy is fastest if inputs are compatible.
            # If we want speed, copy is best. If we want compatibility, re-encode.
            # Let's try copy first if inputs are same format, but that's hard to detect here.
            # Defaulting to copy for MKA as it's a container that supports many codecs
            cmd.extend(["-c:a", "copy"])
        else:
            cmd.extend(["-c:a", "copy"])  # Copy without re-encoding
        # Add the output file
        cmd.append(str(output))

        log(f"Merging {len(files)} audio files to {output_format}...", file=sys.stderr)
        logger.info(f"[merge-file] Running ffmpeg merge: {' '.join(cmd)}")

        # Run ffmpeg with progress monitoring
        try:
            from SYS.progress import print_progress, print_final_progress
            import re

            process = _subprocess.Popen(
                cmd,
                stdout=_subprocess.PIPE,
                stderr=_subprocess.PIPE,
                text=True,
                encoding="utf-8",
                errors="replace",
            )

            # Monitor progress
            duration_re = re.compile(r"time=(\d{2}):(\d{2}):(\d{2})\.(\d{2})")
            total_duration_sec = current_time_ms / 1000.0

            while True:
                # Read stderr line by line (ffmpeg writes progress to stderr)
                if process.stderr:
                    line = process.stderr.readline()
                    if not line and process.poll() is not None:
                        break

                    if line:
                        # Parse time=HH:MM:SS.mm
                        match = duration_re.search(line)
                        if match and total_duration_sec > 0:
                            h, m, s, cs = map(int, match.groups())
                            current_sec = h * 3600 + m * 60 + s + cs / 100.0

                            # Calculate speed/bitrate if available (optional)
                            # For now just show percentage
                            print_progress(
                                output.name,
                                int(current_sec * 1000),  # Use ms as "bytes" for progress bar
                                int(total_duration_sec * 1000),
                                speed=0,
                            )
                else:
                    break

            # Wait for completion
            stdout, stderr = process.communicate()

            if process.returncode != 0:
                log(f"FFmpeg error: {stderr}", file=sys.stderr)
                raise _subprocess.CalledProcessError(
                    process.returncode, cmd, output=stdout, stderr=stderr
                )
            print_final_progress(output.name, int(total_duration_sec * 1000), 0)

        except Exception as e:
            logger.exception(f"[merge-file] ffmpeg process error: {e}")
            raise

        log(f"Merge successful, adding chapters metadata...", file=sys.stderr)

        # Step 5: Embed chapters into container (MKA, MP4/M4A, or note limitation)
        if output_format == "mka" or output.suffix.lower() == ".mka":
            # MKA/MKV format has native chapter support via FFMetadata
            # Re-mux the file with chapters embedded (copy streams, no re-encode)
            log(f"Embedding chapters into Matroska container...", file=sys.stderr)
            logger.info(f"[merge-file] Adding chapters to MKA file via FFMetadata")

            temp_output = output.parent / f".temp_{output.stem}.mka"

            # Use mkvmerge if available (best for MKA chapters), otherwise fall back to ffmpeg
            mkvmerge_path = _shutil.which("mkvmerge")
            if mkvmerge_path:
                # mkvmerge is the best tool for embedding chapters in Matroska files
                # NOTE(review): metadata_file is in FFMETADATA syntax, but
                # mkvmerge's --chapters expects its own simple/XML chapter
                # formats — verify mkvmerge actually accepts this file.
                log(f"Using mkvmerge for optimal chapter embedding...", file=sys.stderr)
                cmd2 = [
                    mkvmerge_path,
                    "-o",
                    str(temp_output),
                    "--chapters",
                    str(metadata_file),
                    str(output),
                ]
            else:
                # Fallback to ffmpeg with proper chapter embedding for Matroska
                log(
                    f"Using ffmpeg for chapter embedding (install mkvtoolnix for better quality)...",
                    file=sys.stderr,
                )
                # For Matroska files, the metadata must be provided via -f ffmetadata input
                cmd2 = [
                    ffmpeg_path,
                    "-y",
                    "-i",
                    str(output),  # Input: merged audio
                    "-i",
                    str(metadata_file),  # Input: FFMetadata file
                    "-c:a",
                    "copy",  # Copy audio without re-encoding
                    "-threads",
                    "0",  # Use all threads
                    "-map",
                    "0",  # Map all from first input
                    "-map_chapters",
                    "1",  # Map CHAPTERS from second input (FFMetadata)
                    str(temp_output),  # Output
                ]

            logger.info(f"[merge-file] Running chapter embedding: {' '.join(cmd2)}")

            try:
                # Run chapter embedding silently (progress handled by worker thread)
                _subprocess.run(
                    cmd2,
                    capture_output=True,
                    text=True,
                    stdin=_subprocess.DEVNULL,
                    timeout=600,
                    check=False,
                )

                # Replace original with temp if successful
                if temp_output.exists() and temp_output.stat().st_size > 0:
                    try:
                        import shutil

                        if output.exists():
                            output.unlink()
                        shutil.move(str(temp_output), str(output))
                        log(f"✓ Chapters successfully embedded!", file=sys.stderr)
                        logger.info(f"[merge-file] Chapters embedded successfully")
                    except Exception as e:
                        logger.warning(f"[merge-file] Could not replace file: {e}")
                        log(
                            f"Warning: Could not embed chapters, using merge without chapters",
                            file=sys.stderr,
                        )
                        try:
                            temp_output.unlink()
                        except Exception:
                            pass
                else:
                    logger.warning(f"[merge-file] Chapter embedding did not create output")
            except Exception as e:
                logger.exception(f"[merge-file] Chapter embedding failed: {e}")
                log(
                    f"Warning: Chapter embedding failed, using merge without chapters",
                    file=sys.stderr,
                )
        elif output_format in {"m4a", "m4b"} or output.suffix.lower() in [".m4a", ".m4b", ".mp4"]:
            # MP4/M4A format has native chapter support via iTunes metadata atoms
            log(f"Embedding chapters into MP4 container...", file=sys.stderr)
            logger.info(f"[merge-file] Adding chapters to M4A/MP4 file via iTunes metadata")

            temp_output = output.parent / f".temp_{output.stem}{output.suffix}"

            # ffmpeg embeds chapters in MP4 using -map_metadata and -map_chapters
            log(f"Using ffmpeg for MP4 chapter embedding...", file=sys.stderr)
            cmd2 = [
                ffmpeg_path,
                "-y",
                "-i",
                str(output),  # Input: merged audio
                "-i",
                str(metadata_file),  # Input: FFMetadata file
                "-c:a",
                "copy",  # Copy audio without re-encoding
                "-threads",
                "0",  # Use all threads
                "-map",
                "0",  # Map all from first input
                "-map_metadata",
                "1",  # Map metadata from second input (FFMetadata)
                "-map_chapters",
                "1",  # Map CHAPTERS from second input (FFMetadata)
                str(temp_output),  # Output
            ]

            logger.info(f"[merge-file] Running MP4 chapter embedding: {' '.join(cmd2)}")

            try:
                # Run MP4 chapter embedding silently (progress handled by worker thread)
                _subprocess.run(
                    cmd2,
                    capture_output=True,
                    text=True,
                    stdin=_subprocess.DEVNULL,
                    timeout=600,
                    check=False,
                )

                # Replace original with temp if successful
                if temp_output.exists() and temp_output.stat().st_size > 0:
                    try:
                        import shutil

                        if output.exists():
                            output.unlink()
                        shutil.move(str(temp_output), str(output))
                        log(f"✓ Chapters successfully embedded in MP4!", file=sys.stderr)
                        logger.info(f"[merge-file] MP4 chapters embedded successfully")
                    except Exception as e:
                        logger.warning(f"[merge-file] Could not replace file: {e}")
                        log(
                            f"Warning: Could not embed chapters, using merge without chapters",
                            file=sys.stderr,
                        )
                        try:
                            temp_output.unlink()
                        except Exception:
                            pass
                else:
                    logger.warning(f"[merge-file] MP4 chapter embedding did not create output")
            except Exception as e:
                logger.exception(f"[merge-file] MP4 chapter embedding failed: {e}")
                log(
                    f"Warning: MP4 chapter embedding failed, using merge without chapters",
                    file=sys.stderr,
                )
        else:
            # For other formats, chapters would require external tools
            logger.info(f"[merge-file] Format {output_format} does not have native chapter support")
            log(f"Note: For chapter support, use MKA or M4A format", file=sys.stderr)

        # Clean up temp files
        try:
            concat_file.unlink()
        except Exception:
            pass
        try:
            metadata_file.unlink()
        except Exception:
            pass

        return True

    except Exception as e:
        log(f"Audio merge error: {e}", file=sys.stderr)
        logger.error(f"[merge-file] Audio merge error: {e}", exc_info=True)
        return False
def _merge_video(files: List[Path], output: Path, output_format: str) -> bool:
    """Merge video files into *output* using ffmpeg's concat demuxer.

    mp4/mkv outputs are re-encoded (libx265 video + AAC audio) so sources
    with mismatched codecs still concatenate cleanly; any other format is
    stream-copied without re-encoding.

    Returns True on success, False on any failure (ffmpeg missing,
    non-zero ffmpeg exit status, or unexpected error).
    """
    ffmpeg_path = _shutil.which("ffmpeg")
    if not ffmpeg_path:
        log("ffmpeg not found in PATH", file=sys.stderr)
        return False

    # Temp input list for the concat demuxer, created next to the output.
    concat_file = output.parent / f".concat_{output.stem}.txt"
    try:
        # Single quotes must be escaped for ffmpeg's concat-list parser.
        concat_lines = []
        for f in files:
            safe_path = str(f).replace("'", "'\\''")
            concat_lines.append(f"file '{safe_path}'")
        concat_file.write_text("\n".join(concat_lines), encoding="utf-8")

        # Build FFmpeg command for video merge.
        cmd = [ffmpeg_path, "-y", "-f", "concat", "-safe", "0", "-i", str(concat_file)]

        # Video codec selection per requested container.
        if output_format == "mp4":
            cmd.extend(
                [
                    "-c:v",
                    "libx265",
                    "-preset",
                    "fast",
                    "-tag:v",
                    "hvc1",  # hvc1 tag for broad player (QuickTime) compatibility
                    "-c:a",
                    "aac",
                    "-b:a",
                    "192k",
                ]
            )
        elif output_format == "mkv":
            cmd.extend(["-c:v", "libx265", "-preset", "fast", "-c:a", "aac", "-b:a", "192k"])
        else:
            cmd.extend(["-c", "copy"])  # Copy without re-encoding
        cmd.append(str(output))

        log(f"Merging {len(files)} video files...", file=sys.stderr)
        result = _subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            stdin=_subprocess.DEVNULL,  # never block waiting on console input
            check=False,
        )

        if result.returncode != 0:
            stderr = (result.stderr or "").strip()
            log(f"FFmpeg error: {stderr}", file=sys.stderr)
            return False

        return True

    except Exception as e:
        log(f"Video merge error: {e}", file=sys.stderr)
        return False
    finally:
        # Always remove the temp concat list, even when ffmpeg fails or raises.
        try:
            concat_file.unlink()
        except Exception:
            pass
def _merge_text(files: List[Path], output: Path) -> bool:
"""Merge text files."""
try:
2025-12-29 17:05:03 -08:00
with open(output, "w", encoding="utf-8") as outf:
2025-11-25 20:09:33 -08:00
for i, f in enumerate(files):
if i > 0:
2025-12-29 17:05:03 -08:00
outf.write("\n---\n") # Separator between files
2025-11-25 20:09:33 -08:00
try:
2025-12-29 17:05:03 -08:00
content = f.read_text(encoding="utf-8", errors="replace")
2025-11-25 20:09:33 -08:00
outf.write(content)
except Exception as e:
log(f"Warning reading {f.name}: {e}", file=sys.stderr)
2025-12-29 17:05:03 -08:00
2025-11-25 20:09:33 -08:00
return True
2025-12-29 17:05:03 -08:00
2025-11-25 20:09:33 -08:00
except Exception as e:
log(f"Text merge error: {e}", file=sys.stderr)
return False
def _merge_pdf(files: List[Path], output: Path) -> bool:
    """Combine the pages of every PDF in *files* into one *output* PDF.

    Requires pypdf; aborts with a message on stderr when it is missing or
    when any source PDF cannot be read. Returns True on success.
    """
    if not (HAS_PYPDF and PdfWriter is not None and PdfReader is not None):
        log("pypdf is required for PDF merging. Install with: pip install pypdf", file=sys.stderr)
        return False

    try:
        merged = PdfWriter()

        for src in files:
            try:
                doc = PdfReader(src)
                for page in doc.pages:
                    merged.add_page(page)
                log(f"Added {len(doc.pages)} pages from {src.name}", file=sys.stderr)
            except Exception as exc:
                # One unreadable source aborts the whole merge.
                log(f"Error reading PDF {src.name}: {exc}", file=sys.stderr)
                return False

        with output.open("wb") as handle:
            merged.write(handle)

        return True

    except Exception as exc:
        log(f"PDF merge error: {exc}", file=sys.stderr)
        return False
# Cmdlet declaration: name, usage string, accepted arguments, and the help
# detail shown to users. The implementation (_run) is bound below.
CMDLET = Cmdlet(
    name="merge-file",
    summary="Merge multiple files into a single output file. Supports audio, video, PDF, and text merging with optional cleanup.",
    usage="merge-file [-delete] [-path <path>] [-format <auto|mka|m4a|m4b|mp3|aac|opus|mp4|mkv|pdf|txt>]",
    arg=[
        CmdletArg(
            "-delete", type="flag", description="Delete source files after successful merge."
        ),
        SharedArgs.PATH,
        CmdletArg(
            "-format",
            description="Output format (auto/mka/m4a/m4b/mp3/aac/opus/mp4/mkv/pdf/txt). Default: auto-detect from first file.",
        ),
    ],
    detail=[
        "- Pipe multiple files: search-file query | [1,2,3] | merge-file",
        "- Audio files merge with minimal quality loss using specified codec.",
        "- Video files merge into MP4 or MKV containers.",
        "- PDF files merge into a single PDF document.",
        "- Text/document files are concatenated.",
        "- Output name derived from first file with ' (merged)' suffix.",
        "- -delete flag removes all source files after successful merge.",
    ],
)

# Bind the implementation and register the cmdlet with the global registry.
CMDLET.exec = _run
CMDLET.register()