"""Download media from url using yt-dlp (streaming sites only). Focused cmdlet for video/audio downloads from yt-dlp-supported sites: - YouTube, Twitch, Dailymotion, Vimeo, etc. - No direct file downloads (use download-file for that) - Playlist detection with item selection - Clip extraction (time ranges) - Format selection and audio/video modes - Tags extraction and metadata integration """ from __future__ import annotations import glob # noqa: F401 import hashlib import json # noqa: F401 import random import re import string import subprocess import sys import tempfile import time import traceback from pathlib import Path from typing import Any, Dict, Iterator, List, Optional, Sequence from urllib.parse import urlparse from SYS.logger import log, debug from SYS.utils import sha256_file from models import DownloadError, DownloadOptions, DownloadMediaResult, DebugLogger, ProgressBar import pipeline as pipeline_context from result_table import ResultTable from rich.prompt import Confirm from rich_display import stderr_console as get_stderr_console from tool.ytdlp import YtDlpTool from . import _shared as sh Cmdlet = sh.Cmdlet CmdletArg = sh.CmdletArg SharedArgs = sh.SharedArgs create_pipe_object_result = sh.create_pipe_object_result parse_cmdlet_args = sh.parse_cmdlet_args register_url_with_local_library = sh.register_url_with_local_library coerce_to_pipe_object = sh.coerce_to_pipe_object get_field = sh.get_field def _live_ui_and_pipe_index() -> tuple[Optional[Any], int]: ui = None try: ui = pipeline_context.get_live_progress() if hasattr(pipeline_context, "get_live_progress") else None except Exception: ui = None pipe_idx: int = 0 try: stage_ctx = pipeline_context.get_stage_context() if hasattr(pipeline_context, "get_stage_context") else None maybe_idx = getattr(stage_ctx, "pipe_index", None) if stage_ctx is not None else None if isinstance(maybe_idx, int): pipe_idx = int(maybe_idx) except Exception: pipe_idx = 0 return ui, pipe_idx def _begin_live_steps(total_steps: int) -> None: """Declare the total number of steps for the current pipe.""" ui, pipe_idx = _live_ui_and_pipe_index() if ui is None: return try: begin = getattr(ui, "begin_pipe_steps", None) if callable(begin): begin(int(pipe_idx), total_steps=int(total_steps)) except Exception: return def _step(text: str) -> None: """Emit a *new* step (increments i/N and advances percent automatically).""" ui, pipe_idx = _live_ui_and_pipe_index() if ui is None: return try: adv = getattr(ui, "advance_pipe_step", None) if callable(adv): adv(int(pipe_idx), str(text)) except Exception: return def _set_pipe_percent(percent: int) -> None: """Best-effort percent update without changing step text.""" ui, pipe_idx = _live_ui_and_pipe_index() if ui is None: return try: set_pct = getattr(ui, "set_pipe_percent", None) if callable(set_pct): set_pct(int(pipe_idx), int(percent)) except Exception: return # Minimal inlined helpers from helper/download.py (is_url_supported_by_ytdlp, list_formats) try: import yt_dlp # type: ignore from yt_dlp.extractor import gen_extractors # type: ignore except Exception as exc: yt_dlp = None # type: ignore YTDLP_IMPORT_ERROR = exc else: YTDLP_IMPORT_ERROR = None try: from metadata import extract_ytdlp_tags except ImportError: extract_ytdlp_tags = None _EXTRACTOR_CACHE: List[Any] | None = None # Reused progress formatter for yt-dlp callbacks (stderr only). _YTDLP_PROGRESS_BAR = ProgressBar() _SUBTITLE_EXTS = (".vtt", ".srt", ".ass", ".ssa", ".lrc") def _format_chapters_note(info: Dict[str, Any]) -> Optional[str]: """Format yt-dlp chapter metadata into a stable, note-friendly text. Output is one chapter per line, e.g.: 00:00 Intro 01:23-02:10 Topic name """ try: chapters = info.get("chapters") except Exception: chapters = None if not isinstance(chapters, list) or not chapters: return None rows: List[tuple[int, Optional[int], str]] = [] max_t = 0 for ch in chapters: if not isinstance(ch, dict): continue start_raw = ch.get("start_time") end_raw = ch.get("end_time") title_raw = ch.get("title") or ch.get("name") or ch.get("chapter") try: start_s = int(float(start_raw)) except Exception: continue end_s: Optional[int] = None try: if end_raw is not None: end_s = int(float(end_raw)) except Exception: end_s = None title = str(title_raw).strip() if title_raw is not None else "" rows.append((start_s, end_s, title)) try: max_t = max(max_t, start_s, end_s or 0) except Exception: max_t = max(max_t, start_s) if not rows: return None force_hours = bool(max_t >= 3600) def _tc(seconds: int) -> str: total = max(0, int(seconds)) minutes, secs = divmod(total, 60) hours, minutes = divmod(minutes, 60) if force_hours: return f"{hours:02d}:{minutes:02d}:{secs:02d}" return f"{minutes:02d}:{secs:02d}" lines: List[str] = [] for start_s, end_s, title in sorted(rows, key=lambda r: (r[0], r[1] if r[1] is not None else 10**9, r[2])): if end_s is not None and end_s > start_s: prefix = f"{_tc(start_s)}-{_tc(end_s)}" else: prefix = _tc(start_s) line = f"{prefix} {title}".strip() if line: lines.append(line) text = "\n".join(lines).strip() return text or None def _best_subtitle_sidecar(media_path: Path) -> Optional[Path]: """Find the most likely subtitle sidecar file for a downloaded media file.""" try: base_dir = media_path.parent stem = media_path.stem if not stem: return None candidates: List[Path] = [] for p in base_dir.glob(stem + ".*"): try: if not p.is_file(): continue except Exception: continue if p.suffix.lower() in _SUBTITLE_EXTS: candidates.append(p) if not candidates: return None def _rank(path: Path) -> tuple[int, int, float, str]: name = path.name.lower() lang_rank = 0 if ".en." in name or name.endswith(".en" + path.suffix.lower()) else 1 ext = path.suffix.lower() ext_rank_map = {".vtt": 0, ".srt": 1, ".ass": 2, ".ssa": 3, ".lrc": 4} ext_rank = ext_rank_map.get(ext, 9) try: mtime = float(path.stat().st_mtime) except Exception: mtime = 0.0 return (lang_rank, ext_rank, -mtime, name) candidates.sort(key=_rank) return candidates[0] except Exception: return None def _read_text_file(path: Path, *, max_bytes: int = 1_500_000) -> Optional[str]: try: data = path.read_bytes() except Exception: return None if not data: return None if len(data) > max_bytes: data = data[:max_bytes] try: return data.decode("utf-8", errors="replace") except Exception: try: return data.decode(errors="replace") except Exception: return None def _ensure_yt_dlp_ready() -> None: if yt_dlp is not None: return detail = str(YTDLP_IMPORT_ERROR or "yt-dlp is not installed") raise DownloadError(f"yt-dlp module not available: {detail}") def is_url_supported_by_ytdlp(url: str) -> bool: if yt_dlp is None: return False global _EXTRACTOR_CACHE if _EXTRACTOR_CACHE is None: try: _EXTRACTOR_CACHE = [ie for ie in gen_extractors()] # type: ignore[arg-type] except Exception: _EXTRACTOR_CACHE = [] for extractor in _EXTRACTOR_CACHE: try: if not extractor.suitable(url): continue except Exception: continue name = getattr(extractor, "IE_NAME", "") if name.lower() == "generic": continue return True return False def list_formats( url: str, no_playlist: bool = False, playlist_items: Optional[str] = None, cookiefile: Optional[str] = None, ) -> Optional[List[Dict[str, Any]]]: _ensure_yt_dlp_ready() try: assert yt_dlp is not None ydl_opts: Dict[str, Any] = {"quiet": True, "no_warnings": True, "socket_timeout": 30} if no_playlist: ydl_opts["noplaylist"] = True if playlist_items: ydl_opts["playlist_items"] = playlist_items if cookiefile: ydl_opts["cookiefile"] = cookiefile debug(f"Fetching format list for: {url}") with yt_dlp.YoutubeDL(ydl_opts) as ydl: # type: ignore[arg-type] info = ydl.extract_info(url, download=False) if not isinstance(info, dict): log("No formats available", file=sys.stderr) return None formats = info.get("formats") or [] # Some URLs (notably playlist contexts) yield a playlist-shaped payload with # `entries` rather than a direct video payload. If so, try to pull formats # from the first concrete entry. if (not formats) and isinstance(info.get("entries"), list): try: for entry in info.get("entries") or []: if not isinstance(entry, dict): continue entry_formats = entry.get("formats") if isinstance(entry_formats, list) and entry_formats: formats = entry_formats break except Exception: pass if not isinstance(formats, list) or not formats: log("No formats available", file=sys.stderr) return None result_formats: List[Dict[str, Any]] = [] for fmt in formats: if not isinstance(fmt, dict): continue result_formats.append( { "format_id": fmt.get("format_id", ""), "format": fmt.get("format", ""), "ext": fmt.get("ext", ""), "resolution": fmt.get("resolution", ""), "width": fmt.get("width"), "height": fmt.get("height"), "fps": fmt.get("fps"), "vcodec": fmt.get("vcodec", "none"), "acodec": fmt.get("acodec", "none"), "filesize": fmt.get("filesize"), "abr": fmt.get("abr"), "tbr": fmt.get("tbr"), } ) debug(f"Found {len(result_formats)} available formats") return result_formats or None except Exception as e: log(f"✗ Error fetching formats: {e}", file=sys.stderr) return None def _pick_best_audio_format_id(formats: List[Dict[str, Any]]) -> Optional[str]: audio_only: List[Dict[str, Any]] = [] for fmt in formats: if not isinstance(fmt, dict): continue format_id = str(fmt.get("format_id") or "").strip() if not format_id: continue vcodec = str(fmt.get("vcodec") or "none").lower() acodec = str(fmt.get("acodec") or "none").lower() if vcodec != "none": continue if not acodec or acodec == "none": continue audio_only.append(fmt) if not audio_only: return None def score(f: Dict[str, Any]) -> tuple[float, float]: tbr = f.get("tbr") abr = f.get("abr") bitrate = 0.0 for candidate in (tbr, abr): try: if candidate is not None: bitrate = max(bitrate, float(candidate)) except Exception: pass size = 0.0 try: fs = f.get("filesize") if fs is not None: size = float(fs) except Exception: pass return (bitrate, size) best = max(audio_only, key=score) best_id = str(best.get("format_id") or "").strip() return best_id or None def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sections: List[str], quiet: bool = False) -> tuple[Optional[str], Dict[str, Any]]: sections_list = ytdl_options.get("download_sections", []) if not sections_list: return "", {} session_id = hashlib.md5((url + str(time.time()) + ''.join(random.choices(string.ascii_letters, k=10))).encode()).hexdigest()[:12] first_section_info = None total_sections = len(sections_list) for section_idx, section in enumerate(sections_list, 1): # While step 1/2 is "downloading", keep the pipe bar moving for multi-section clips. # Map sections onto 50..99 so step 2/2 can still jump to 100. try: if total_sections > 0: pct = 50 + int(((section_idx - 1) / max(1, total_sections)) * 49) _set_pipe_percent(pct) except Exception: pass base_outtmpl = ytdl_options.get("outtmpl", "%(title)s.%(ext)s") output_dir_path = Path(base_outtmpl).parent filename_tmpl = f"{session_id}_{section_idx}" if base_outtmpl.endswith(".%(ext)s"): filename_tmpl += ".%(ext)s" section_outtmpl = str(output_dir_path / filename_tmpl) if section_idx == 1: metadata_cmd = ["yt-dlp", "--dump-json", "--skip-download"] if ytdl_options.get("cookiefile"): cookies_path = ytdl_options["cookiefile"].replace("\\", "/") metadata_cmd.extend(["--cookies", cookies_path]) if ytdl_options.get("noplaylist"): metadata_cmd.append("--no-playlist") metadata_cmd.append(url) try: meta_result = subprocess.run(metadata_cmd, capture_output=True, text=True) if meta_result.returncode == 0 and meta_result.stdout: try: info_dict = json.loads(meta_result.stdout.strip()) first_section_info = info_dict if not quiet: debug(f"Extracted title from metadata: {info_dict.get('title')}") except json.JSONDecodeError: if not quiet: debug("Could not parse JSON metadata") except Exception as e: if not quiet: debug(f"Error extracting metadata: {e}") cmd = ["yt-dlp"] if quiet: cmd.append("--quiet") cmd.append("--no-warnings") cmd.append("--no-progress") # Keep ffmpeg/merger output from taking over the terminal. cmd.extend(["--postprocessor-args", "ffmpeg:-hide_banner -loglevel error"]) if ytdl_options.get("ffmpeg_location"): try: cmd.extend(["--ffmpeg-location", str(ytdl_options["ffmpeg_location"])]) except Exception: pass if ytdl_options.get("format"): cmd.extend(["-f", ytdl_options["format"]]) if ytdl_options.get("merge_output_format"): cmd.extend(["--merge-output-format", str(ytdl_options["merge_output_format"])]) # For CLI downloads, infer chapter/metadata embedding from either legacy flags # or explicit FFmpegMetadata postprocessor entries. postprocessors = ytdl_options.get("postprocessors") want_add_metadata = bool(ytdl_options.get("addmetadata")) want_embed_chapters = bool(ytdl_options.get("embedchapters")) if isinstance(postprocessors, list): for pp in postprocessors: if not isinstance(pp, dict): continue if str(pp.get("key") or "") == "FFmpegMetadata": want_add_metadata = True if bool(pp.get("add_chapters", True)): want_embed_chapters = True if want_add_metadata: cmd.append("--add-metadata") if want_embed_chapters: cmd.append("--embed-chapters") if ytdl_options.get("writesubtitles"): cmd.append("--write-sub") cmd.append("--write-auto-sub") cmd.extend(["--sub-format", "vtt"]) if ytdl_options.get("force_keyframes_at_cuts"): cmd.append("--force-keyframes-at-cuts") cmd.extend(["-o", section_outtmpl]) if ytdl_options.get("cookiefile"): cookies_path = ytdl_options["cookiefile"].replace("\\", "/") cmd.extend(["--cookies", cookies_path]) if ytdl_options.get("noplaylist"): cmd.append("--no-playlist") # Apply clip/section selection cmd.extend(["--download-sections", section]) cmd.append(url) if not quiet: debug(f"Running yt-dlp for section: {section}") try: if quiet: subprocess.run(cmd, check=True, capture_output=True, text=True) else: subprocess.run(cmd, check=True) except subprocess.CalledProcessError as exc: stderr_text = (exc.stderr or "") tail = "\n".join(stderr_text.splitlines()[-12:]).strip() details = f"\n{tail}" if tail else "" raise DownloadError(f"yt-dlp failed for section {section} (exit {exc.returncode}){details}") from exc except Exception as exc: raise DownloadError(f"yt-dlp failed for section {section}: {exc}") from exc # Mark near-complete before returning so the runner can finalize cleanly. try: _set_pipe_percent(99) except Exception: pass return session_id, first_section_info or {} def _iter_download_entries(info: Dict[str, Any]) -> Iterator[Dict[str, Any]]: queue: List[Dict[str, Any]] = [info] seen: set[int] = set() while queue: current = queue.pop(0) obj_id = id(current) if obj_id in seen: continue seen.add(obj_id) entries = current.get("entries") if isinstance(entries, list): for entry in entries: queue.append(entry) if current.get("requested_downloads") or not entries: yield current def _candidate_paths(entry: Dict[str, Any], output_dir: Path) -> Iterator[Path]: requested = entry.get("requested_downloads") if isinstance(requested, list): for item in requested: if isinstance(item, dict): fp = item.get("filepath") or item.get("_filename") if fp: yield Path(fp) for key in ("filepath", "_filename", "filename"): value = entry.get(key) if value: yield Path(value) if entry.get("filename"): yield output_dir / entry["filename"] def _resolve_entry_and_path(info: Dict[str, Any], output_dir: Path) -> tuple[Dict[str, Any], Path]: for entry in _iter_download_entries(info): for candidate in _candidate_paths(entry, output_dir): if candidate.is_file(): return entry, candidate if not candidate.is_absolute(): maybe = output_dir / candidate if maybe.is_file(): return entry, maybe raise FileNotFoundError("yt-dlp did not report a downloaded media file") def _resolve_entries_and_paths(info: Dict[str, Any], output_dir: Path) -> List[tuple[Dict[str, Any], Path]]: resolved: List[tuple[Dict[str, Any], Path]] = [] seen: set[str] = set() for entry in _iter_download_entries(info): chosen: Optional[Path] = None for candidate in _candidate_paths(entry, output_dir): if candidate.is_file(): chosen = candidate break if not candidate.is_absolute(): maybe = output_dir / candidate if maybe.is_file(): chosen = maybe break if chosen is None: continue key = str(chosen.resolve()) if key in seen: continue seen.add(key) resolved.append((entry, chosen)) return resolved def _extract_sha256(info: Dict[str, Any]) -> Optional[str]: for payload in [info] + info.get("entries", []): if not isinstance(payload, dict): continue hashes = payload.get("hashes") if isinstance(hashes, dict): for key in ("sha256", "sha-256", "sha_256"): if key in hashes and isinstance(hashes[key], str) and hashes[key].strip(): return hashes[key].strip() for key in ("sha256", "sha-256", "sha_256"): value = payload.get(key) if isinstance(value, str) and value.strip(): return value.strip() return None def _progress_callback(status: Dict[str, Any]) -> None: """Simple progress callback using logger.""" event = status.get("status") if event == "downloading": # Always print progress to stderr so piped stdout remains clean. percent = status.get("_percent_str") downloaded = status.get("downloaded_bytes") total = status.get("total_bytes") or status.get("total_bytes_estimate") speed = status.get("_speed_str") eta = status.get("_eta_str") _YTDLP_PROGRESS_BAR.update( downloaded=int(downloaded) if downloaded is not None else None, total=int(total) if total is not None else None, label="download", file=sys.stderr, ) elif event == "finished": _YTDLP_PROGRESS_BAR.finish() elif event in ("postprocessing", "processing"): return def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15, *, cookiefile: Optional[str] = None) -> Optional[Dict[str, Any]]: """Probe URL to extract metadata WITHOUT downloading. Args: url: URL to probe no_playlist: If True, ignore playlists and probe only the single video timeout_seconds: Max seconds to wait for probe (default 15s) Returns: Dict with keys: extractor, title, entries (if playlist), duration, etc. Returns None if not supported by yt-dlp or on timeout. """ if not is_url_supported_by_ytdlp(url): return None # Wrap probe in timeout to prevent hanging on large playlists import threading from typing import cast result_container: List[Optional[Any]] = [None, None] # [result, error] def _do_probe() -> None: try: _ensure_yt_dlp_ready() assert yt_dlp is not None # Extract info without downloading # Use extract_flat='in_playlist' to get full metadata for playlist items ydl_opts = { "quiet": True, # Suppress all output "no_warnings": True, "socket_timeout": 10, "retries": 2, # Reduce retries for faster timeout "skip_download": True, # Don't actually download "extract_flat": "in_playlist", # Get playlist with metadata for each entry "noprogress": True, # No progress bars } if cookiefile: ydl_opts["cookiefile"] = str(cookiefile) # Add no_playlist option if specified if no_playlist: ydl_opts["noplaylist"] = True with yt_dlp.YoutubeDL(ydl_opts) as ydl: # type: ignore[arg-type] info = ydl.extract_info(url, download=False) if not isinstance(info, dict): result_container[0] = None return # Extract relevant fields webpage_url = info.get("webpage_url") or info.get("original_url") or info.get("url") result_container[0] = { "extractor": info.get("extractor", ""), "title": info.get("title", ""), "entries": info.get("entries", []), # Will be populated if playlist "duration": info.get("duration"), "uploader": info.get("uploader"), "description": info.get("description"), # Keep both the requested and canonical URL forms; callers should prefer webpage_url. "requested_url": url, "webpage_url": webpage_url, } except Exception as exc: log(f"Probe error for {url}: {exc}") result_container[1] = exc thread = threading.Thread(target=_do_probe, daemon=False) thread.start() thread.join(timeout=timeout_seconds) if thread.is_alive(): # Probe timed out - return None so the caller can raise an error debug(f"Probe timeout for {url} (>={timeout_seconds}s), proceeding with download") return None if result_container[1] is not None: # Probe error - return None to proceed anyway return None return cast(Optional[Dict[str, Any]], result_container[0]) def download_media( opts: DownloadOptions, *, debug_logger: Optional[DebugLogger] = None, ) -> Any: """Download streaming media exclusively via yt-dlp. Args: opts: DownloadOptions with url, mode, output_dir, etc. debug_logger: Optional debug logger for troubleshooting Returns: DownloadMediaResult with path, info, tags, hash Raises: DownloadError: If the URL is unsupported or yt-dlp detects no media """ # Handle GoFile shares before yt-dlp (they remain unsupported) try: netloc = urlparse(opts.url).netloc.lower() except Exception: netloc = "" if "gofile.io" in netloc: msg = "GoFile links are currently unsupported" if not opts.quiet: debug(msg) if debug_logger is not None: debug_logger.write_record("gofile-unsupported", {"url": opts.url}) raise DownloadError(msg) # Determine if yt-dlp should be used ytdlp_supported = is_url_supported_by_ytdlp(opts.url) if not ytdlp_supported: msg = "URL not supported by yt-dlp; try download-file for manual downloads" if not opts.quiet: log(msg) if debug_logger is not None: debug_logger.write_record("ytdlp-unsupported", {"url": opts.url}) raise DownloadError(msg) # Skip probe for playlists with item selection (probe can hang on large playlists) # Just proceed straight to download which will handle item selection if opts.playlist_items: debug(f"Skipping probe for playlist (item selection: {opts.playlist_items}), proceeding with download") probe_result = {"url": opts.url} # Minimal probe result else: probe_cookiefile = None try: if opts.cookies_path and opts.cookies_path.is_file(): probe_cookiefile = str(opts.cookies_path) except Exception: probe_cookiefile = None probe_result = probe_url(opts.url, no_playlist=opts.no_playlist, timeout_seconds=15, cookiefile=probe_cookiefile) if probe_result is None: msg = "yt-dlp could not detect media for this URL; use download-file for direct downloads" if not opts.quiet: log(msg) if debug_logger is not None: debug_logger.write_record("ytdlp-skip-no-media", {"url": opts.url}) raise DownloadError(msg) _ensure_yt_dlp_ready() ytdlp_tool = YtDlpTool() ytdl_options = ytdlp_tool.build_ytdlp_options(opts) hooks = ytdl_options.get("progress_hooks") if not isinstance(hooks, list): hooks = [] ytdl_options["progress_hooks"] = hooks if _progress_callback not in hooks: hooks.append(_progress_callback) if not opts.quiet: debug(f"Starting yt-dlp download: {opts.url}") if debug_logger is not None: debug_logger.write_record("ytdlp-start", {"url": opts.url}) assert yt_dlp is not None try: # Debug: show what options we're using if not opts.quiet: if ytdl_options.get("download_sections"): debug(f"[yt-dlp] download_sections: {ytdl_options['download_sections']}") debug(f"[yt-dlp] force_keyframes_at_cuts: {ytdl_options.get('force_keyframes_at_cuts', False)}") # Use subprocess when download_sections are present (Python API doesn't support them properly) session_id = None first_section_info = {} if ytdl_options.get("download_sections"): # For clip (download_sections), keep pipeline Live UI active and suppress # yt-dlp/ffmpeg CLI spam when running in quiet/pipeline mode. live_ui, _ = _live_ui_and_pipe_index() quiet_sections = bool(opts.quiet) or (live_ui is not None) session_id, first_section_info = _download_with_sections_via_cli( opts.url, ytdl_options, ytdl_options.get("download_sections", []), quiet=quiet_sections, ) info = None else: with yt_dlp.YoutubeDL(ytdl_options) as ydl: # type: ignore[arg-type] info = ydl.extract_info(opts.url, download=True) except Exception as exc: log(f"yt-dlp failed: {exc}", file=sys.stderr) if debug_logger is not None: debug_logger.write_record( "exception", { "phase": "yt-dlp", "error": str(exc), "traceback": traceback.format_exc(), }, ) raise DownloadError("yt-dlp download failed") from exc # If we used subprocess, we need to find the file manually if info is None: # Find files created/modified during this download (after we started) # Look for files matching the expected output template pattern try: import glob import time import re # Get the expected filename pattern from outtmpl # For sections: "C:\path\{session_id}.section_1_of_3.ext", etc. # For non-sections: "C:\path\title.ext" # Wait a moment to ensure files are fully written time.sleep(0.5) # List all files in output_dir, sorted by modification time files = sorted(opts.output_dir.iterdir(), key=lambda p: p.stat().st_mtime, reverse=True) if not files: raise FileNotFoundError(f"No files found in {opts.output_dir}") # If we downloaded sections, look for files with the session_id pattern if opts.clip_sections and session_id: # Pattern: "{session_id}_1.ext", "{session_id}_2.ext", etc. # Also includes sidecars like "{session_id}_1.en.vtt". section_pattern = re.compile(rf'^{re.escape(session_id)}_(\d+)') matching_files = [f for f in files if section_pattern.search(f.name)] if matching_files: # Sort by section number to ensure correct order def extract_section_num(path: Path) -> int: match = section_pattern.search(path.name) return int(match.group(1)) if match else 999 matching_files.sort(key=extract_section_num) debug(f"Found {len(matching_files)} section file(s) matching pattern") # Now rename section *media* files to use hash-based names. # Sidecars (subtitles) are renamed to match the media hash so they can be # attached as notes later (and not emitted as separate pipeline items). by_index: Dict[int, List[Path]] = {} for f in matching_files: m = section_pattern.search(f.name) if not m: continue try: n = int(m.group(1)) except Exception: continue by_index.setdefault(n, []).append(f) renamed_media_files: List[Path] = [] for sec_num in sorted(by_index.keys()): group = by_index.get(sec_num) or [] if not group: continue def _is_subtitle(p: Path) -> bool: try: return p.suffix.lower() in _SUBTITLE_EXTS except Exception: return False media_candidates = [p for p in group if not _is_subtitle(p)] subtitle_candidates = [p for p in group if _is_subtitle(p)] # Pick the primary media file for this section. # Prefer non-json, non-info sidecars. media_file: Optional[Path] = None for cand in media_candidates: try: if cand.suffix.lower() in {".json", ".info.json"}: continue except Exception: pass media_file = cand break if media_file is None and media_candidates: media_file = media_candidates[0] if media_file is None: # No media file found for this section; skip. continue try: media_hash = sha256_file(media_file) except Exception as e: debug(f"Failed to hash section media file {media_file.name}: {e}") renamed_media_files.append(media_file) continue # Preserve any suffix tail after the section index so language tags survive. # Example: _1.en.vtt -> .en.vtt prefix = f"{session_id}_{sec_num}" def _tail(name: str) -> str: try: if name.startswith(prefix): return name[len(prefix):] except Exception: pass # Fallback: keep just the last suffix. try: return Path(name).suffix except Exception: return "" # Rename media file to (tail typically like .mkv). try: new_media_name = f"{media_hash}{_tail(media_file.name)}" new_media_path = opts.output_dir / new_media_name if new_media_path.exists() and new_media_path != media_file: debug(f"File with hash {media_hash} already exists, using existing file.") try: media_file.unlink() except OSError: pass else: media_file.rename(new_media_path) debug(f"Renamed section file: {media_file.name} -> {new_media_name}") renamed_media_files.append(new_media_path) except Exception as e: debug(f"Failed to rename section media file {media_file.name}: {e}") renamed_media_files.append(media_file) new_media_path = media_file # Rename subtitle sidecars to match media hash for later note attachment. for sub_file in subtitle_candidates: try: new_sub_name = f"{media_hash}{_tail(sub_file.name)}" new_sub_path = opts.output_dir / new_sub_name if new_sub_path.exists() and new_sub_path != sub_file: try: sub_file.unlink() except OSError: pass else: sub_file.rename(new_sub_path) debug(f"Renamed section file: {sub_file.name} -> {new_sub_name}") except Exception as e: debug(f"Failed to rename section subtitle file {sub_file.name}: {e}") media_path = renamed_media_files[0] if renamed_media_files else matching_files[0] media_paths = renamed_media_files if renamed_media_files else None if not opts.quiet: count = len(media_paths) if isinstance(media_paths, list) else 1 debug(f"✓ Downloaded {count} section media file(s) (session: {session_id})") else: # Fallback to most recent file if pattern not found media_path = files[0] media_paths = None if not opts.quiet: debug(f"✓ Downloaded section file (pattern not found): {media_path.name}") else: # No sections, just take the most recent file media_path = files[0] media_paths = None if not opts.quiet: debug(f"✓ Downloaded: {media_path.name}") if debug_logger is not None: debug_logger.write_record("ytdlp-file-found", {"path": str(media_path)}) except Exception as exc: log(f"Error finding downloaded file: {exc}", file=sys.stderr) if debug_logger is not None: debug_logger.write_record( "exception", {"phase": "find-file", "error": str(exc)}, ) raise DownloadError(str(exc)) from exc # Create result with minimal data extracted from filename file_hash = sha256_file(media_path) # For section downloads, create tags with the title and build proper info dict tags = [] title = '' if first_section_info: title = first_section_info.get('title', '') if title: tags.append(f'title:{title}') debug(f"Added title tag for section download: {title}") # Build info dict - always use extracted title if available, not hash if first_section_info: info_dict = first_section_info else: info_dict = { "id": media_path.stem, "title": title or media_path.stem, "ext": media_path.suffix.lstrip(".") } return DownloadMediaResult( path=media_path, info=info_dict, tag=tags, source_url=opts.url, hash_value=file_hash, paths=media_paths, # Include all section files if present ) if not isinstance(info, dict): log(f"Unexpected yt-dlp response: {type(info)}", file=sys.stderr) raise DownloadError("Unexpected yt-dlp response type") info_dict: Dict[str, Any] = info if debug_logger is not None: debug_logger.write_record( "ytdlp-info", { "keys": sorted(info_dict.keys()), "is_playlist": bool(info_dict.get("entries")), }, ) # Playlist/album handling: resolve ALL downloaded entries and return multiple results. # The cmdlet will emit one PipeObject per downloaded file. if info_dict.get("entries") and not opts.no_playlist: resolved = _resolve_entries_and_paths(info_dict, opts.output_dir) if resolved: results: List[DownloadMediaResult] = [] for entry, media_path in resolved: hash_value = _extract_sha256(entry) or _extract_sha256(info_dict) if not hash_value: try: hash_value = sha256_file(media_path) except OSError: hash_value = None tags: List[str] = [] if extract_ytdlp_tags: try: tags = extract_ytdlp_tags(entry) except Exception as e: log(f"Error extracting tags: {e}", file=sys.stderr) source_url = ( entry.get("webpage_url") or entry.get("original_url") or entry.get("url") or opts.url ) results.append( DownloadMediaResult( path=media_path, info=entry, tag=tags, source_url=source_url, hash_value=hash_value, ) ) if not opts.quiet: debug(f"✓ Downloaded playlist items: {len(results)}") return results try: entry, media_path = _resolve_entry_and_path(info_dict, opts.output_dir) except FileNotFoundError as exc: log(f"Error: {exc}", file=sys.stderr) if debug_logger is not None: debug_logger.write_record( "exception", {"phase": "resolve-path", "error": str(exc)}, ) raise DownloadError(str(exc)) from exc if debug_logger is not None: debug_logger.write_record( "resolved-media", {"path": str(media_path), "entry_keys": sorted(entry.keys())}, ) # Extract hash from metadata or compute hash_value = _extract_sha256(entry) or _extract_sha256(info_dict) if not hash_value: try: hash_value = sha256_file(media_path) except OSError as exc: if debug_logger is not None: debug_logger.write_record( "hash-error", {"path": str(media_path), "error": str(exc)}, ) # Extract tags using metadata.py tags = [] if extract_ytdlp_tags: try: tags = extract_ytdlp_tags(entry) except Exception as e: log(f"Error extracting tags: {e}", file=sys.stderr) source_url = ( entry.get("webpage_url") or entry.get("original_url") or entry.get("url") ) if not opts.quiet: debug(f"✓ Downloaded: {media_path.name} ({len(tags)} tags)") if debug_logger is not None: debug_logger.write_record( "downloaded", { "path": str(media_path), "tag_count": len(tags), "source_url": source_url, "sha256": hash_value, }, ) return DownloadMediaResult( path=media_path, info=entry, tag=tags, source_url=source_url, hash_value=hash_value, ) # Timeout handler to prevent yt-dlp hangs def _download_with_timeout(opts: DownloadOptions, timeout_seconds: int = 300) -> Any: """Download with timeout protection. Args: opts: DownloadOptions timeout_seconds: Max seconds to wait (default 300s = 5 min) Returns: DownloadMediaResult or List[DownloadMediaResult] Raises: DownloadError: If timeout exceeded """ import threading from typing import cast result_container: List[Optional[Any]] = [None, None] # [result, error] def _do_download() -> None: try: result_container[0] = download_media(opts) except Exception as e: result_container[1] = e thread = threading.Thread(target=_do_download, daemon=False) thread.start() thread.join(timeout=timeout_seconds) if thread.is_alive(): # Thread still running - timeout raise DownloadError(f"Download timeout after {timeout_seconds} seconds for {opts.url}") if result_container[1] is not None: raise cast(Exception, result_container[1]) if result_container[0] is None: raise DownloadError(f"Download failed for {opts.url}") return cast(Any, result_container[0]) class Download_Media(Cmdlet): """Class-based download-media cmdlet - yt-dlp only, streaming sites.""" def __init__(self) -> None: """Initialize download-media cmdlet.""" super().__init__( name="download-media", summary="Download media from streaming sites (YouTube, Twitch, etc.)", usage="download-media [options] or search-file | download-media [options]", alias=[""], arg=[ SharedArgs.URL, SharedArgs.QUERY, CmdletArg(name="audio", type="flag", alias="a", description="Download audio only"), CmdletArg(name="format", type="string", alias="fmt", description="Explicit yt-dlp format selector"), CmdletArg( name="clip", type="string", description="Extract time range(s) or keyed spec (e.g., clip:3m4s-3m14s,item:2-3)", ), CmdletArg(name="item", type="string", description="Item selection for playlists/formats"), SharedArgs.PATH ], detail=["Download media from streaming sites using yt-dlp.", "For direct file downloads, use download-file."], exec=self.run, ) self.register() def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: """Main execution method.""" stage_ctx = pipeline_context.get_stage_context() in_pipeline = stage_ctx is not None and getattr(stage_ctx, "total_stages", 1) > 1 if in_pipeline and isinstance(config, dict): config["_quiet_background_output"] = True return self._run_impl(result, args, config) def _run_impl(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: """Main download implementation for yt-dlp-supported url.""" try: debug("Starting download-media") ytdlp_tool = YtDlpTool(config) # Parse arguments parsed = parse_cmdlet_args(args, self) # Extract options raw_url = parsed.get("url", []) if isinstance(raw_url, str): raw_url = [raw_url] # Allow a single quoted argument containing multiple URLs separated by commas. # Example: download-media "https://a,https://b" -audio expanded_urls: List[str] = [] for u in (raw_url or []): if u is None: continue s = str(u).strip() if not s: continue if "," in s: parts = [p.strip() for p in s.split(",")] expanded_urls.extend([p for p in parts if p]) else: expanded_urls.append(s) if expanded_urls: raw_url = expanded_urls # If no url provided via args, try to extract from piped result if not raw_url and result: # Handle single result or list of results results_to_check = result if isinstance(result, list) else [result] for item in results_to_check: # Try to get URL from various possible fields url = get_field(item, "url") or get_field(item, "target") if url: raw_url.append(url) # Filter to yt-dlp supported url only supported_url = [ url for url in raw_url if is_url_supported_by_ytdlp(url) ] if not supported_url: log("No yt-dlp-supported url to download", file=sys.stderr) return 1 # Log unsupported url if any unsupported = set(raw_url) - set(supported_url) if unsupported: debug(f"Skipping {len(unsupported)} unsupported url (use download-file for direct downloads)") # Get output directory final_output_dir = self._resolve_output_dir(parsed, config) if not final_output_dir: return 1 debug(f"Output directory: {final_output_dir}") # Get other options clip_spec = parsed.get("clip") query_spec = parsed.get("query") # download-media supports a small keyed spec language inside -query. # Examples: # -query "hash:" # -query "clip:1m-1m15s,2m1s-2m11s" # -query "hash:,clip:1m-1m15s,item:2-3" query_keyed: Dict[str, List[str]] = {} if query_spec: try: query_keyed = self._parse_keyed_csv_spec(str(query_spec), default_key="hash") except Exception: query_keyed = {} # Optional: allow an explicit hash via -query "hash:". # This is used as the preferred king hash for multi-clip relationships. query_hash_override: Optional[str] = None try: hash_values = query_keyed.get("hash", []) if isinstance(query_keyed, dict) else [] hash_candidate = (hash_values[-1] if hash_values else None) if hash_candidate: # Re-wrap for the shared parser which expects the `hash:` prefix. query_hash_override = sh.parse_single_hash_query(f"hash:{hash_candidate}") else: # Backwards-compatible: treat a non-keyed query as a hash query. query_hash_override = sh.parse_single_hash_query(str(query_spec)) if query_spec else None except Exception: query_hash_override = None # Always enable chapters + subtitles so downstream pipes (e.g. mpv) can consume them. embed_chapters = True write_sub = True mode = "audio" if parsed.get("audio") else "video" # Parse clip range(s) if specified clip_ranges: Optional[List[tuple[int, int]]] = None clip_values: List[str] = [] item_values: List[str] = [] if clip_spec: # Support keyed clip syntax: # -clip "clip:3m4s-3m14s,1h22m-1h33m,item:2-3" keyed = self._parse_keyed_csv_spec(str(clip_spec), default_key="clip") clip_values.extend(keyed.get("clip", []) or []) item_values.extend(keyed.get("item", []) or []) # Allow the same keyed spec language inside -query so users can do: # download-media -query "clip:1m-1m15s,2m1s-2m11s" if query_keyed: clip_values.extend(query_keyed.get("clip", []) or []) item_values.extend(query_keyed.get("item", []) or []) if item_values and not parsed.get("item"): parsed["item"] = ",".join([v for v in item_values if v]) if clip_values: clip_ranges = self._parse_time_ranges(",".join([v for v in clip_values if v])) if not clip_ranges: bad_spec = clip_spec or query_spec log(f"Invalid clip format: {bad_spec}", file=sys.stderr) return 1 if clip_ranges: try: debug(f"Clip ranges: {clip_ranges}") except Exception: pass quiet_mode = bool(config.get("_quiet_background_output")) if isinstance(config, dict) else False storage = None hydrus_available = True try: from Store import Store storage = Store(config=config or {}, suppress_debug=True) from API.HydrusNetwork import is_hydrus_available hydrus_available = bool(is_hydrus_available(config or {})) except Exception: storage = None def _preflight_url_duplicate(candidate_url: str, extra_urls: Optional[Sequence[str]] = None) -> bool: # NOTE: download-media sets _quiet_background_output=True when running in a pipeline to # reduce background noise. URL de-dup is interactive and must still run in pipelines. if storage is None: debug("Preflight URL check skipped: storage unavailable") return True debug(f"Preflight URL check: candidate={candidate_url}") try: from metadata import normalize_urls except Exception: normalize_urls = None # type: ignore[assignment] needles: List[str] = [] if normalize_urls is not None: for raw in [candidate_url, *(list(extra_urls) if extra_urls else [])]: try: needles.extend(normalize_urls(raw)) except Exception: continue # Fallback: always have at least one needle if not needles: needles = [str(candidate_url)] # Deduplicate needles (preserve order) seen_needles: List[str] = [] for needle in needles: if needle and needle not in seen_needles: seen_needles.append(needle) needles = seen_needles try: debug(f"Preflight URL needles: {needles}") except Exception: pass url_matches: List[Dict[str, Any]] = [] try: from Store.HydrusNetwork import HydrusNetwork # Avoid searching the temp/download directory backend during dedup. # We only want to warn about duplicates in real stores. backend_names_all = storage.list_searchable_backends() backend_names: List[str] = [] skipped: List[str] = [] for backend_name in backend_names_all: try: backend = storage[backend_name] except Exception: continue try: if str(backend_name).strip().lower() == "temp": skipped.append(backend_name) continue except Exception: pass # Heuristic: if a Folder backend points at the configured temp output dir, skip it. try: backend_location = getattr(backend, "_location", None) if backend_location and final_output_dir: backend_path = Path(str(backend_location)).expanduser().resolve() temp_path = Path(str(final_output_dir)).expanduser().resolve() if backend_path == temp_path: skipped.append(backend_name) continue except Exception: pass backend_names.append(backend_name) try: if skipped: debug(f"Preflight backends: {backend_names} (skipped temp: {skipped})") else: debug(f"Preflight backends: {backend_names}") except Exception: pass for backend_name in backend_names: backend = storage[backend_name] if isinstance(backend, HydrusNetwork) and not hydrus_available: continue backend_hits: List[Dict[str, Any]] = [] for needle in needles: try: backend_hits = backend.search(f"url:{needle}", limit=25) or [] if backend_hits: break except Exception: continue if backend_hits: url_matches.extend([dict(x) if isinstance(x, dict) else {"title": str(x)} for x in backend_hits]) if len(url_matches) >= 25: url_matches = url_matches[:25] break except Exception: url_matches = [] if not url_matches: debug("Preflight URL check: no matches") return True table = ResultTable(f"URL already exists ({len(url_matches)} match(es))") results_list: List[Dict[str, Any]] = [] for item in url_matches: if "title" not in item: item["title"] = item.get("name") or item.get("target") or item.get("path") or "Result" # Keep the full payload for history/inspection, but display a focused table. # Use shared extractors so Ext/Size/Store/Hash remain consistent everywhere. try: from result_table import build_display_row except Exception: build_display_row = None # type: ignore if callable(build_display_row): display_row = build_display_row(item, keys=["title", "store", "hash", "ext", "size"]) else: display_row = { "title": item.get("title"), "store": item.get("store"), "hash": item.get("hash") or item.get("file_hash") or item.get("sha256"), "ext": str(item.get("ext") or ""), "size": item.get("size") or item.get("size_bytes"), } table.add_result(display_row) results_list.append(item) pipeline_context.set_current_stage_table(table) pipeline_context.set_last_result_table(table, results_list) try: from contextlib import nullcontext except Exception: nullcontext = None # type: ignore suspend = getattr(pipeline_context, "suspend_live_progress", None) cm = suspend() if callable(suspend) else (nullcontext() if nullcontext else None) if cm is None: get_stderr_console().print(table) setattr(table, "_rendered_by_cmdlet", True) if not Confirm.ask("Continue anyway?", default=False, console=get_stderr_console()): return False else: with cm: get_stderr_console().print(table) setattr(table, "_rendered_by_cmdlet", True) if not Confirm.ask("Continue anyway?", default=False, console=get_stderr_console()): try: pipeline_context.request_pipeline_stop(reason="duplicate-url declined", exit_code=0) except Exception: pass return False return True def _preflight_url_duplicates_bulk(urls: Sequence[str]) -> bool: """Preflight URL de-dup for a batch of URLs. Purpose: - Avoid per-item interactive URL checks inside a playlist loop. - Let the user see ALL duplicates up front, before any downloads start. """ if storage is None: debug("Bulk URL preflight skipped: storage unavailable") return True unique_urls: List[str] = [] for u in urls or []: s = str(u or "").strip() if s and s not in unique_urls: unique_urls.append(s) if len(unique_urls) <= 1: return True try: from metadata import normalize_urls except Exception: normalize_urls = None # type: ignore[assignment] def _httpish(value: str) -> bool: try: return bool(value) and (value.startswith("http://") or value.startswith("https://")) except Exception: return False url_needles: Dict[str, List[str]] = {} for u in unique_urls: needles: List[str] = [] if normalize_urls is not None: try: needles.extend([n for n in (normalize_urls(u) or []) if isinstance(n, str)]) except Exception: needles = [] if not needles: needles = [u] # Prefer http(s) needles for store lookups. filtered: List[str] = [] for n in needles: n2 = str(n or "").strip() if not n2: continue if not _httpish(n2): continue if n2 not in filtered: filtered.append(n2) url_needles[u] = filtered if filtered else [u] # Determine backends once (same filtering as per-URL preflight). backend_names: List[str] = [] try: backend_names_all = storage.list_searchable_backends() except Exception: backend_names_all = [] for backend_name in backend_names_all: try: backend = storage[backend_name] except Exception: continue try: if str(backend_name).strip().lower() == "temp": continue except Exception: pass try: backend_location = getattr(backend, "_location", None) if backend_location and final_output_dir: backend_path = Path(str(backend_location)).expanduser().resolve() temp_path = Path(str(final_output_dir)).expanduser().resolve() if backend_path == temp_path: continue except Exception: pass backend_names.append(backend_name) if not backend_names: debug("Bulk URL preflight skipped: no searchable backends") return True # Collect matches as display rows (cap to keep output reasonable) seen_pairs: set[tuple[str, str]] = set() matched_urls: set[str] = set() match_rows: List[Dict[str, Any]] = [] max_rows = 200 try: from Store.HydrusNetwork import HydrusNetwork except Exception: HydrusNetwork = None # type: ignore for backend_name in backend_names: if len(match_rows) >= max_rows: break try: backend = storage[backend_name] except Exception: continue if HydrusNetwork is not None and isinstance(backend, HydrusNetwork): if not hydrus_available: continue client = getattr(backend, "_client", None) if client is None: continue for original_url, needles in url_needles.items(): if len(match_rows) >= max_rows: break if (original_url, str(backend_name)) in seen_pairs: continue # Fast-path: ask Hydrus whether it already knows this URL. found_hash: Optional[str] = None found = False for needle in (needles or [])[:3]: if not _httpish(needle): continue try: from API.HydrusNetwork import HydrusRequestSpec spec = HydrusRequestSpec( method="GET", endpoint="/add_urls/get_url_files", query={"url": needle}, ) response = client._perform_request(spec) # type: ignore[attr-defined] raw_hashes = None if isinstance(response, dict): raw_hashes = response.get("hashes") or response.get("file_hashes") raw_ids = response.get("file_ids") has_ids = isinstance(raw_ids, list) and len(raw_ids) > 0 has_hashes = isinstance(raw_hashes, list) and len(raw_hashes) > 0 if has_hashes: try: found_hash = str(raw_hashes[0]).strip() # type: ignore[index] except Exception: found_hash = None if has_ids or has_hashes: found = True break except Exception: continue if not found: continue seen_pairs.add((original_url, str(backend_name))) matched_urls.add(original_url) display_row = { "title": "(exists)", "store": str(backend_name), "hash": found_hash or "", "url": original_url, "columns": [ ("Title", "(exists)"), ("Store", str(backend_name)), ("Hash", found_hash or ""), ("URL", original_url), ], } match_rows.append(display_row) continue # Generic backends: use the existing search() contract. for original_url, needles in url_needles.items(): if len(match_rows) >= max_rows: break if (original_url, str(backend_name)) in seen_pairs: continue backend_hits: List[Dict[str, Any]] = [] for needle in (needles or [])[:3]: try: backend_hits = backend.search(f"url:{needle}", limit=1) or [] if backend_hits: break except Exception: continue if not backend_hits: continue seen_pairs.add((original_url, str(backend_name))) matched_urls.add(original_url) hit = backend_hits[0] title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)" file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or "" try: from result_table import build_display_row except Exception: build_display_row = None # type: ignore extracted = { "title": str(title), "store": str(hit.get("store") or backend_name), "hash": str(file_hash or ""), "ext": "", "size": None, } if callable(build_display_row): try: extracted = build_display_row(hit, keys=["title", "store", "hash", "ext", "size"]) except Exception: pass # Ensure we still prefer the precomputed values for title/store/hash. extracted["title"] = str(title) extracted["store"] = str(hit.get("store") or backend_name) extracted["hash"] = str(file_hash or "") ext = extracted.get("ext") size_val = extracted.get("size") display_row = { "title": str(title), "store": str(hit.get("store") or backend_name), "hash": str(file_hash or ""), "ext": str(ext or ""), "size": size_val, "url": original_url, "columns": [ ("Title", str(title)), ("Store", str(hit.get("store") or backend_name)), ("Hash", str(file_hash or "")), ("Ext", str(ext or "")), ("Size", size_val), ("URL", original_url), ], } match_rows.append(display_row) if not match_rows: debug("Bulk URL preflight: no matches") return True # This table is non-interactive and intentionally wide (we want URL + ext/size). table = ResultTable(f"URL already exists ({len(matched_urls)} url(s))", max_columns=10) table.set_no_choice(True) try: table.set_preserve_order(True) except Exception: pass for row in match_rows: table.add_result(row) # Display as an overlay so we don't clobber the current selectable table/history. try: pipeline_context.set_last_result_table_overlay(table, match_rows) except Exception: pass get_stderr_console().print(table) setattr(table, "_rendered_by_cmdlet", True) if not Confirm.ask("Continue anyway?", default=False, console=get_stderr_console()): return False return True def _canonicalize_url_for_storage(requested_url: str) -> str: # Prefer yt-dlp's canonical webpage URL (e.g. strips timestamps/redirects). # Fall back to the requested URL if probing fails. # Important: when playlist item selection is used, avoid probing (can hang on large playlists). if playlist_items: return str(requested_url) try: cf = None try: cookie_path = ytdlp_tool.resolve_cookiefile() if cookie_path is not None and cookie_path.is_file(): cf = str(cookie_path) except Exception: cf = None pr = probe_url(requested_url, no_playlist=False, timeout_seconds=15, cookiefile=cf) if isinstance(pr, dict): for key in ("webpage_url", "original_url", "url", "requested_url"): value = pr.get(key) if isinstance(value, str) and value.strip(): return value.strip() except Exception: pass return str(requested_url) # Check if we need to show format selection playlist_items = str(parsed.get("item")) if parsed.get("item") else None ytdl_format = parsed.get("format") playlist_selection_handled = False def _parse_at_selection(choice: str, *, max_index: int) -> Optional[List[int]]: """Parse @ selection syntax (@2, @2-5, @{1,3,5}, @2,5,7) into 1-based indices.""" raw = str(choice or "").strip() if not raw: return None if raw.lower() in {"q", "quit", "cancel"}: return None if raw == "@*" or raw == "*": return list(range(1, max_index + 1)) if raw.startswith("@"): raw = raw[1:].strip() if raw.startswith("{") and raw.endswith("}"): raw = raw[1:-1].strip() if not raw: return None indices: set[int] = set() for part in raw.split(","): part = part.strip() if not part: continue if "-" in part: left, right = [p.strip() for p in part.split("-", 1)] if not left or not right: return None try: start = int(left) end = int(right) except ValueError: return None if start < 1 or end < 1: return None if end < start: start, end = end, start for i in range(start, end + 1): if 1 <= i <= max_index: indices.add(i) else: try: i = int(part) except ValueError: return None if 1 <= i <= max_index: indices.add(i) if not indices: return None return sorted(indices) def _maybe_show_playlist_table(url: str) -> bool: """If URL appears to be a playlist/channel/collection, show a normal selectable table. This intentionally avoids a special input() prompt so the user can use the regular REPL prompt with autocomplete and standard @ selection: download-media -url "" (shows table) @* | download-media [options] | add-file ... Returns True if a playlist table was shown. """ try: cf = None try: cookie_path = ytdlp_tool.resolve_cookiefile() if cookie_path is not None and cookie_path.is_file(): cf = str(cookie_path) except Exception: cf = None pr = probe_url(url, no_playlist=False, timeout_seconds=15, cookiefile=cf) except Exception: pr = None if not isinstance(pr, dict): return False entries = pr.get("entries") if not isinstance(entries, list) or len(entries) <= 1: return False # Display table (limit rows to keep output reasonable) max_rows = 200 display_entries = entries[:max_rows] total = len(entries) def _entry_to_url(entry: Any) -> Optional[str]: if not isinstance(entry, dict): return None # Prefer explicit absolute URLs when present for key in ("webpage_url", "original_url", "url"): v = entry.get(key) if isinstance(v, str) and v.strip(): s = v.strip() try: if urlparse(s).scheme in {"http", "https"}: return s except Exception: return s # Best-effort YouTube fallback from id entry_id = entry.get("id") if isinstance(entry_id, str) and entry_id.strip(): extractor_name = str(pr.get("extractor") or pr.get("extractor_key") or "").lower() if "youtube" in extractor_name: return f"https://www.youtube.com/watch?v={entry_id.strip()}" return None table = ResultTable() safe_url = str(url or "").strip() table.title = f'download-media -url "{safe_url}"' if safe_url else "download-media" # Selection tables should expand '@N' into a runnable command. # For playlist-item rows we prefer the concrete per-item URL so the # expanded command targets a single video (not the whole playlist). table.set_source_command("download-media", []) try: table.set_preserve_order(True) except Exception: pass results_list: List[Dict[str, Any]] = [] for idx, entry in enumerate(display_entries, 1): title = None uploader = None duration = None entry_url = _entry_to_url(entry) try: if isinstance(entry, dict): title = entry.get("title") uploader = entry.get("uploader") or pr.get("uploader") duration = entry.get("duration") except Exception: pass row: Dict[str, Any] = { "table": "download-media", "title": str(title or f"Item {idx}"), "detail": str(uploader or ""), "media_kind": "playlist-item", "playlist_index": idx, # Enable '@N' expansion into a concrete command. # Prefer selecting the resolved per-item URL when available. "_selection_args": (["-url", str(entry_url)] if entry_url else ["-url", str(url), "-item", str(idx)]), # Critical for normal @ selection piping: downstream cmdlets # (including download-media itself) look for url/target. "url": entry_url, "target": entry_url, "columns": [ ("#", str(idx)), ("Title", str(title or "")), ("Duration", str(duration or "")), ("Uploader", str(uploader or "")), ], } results_list.append(row) table.add_result(row) pipeline_context.set_current_stage_table(table) pipeline_context.set_last_result_table(table, results_list) get_stderr_console().print(table) setattr(table, "_rendered_by_cmdlet", True) return True # Playlist/multi-entry detection: if the URL has multiple items and the user didn't # specify -item or -format, show a normal selectable table and return. if len(supported_url) == 1 and not playlist_items and not ytdl_format: candidate_url = supported_url[0] if _maybe_show_playlist_table(candidate_url): playlist_selection_handled = True # Let the user pick items using the normal REPL prompt: # @* | download-media ... return 0 # Bulk preflight for playlist selections (per-entry URLs): check all URLs once before downloading. skip_per_url_preflight = False if len(supported_url) > 1: if not _preflight_url_duplicates_bulk(list(supported_url)): return 0 skip_per_url_preflight = True # Playlist-level format preflight: if the batch has only one available format, # discover it once and force it for every item. This avoids per-item failures # and per-item --list-formats calls (e.g. Bandcamp albums). formats_cache: Dict[str, Optional[List[Dict[str, Any]]]] = {} def _cookiefile_str() -> Optional[str]: try: cookie_path = ytdlp_tool.resolve_cookiefile() if cookie_path is not None and cookie_path.is_file(): return str(cookie_path) except Exception: pass return None def _list_formats_cached(u: str, *, playlist_items_value: Optional[str]) -> Optional[List[Dict[str, Any]]]: key = f"{u}||{playlist_items_value or ''}" if key in formats_cache: return formats_cache[key] fmts = list_formats( u, no_playlist=False, playlist_items=playlist_items_value, cookiefile=_cookiefile_str(), ) formats_cache[key] = fmts return fmts forced_single_format_id: Optional[str] = None forced_single_format_for_batch = False if len(supported_url) > 1 and not playlist_items and not ytdl_format: try: sample_url = str(supported_url[0]) fmts = _list_formats_cached(sample_url, playlist_items_value=None) if isinstance(fmts, list) and len(fmts) == 1 and isinstance(fmts[0], dict): only_id = str(fmts[0].get("format_id") or "").strip() if only_id: forced_single_format_id = only_id forced_single_format_for_batch = True debug( f"Playlist format preflight: only one format available; using {forced_single_format_id} for all items" ) except Exception: forced_single_format_id = None forced_single_format_for_batch = False # If no -item, no explicit -format specified, and single URL, show the format table. # Do NOT stop to show formats when -audio is used (auto-pick) or when -clip is used. if ( mode != "audio" and not clip_spec and not playlist_items and not ytdl_format and len(supported_url) == 1 and not playlist_selection_handled ): url = supported_url[0] canonical_url = _canonicalize_url_for_storage(url) if not _preflight_url_duplicate(canonical_url, extra_urls=[url]): log(f"Skipping download: {url}", file=sys.stderr) return 0 formats = _list_formats_cached(url, playlist_items_value=None) if formats and len(formats) > 1: # Formatlist filtering # # Goal: # - Keep the list useful (hide non-media entries like storyboards) # - But NEVER filter down so far that the user can't browse/pick formats. # # The old filtering was too aggressive (e.g. width>=640, one per resolution), # which often hid most YouTube formats. def _is_browseable_format(fmt: Any) -> bool: if not isinstance(fmt, dict): return False format_id = str(fmt.get("format_id") or "").strip() if not format_id: return False ext = str(fmt.get("ext") or "").strip().lower() if ext in {"mhtml", "json"}: return False note = str(fmt.get("format_note") or "").lower() if "storyboard" in note: return False if format_id.lower().startswith("sb"): return False vcodec = str(fmt.get("vcodec", "none")) acodec = str(fmt.get("acodec", "none")) # Keep anything with at least one stream. return not (vcodec == "none" and acodec == "none") candidate_formats = [f for f in formats if _is_browseable_format(f)] filtered_formats = candidate_formats if candidate_formats else list(formats) debug(f"Formatlist: showing {len(filtered_formats)} formats (raw={len(formats)})") # Build the base command that will be replayed with @N selection # Include any additional args from the original command base_cmd = f'download-media "{url}"' # Preserve any additional pipeline stages if this is in a pipeline remaining_args = [arg for arg in args if arg not in [url] and not arg.startswith('-')] if remaining_args: base_cmd += ' ' + ' '.join(remaining_args) # Create result table for display # NOTE: ResultTable defaults to max_columns=5; for formatlist we want more columns # (including Size) so the user can compare formats. table = ResultTable(title=f"Available formats for {url}", max_columns=10, preserve_order=True) table.set_table("ytdlp.formatlist") table.set_source_command("download-media", [url]) # Collect results for table results_list = [] # Emit format results for selection for idx, fmt in enumerate(filtered_formats, 1): resolution = fmt.get("resolution", "") ext = fmt.get("ext", "") vcodec = fmt.get("vcodec", "none") acodec = fmt.get("acodec", "none") filesize = fmt.get("filesize") filesize_approx = fmt.get("filesize_approx") format_id = fmt.get("format_id", "") # If the chosen format is video-only (no audio stream), automatically # request best audio too so the resulting file has sound. selection_format_id = format_id try: if vcodec != "none" and acodec == "none" and format_id: selection_format_id = f"{format_id}+ba" except Exception: selection_format_id = format_id # Format size (prefer exact filesize; fall back to filesize_approx) size_str = "" size_prefix = "" size_bytes = filesize if not size_bytes: size_bytes = filesize_approx if size_bytes: size_prefix = "~" try: if isinstance(size_bytes, (int, float)) and size_bytes > 0: size_mb = float(size_bytes) / (1024 * 1024) size_str = f"{size_prefix}{size_mb:.1f}MB" except Exception: size_str = "" # Build format description desc_parts = [] if resolution and resolution != "audio only": desc_parts.append(resolution) if ext: desc_parts.append(ext.upper()) if vcodec != "none": desc_parts.append(f"v:{vcodec}") if acodec != "none": desc_parts.append(f"a:{acodec}") if size_str: desc_parts.append(size_str) format_desc = " | ".join(desc_parts) # Build format dict for emission and table format_dict = { "table": "download-media", "title": f"Format {format_id}", "url": url, "target": url, "detail": format_desc, "annotations": [ext, resolution] if resolution else [ext], "media_kind": "format", "cmd": base_cmd, # Put Size early so it's visible even with smaller column caps. "columns": [ ("ID", format_id), ("Resolution", resolution or "N/A"), ("Ext", ext), ("Size", size_str or ""), ("Video", vcodec), ("Audio", acodec), ], "full_metadata": { "format_id": format_id, "url": url, "item_selector": selection_format_id, }, "_selection_args": None, } # Preserve clip settings across @N selection. # Some runners only append row selection args; make sure clip intent # survives even when it was provided via -query "clip:...". selection_args: List[str] = ["-format", selection_format_id] try: if (not clip_spec) and clip_values: selection_args.extend(["-clip", ",".join([v for v in clip_values if v])]) except Exception: pass format_dict["_selection_args"] = selection_args # Add to results list and table (don't emit - formats should wait for @N selection) results_list.append(format_dict) table.add_result(format_dict) # Render and display the table # Some runners (e.g. cmdnat) do not automatically render stage tables. # Since this branch is explicitly interactive (user must pick @N), always # print the table here and mark it as already rendered to avoid duplicates # in runners that also print tables (e.g. CLI.py). try: get_stderr_console().print(table) setattr(table, "_rendered_by_cmdlet", True) except Exception: pass # Set the result table so it displays and is available for @N selection pipeline_context.set_current_stage_table(table) pipeline_context.set_last_result_table(table, results_list) log(f"", file=sys.stderr) return 0 # Download each URL downloaded_count = 0 downloaded_pipe_objects: List[Dict[str, Any]] = [] pipe_seq = 0 clip_sections_spec = self._build_clip_sections_spec(clip_ranges) if clip_sections_spec: try: debug(f"Clip sections spec: {clip_sections_spec}") except Exception: pass for url in supported_url: try: debug(f"Processing: {url}") canonical_url = _canonicalize_url_for_storage(url) # Preflight: warn if URL already exists in storage backends. # For playlist selections we already did a bulk preflight; skip per-item checks. if not skip_per_url_preflight: if not _preflight_url_duplicate(canonical_url, extra_urls=[url]): log(f"Skipping download: {url}", file=sys.stderr) continue # Step progress is per-URL download. # Keep steps meaningful: long-running download + finalize. # (Fast internal bookkeeping should not be steps.) _begin_live_steps(2) # If playlist_items is specified but looks like a format ID (e.g. from table selection), # treat it as a format selector instead of playlist items. # This handles the case where @N selection passes -item actual_format = ytdl_format actual_playlist_items = playlist_items if playlist_items and not ytdl_format: # Heuristic: if it contains non-numeric chars (excluding ranges/commas) # it is likely a format ID (e.g. '140-drc', 'best', '137+140') import re if re.search(r"[^0-9,-]", playlist_items): actual_format = playlist_items actual_playlist_items = None # For -audio, default to yt-dlp's built-in bestaudio selector. # This should *not* require interactive format picking. if mode == "audio" and not actual_format: actual_format = "bestaudio" # If no explicit format is provided for video mode, allow a config override. if mode == "video" and not actual_format: configured = (ytdlp_tool.default_format("video") or "").strip() if configured and configured != "bestvideo+bestaudio/best": actual_format = configured # If the batch has exactly one available format, force it. # This prevents the "Requested format is not available" error loop entirely. forced_single_applied = False if forced_single_format_for_batch and forced_single_format_id and not ytdl_format and not actual_playlist_items: actual_format = forced_single_format_id forced_single_applied = True # If a single format id was chosen and it is video-only, auto-merge best audio. if ( actual_format and isinstance(actual_format, str) and mode != "audio" and "+" not in actual_format and "/" not in actual_format and "[" not in actual_format and actual_format not in {"best", "bv", "ba", "b"} and not forced_single_applied ): try: formats = _list_formats_cached(url, playlist_items_value=actual_playlist_items) if formats: fmt_match = next( (f for f in formats if str(f.get("format_id", "")) == actual_format), None, ) if fmt_match: vcodec = str(fmt_match.get("vcodec", "none")) acodec = str(fmt_match.get("acodec", "none")) if vcodec != "none" and acodec == "none": debug( f"Selected video-only format {actual_format}; using {actual_format}+ba for audio" ) actual_format = f"{actual_format}+ba" except Exception: pass attempted_single_format_fallback = False while True: try: opts = DownloadOptions( url=url, mode=mode, output_dir=final_output_dir, ytdl_format=actual_format, cookies_path=ytdlp_tool.resolve_cookiefile(), clip_sections=clip_sections_spec, playlist_items=actual_playlist_items, quiet=quiet_mode, no_playlist=False, embed_chapters=embed_chapters, write_sub=write_sub, ) _step("downloading") # Use timeout wrapper to prevent hanging debug(f"Starting download with 5-minute timeout...") result_obj = _download_with_timeout(opts, timeout_seconds=300) debug(f"Download completed, building pipe object...") break except DownloadError as e: # If yt-dlp failed due to an unavailable default format and there is only # one available format, auto-retry with that format instead of prompting. cause = getattr(e, "__cause__", None) detail = "" try: detail = str(cause or "") except Exception: detail = "" if ("requested format is not available" in (detail or "").lower()) and mode != "audio": # If we already discovered there's only one format for the batch, # retry directly with it instead of re-listing formats per item. if ( forced_single_format_for_batch and forced_single_format_id and not ytdl_format and not actual_playlist_items and not attempted_single_format_fallback ): attempted_single_format_fallback = True actual_format = forced_single_format_id debug(f"Only one format available (playlist preflight); retrying with: {actual_format}") continue formats = _list_formats_cached(url, playlist_items_value=actual_playlist_items) if ( (not attempted_single_format_fallback) and isinstance(formats, list) and len(formats) == 1 and isinstance(formats[0], dict) ): only = formats[0] fallback_format = str(only.get("format_id") or "").strip() selection_format_id = fallback_format try: vcodec = str(only.get("vcodec", "none")) acodec = str(only.get("acodec", "none")) if vcodec != "none" and acodec == "none" and fallback_format: selection_format_id = f"{fallback_format}+ba" except Exception: selection_format_id = fallback_format if selection_format_id: attempted_single_format_fallback = True actual_format = selection_format_id debug(f"Only one format available; retrying with: {actual_format}") continue # Fall back to interactive selection when there are multiple formats. if formats: formats_to_show = formats table = ResultTable(title=f"Available formats for {url}", max_columns=10, preserve_order=True) table.set_table("ytdlp.formatlist") table.set_source_command("download-media", [url]) results_list: List[Dict[str, Any]] = [] for idx, fmt in enumerate(formats_to_show, 1): resolution = fmt.get("resolution", "") ext = fmt.get("ext", "") vcodec = fmt.get("vcodec", "none") acodec = fmt.get("acodec", "none") filesize = fmt.get("filesize") filesize_approx = fmt.get("filesize_approx") format_id = fmt.get("format_id", "") selection_format_id = format_id try: if vcodec != "none" and acodec == "none" and format_id: selection_format_id = f"{format_id}+ba" except Exception: selection_format_id = format_id size_str = "" size_prefix = "" size_bytes = filesize if not size_bytes: size_bytes = filesize_approx if size_bytes: size_prefix = "~" try: if isinstance(size_bytes, (int, float)) and size_bytes > 0: size_mb = float(size_bytes) / (1024 * 1024) size_str = f"{size_prefix}{size_mb:.1f}MB" except Exception: size_str = "" desc_parts: List[str] = [] if resolution and resolution != "audio only": desc_parts.append(str(resolution)) if ext: desc_parts.append(str(ext).upper()) if vcodec != "none": desc_parts.append(f"v:{vcodec}") if acodec != "none": desc_parts.append(f"a:{acodec}") if size_str: desc_parts.append(size_str) format_desc = " | ".join(desc_parts) format_dict: Dict[str, Any] = { "table": "download-media", "title": f"Format {format_id}", "url": url, "target": url, "detail": format_desc, "media_kind": "format", "columns": [ ("ID", format_id), ("Resolution", resolution or "N/A"), ("Ext", ext), ("Size", size_str or ""), ("Video", vcodec), ("Audio", acodec), ], "full_metadata": { "format_id": format_id, "url": url, "item_selector": selection_format_id, }, "_selection_args": ["-format", selection_format_id], } results_list.append(format_dict) table.add_result(format_dict) pipeline_context.set_current_stage_table(table) pipeline_context.set_last_result_table(table, results_list) # Always print for interactive selection. try: get_stderr_console().print(table) setattr(table, "_rendered_by_cmdlet", True) except Exception: pass # Complete the step sequence: we return here and the user must # re-run with @N selection. _step("awaiting selection") log("Requested format is not available; select a working format with @N", file=sys.stderr) return 0 # Non-format DownloadError: surface and skip this URL. raise # Expand result set: # - playlists return a list # - section clips return a single DownloadMediaResult with `paths` populated results_to_emit: List[Any] = [] if isinstance(result_obj, list): results_to_emit = list(result_obj) else: paths = getattr(result_obj, "paths", None) if isinstance(paths, list) and paths: # Create one DownloadMediaResult per section file for p in paths: try: p_path = Path(p) except Exception: continue # Sidecars (subtitles) should never be piped as standalone items. # They are handled separately and attached to notes. try: if p_path.suffix.lower() in _SUBTITLE_EXTS: continue except Exception: pass if not p_path.exists() or p_path.is_dir(): continue try: hv = sha256_file(p_path) except Exception: hv = None results_to_emit.append( DownloadMediaResult( path=p_path, info=getattr(result_obj, "info", {}) or {}, tag=list(getattr(result_obj, "tag", []) or []), source_url=getattr(result_obj, "source_url", None) or opts.url, hash_value=hv, ) ) else: results_to_emit = [result_obj] # Build PipeObjects first so we can attach cross-clip relationships. pipe_objects: List[Dict[str, Any]] = [] for downloaded in results_to_emit: po = self._build_pipe_object(downloaded, url, opts) pipe_seq += 1 try: po.setdefault("pipe_index", pipe_seq) except Exception: pass # Attach chapter timestamps for downstream consumers (e.g., mpv scripts) # even if container embedding fails. try: info = downloaded.info if isinstance(getattr(downloaded, "info", None), dict) else {} except Exception: info = {} chapters_text = _format_chapters_note(info) if embed_chapters else None if chapters_text: notes = po.get("notes") if not isinstance(notes, dict): notes = {} notes.setdefault("chapters", chapters_text) po["notes"] = notes if write_sub: try: media_path = Path(str(po.get("path") or "")) except Exception: media_path = None if media_path is not None and media_path.exists() and media_path.is_file(): sub_path = _best_subtitle_sidecar(media_path) if sub_path is not None: sub_text = _read_text_file(sub_path) if sub_text: notes = po.get("notes") if not isinstance(notes, dict): notes = {} notes["sub"] = sub_text po["notes"] = notes # We keep subtitles as notes; do not leave a sidecar that later stages # might try to ingest as a file. try: sub_path.unlink() except Exception: pass pipe_objects.append(po) # If this is a clip download, decorate titles/tags so the title: tag is clip-based. # Relationship tags are only added when multiple clips exist. try: if clip_ranges and len(pipe_objects) == len(clip_ranges): source_hash = query_hash_override or self._find_existing_hash_for_url(storage, canonical_url, hydrus_available=hydrus_available) self._apply_clip_decorations(pipe_objects, clip_ranges, source_king_hash=source_hash) except Exception: pass debug(f"Emitting {len(pipe_objects)} result(s) to pipeline...") # Mark complete *before* the first emit, because the pipeline clears the # status line on emit(). _step("finalized") stage_ctx = pipeline_context.get_stage_context() emit_enabled = bool(stage_ctx is not None and not getattr(stage_ctx, "is_last_stage", False)) for pipe_obj_dict in pipe_objects: # Only emit when there is a downstream stage. # This keeps `download-media` from producing a result table when run standalone. if emit_enabled: pipeline_context.emit(pipe_obj_dict) # Automatically register url with local library if pipe_obj_dict.get("url"): pipe_obj = coerce_to_pipe_object(pipe_obj_dict) register_url_with_local_library(pipe_obj, config) # Keep a copy so we can optionally run a tail pipeline entered at the playlist prompt. try: downloaded_pipe_objects.append(pipe_obj_dict) except Exception: pass downloaded_count += len(pipe_objects) debug("✓ Downloaded and emitted") except DownloadError as e: log(f"Download failed for {url}: {e}", file=sys.stderr) except Exception as e: log(f"Error processing {url}: {e}", file=sys.stderr) if downloaded_count > 0: debug(f"✓ Successfully processed {downloaded_count} URL(s)") return 0 log("No downloads completed", file=sys.stderr) return 1 except Exception as e: log(f"Error in download-media: {e}", file=sys.stderr) return 1 def _resolve_output_dir(self, parsed: Dict[str, Any], config: Dict[str, Any]) -> Optional[Path]: """Resolve the output directory. Rules: - If user passes `-path`, use that directory (override). - Otherwise default to a temp directory (config["temp"] if present, else OS temp). """ # Priority 1: explicit output directory override path_override = parsed.get("path") if path_override: try: candidate = Path(str(path_override)).expanduser() # If user passed a file path, treat its parent as output dir. if candidate.suffix: candidate = candidate.parent candidate.mkdir(parents=True, exist_ok=True) debug(f"Using output directory override: {candidate}") return candidate except Exception as e: log(f"Invalid -path output directory: {e}", file=sys.stderr) return None # Priority 2: config-provided temp/output directory try: temp_value = (config or {}).get("temp") if isinstance(config, dict) else None except Exception: temp_value = None if temp_value: try: candidate = Path(str(temp_value)).expanduser() candidate.mkdir(parents=True, exist_ok=True) debug(f"Using config temp directory: {candidate}") return candidate except Exception as e: log(f"Cannot use configured temp directory '{temp_value}': {e}", file=sys.stderr) return None # Priority 3: OS temp fallback try: candidate = Path(tempfile.gettempdir()) / "Medios-Macina" candidate.mkdir(parents=True, exist_ok=True) debug(f"Using OS temp directory: {candidate}") return candidate except Exception as e: log(f"Cannot create OS temp directory: {e}", file=sys.stderr) return None def _parse_time_ranges(self, spec: str) -> List[tuple[int, int]]: """Parse clip specs into a list of (start_seconds, end_seconds). Supported inputs: - "MM:SS-MM:SS" - "HH:MM:SS-HH:MM:SS" - seconds: "280-300" - multiple ranges separated by commas: "4:40-5:00,5:15-5:25" """ def _to_seconds(ts: str) -> Optional[int]: ts = str(ts).strip() if not ts: return None # Support compact units like 3m4s, 1h22m, 1h2m3s # (case-insensitive; seconds may be fractional but are truncated to int) try: unit_match = re.fullmatch( r"(?i)\s*(?:(?P\d+)h)?\s*(?:(?P\d+)m)?\s*(?:(?P\d+(?:\.\d+)?)s)?\s*", ts, ) except Exception: unit_match = None if unit_match and unit_match.group(0).strip() and any(unit_match.group(g) for g in ("h", "m", "s")): try: hours = int(unit_match.group("h") or 0) minutes = int(unit_match.group("m") or 0) seconds = float(unit_match.group("s") or 0) total = (hours * 3600) + (minutes * 60) + seconds return int(total) except Exception: return None if ":" in ts: parts = [p.strip() for p in ts.split(":")] if len(parts) == 2: hh_s = "0" mm_s, ss_s = parts elif len(parts) == 3: hh_s, mm_s, ss_s = parts else: return None try: hours = int(hh_s) minutes = int(mm_s) seconds = float(ss_s) total = (hours * 3600) + (minutes * 60) + seconds return int(total) except Exception: return None try: return int(float(ts)) except Exception: return None ranges: List[tuple[int, int]] = [] if not spec: return ranges for piece in str(spec).split(","): piece = piece.strip() if not piece: continue if "-" not in piece: return [] start_s, end_s = [p.strip() for p in piece.split("-", 1)] start = _to_seconds(start_s) end = _to_seconds(end_s) if start is None or end is None or start >= end: return [] ranges.append((start, end)) return ranges @staticmethod def _parse_keyed_csv_spec(spec: str, *, default_key: str) -> Dict[str, List[str]]: """Parse comma-separated values with optional sticky `key:` prefixes. Example: clip:3m4s-3m14s,1h22m-1h33m,item:2-3 Rules: - Items are split on commas. - If an item begins with `key:` then key becomes active for subsequent items. - If an item has no `key:` prefix, it belongs to the last active key. - If no key has been set yet, values belong to default_key. """ out: Dict[str, List[str]] = {} if not isinstance(spec, str): spec = str(spec) text = spec.strip() if not text: return out active = (default_key or "").strip().lower() or "clip" key_pattern = re.compile(r"^([A-Za-z_][A-Za-z0-9_-]*)\s*:\s*(.*)$") for raw_piece in text.split(","): piece = raw_piece.strip() if not piece: continue m = key_pattern.match(piece) if m: active = (m.group(1) or "").strip().lower() or active value = (m.group(2) or "").strip() if value: out.setdefault(active, []).append(value) continue out.setdefault(active, []).append(piece) return out def _build_clip_sections_spec( self, clip_ranges: Optional[List[tuple[int, int]]], ) -> Optional[str]: """Convert parsed clip range into downloader spec (seconds).""" ranges: List[str] = [] if clip_ranges: for start_s, end_s in clip_ranges: ranges.append(f"{start_s}-{end_s}") return ",".join(ranges) if ranges else None def _build_pipe_object(self, download_result: Any, url: str, opts: DownloadOptions) -> Dict[str, Any]: """Create a PipeObject-compatible dict from a DownloadMediaResult.""" info: Dict[str, Any] = download_result.info if isinstance(download_result.info, dict) else {} media_path = Path(download_result.path) hash_value = download_result.hash_value or self._compute_file_hash(media_path) title = info.get("title") or media_path.stem tag = list(download_result.tag or []) # Add title tag for searchability if title and f"title:{title}" not in tag: tag.insert(0, f"title:{title}") # Store the canonical URL for de-dup/search purposes. # Prefer yt-dlp's webpage_url, and do not mix in the raw requested URL (which may contain timestamps). final_url = None try: page_url = info.get("webpage_url") or info.get("original_url") or info.get("url") if page_url: final_url = str(page_url) except Exception: final_url = None if not final_url and url: final_url = str(url) # Construct canonical PipeObject dict: hash, store, path, url, title, tags # Prefer explicit backend names (storage_name/storage_location). If none, default to PATH # which indicates the file is available at a filesystem path and hasn't been added to a backend yet. return { "path": str(media_path), "hash": hash_value, "title": title, "url": final_url, "tag": tag, "action": "cmdlet:download-media", "is_temp": True, "ytdl_format": getattr(opts, "ytdl_format", None), # download_mode removed (deprecated), keep media_kind "store": getattr(opts, "storage_name", None) or getattr(opts, "storage_location", None) or "PATH", "media_kind": "video" if opts.mode == "video" else "audio", } @staticmethod def _normalise_hash_hex(value: Optional[str]) -> Optional[str]: if not value or not isinstance(value, str): return None candidate = value.strip().lower() if len(candidate) == 64 and all(c in "0123456789abcdef" for c in candidate): return candidate return None @classmethod def _extract_hash_from_search_hit(cls, hit: Any) -> Optional[str]: if not isinstance(hit, dict): return None for key in ("hash", "hash_hex", "file_hash", "hydrus_hash"): v = hit.get(key) normalized = cls._normalise_hash_hex(str(v) if v is not None else None) if normalized: return normalized return None @classmethod def _find_existing_hash_for_url( cls, storage: Any, canonical_url: str, *, hydrus_available: bool, ) -> Optional[str]: """Best-effort lookup of an existing stored item hash by url:. Used to make the stored source video the king for multi-clip relationships. """ if storage is None or not canonical_url: return None try: from Store.HydrusNetwork import HydrusNetwork except Exception: HydrusNetwork = None # type: ignore try: backend_names = list(storage.list_searchable_backends() or []) except Exception: backend_names = [] for backend_name in backend_names: try: backend = storage[backend_name] except Exception: continue try: if str(backend_name).strip().lower() == "temp": continue except Exception: pass try: if HydrusNetwork is not None and isinstance(backend, HydrusNetwork) and not hydrus_available: continue except Exception: pass try: hits = backend.search(f"url:{canonical_url}", limit=5) or [] except Exception: hits = [] for hit in hits: extracted = cls._extract_hash_from_search_hit(hit) if extracted: return extracted return None @staticmethod def _format_timecode(seconds: int, *, force_hours: bool) -> str: total = max(0, int(seconds)) minutes, secs = divmod(total, 60) hours, minutes = divmod(minutes, 60) if force_hours: return f"{hours:02d}:{minutes:02d}:{secs:02d}" return f"{minutes:02d}:{secs:02d}" @classmethod def _format_clip_range(cls, start_s: int, end_s: int) -> str: force_hours = bool(start_s >= 3600 or end_s >= 3600) return f"{cls._format_timecode(start_s, force_hours=force_hours)}-{cls._format_timecode(end_s, force_hours=force_hours)}" @classmethod def _apply_clip_decorations( cls, pipe_objects: List[Dict[str, Any]], clip_ranges: List[tuple[int, int]], *, source_king_hash: Optional[str], ) -> None: """Apply clip:{range} tags/titles and relationship metadata for multi-clip downloads. - Sets the clip title (and title: tag) to exactly `clip:{range}`. - Adds `clip:{range}` tag. - Sets `relationships` on each emitted item (king hash first, then alt hashes) so downstream can persist relationships into a DB/API without storing relationship tags. """ if not pipe_objects or len(pipe_objects) != len(clip_ranges): return # Always apply clip titles/tags (even for a single clip). for po, (start_s, end_s) in zip(pipe_objects, clip_ranges): clip_range = cls._format_clip_range(start_s, end_s) clip_tag = f"clip:{clip_range}" # Title: make it generic/consistent for clips. po["title"] = clip_tag tags = po.get("tag") if not isinstance(tags, list): tags = [] # Replace any existing title: tags with title: tags = [t for t in tags if not str(t).strip().lower().startswith("title:")] # Relationships must not be stored as tags. tags = [t for t in tags if not str(t).strip().lower().startswith("relationship:")] tags.insert(0, f"title:{clip_tag}") # Ensure clip tag exists if clip_tag not in tags: tags.append(clip_tag) po["tag"] = tags # Relationship tagging only makes sense when multiple clips exist. if len(pipe_objects) < 2: return hashes: List[str] = [] for po in pipe_objects: h = cls._normalise_hash_hex(str(po.get("hash") or "")) hashes.append(h or "") # Determine king: prefer an existing source video hash if present; else first clip becomes king. king_hash = cls._normalise_hash_hex(source_king_hash) if source_king_hash else None if not king_hash: king_hash = hashes[0] if hashes and hashes[0] else None if not king_hash: return alt_hashes: List[str] = [h for h in hashes if h and h != king_hash] if not alt_hashes: return # Carry relationship metadata through the pipeline without using tags. rel_payload = {"king": [king_hash], "alt": list(alt_hashes)} for po in pipe_objects: po["relationships"] = {"king": [king_hash], "alt": list(alt_hashes)} def _compute_file_hash(self, filepath: Path) -> str: """Compute SHA256 hash of a file.""" import hashlib sha256_hash = hashlib.sha256() with open(filepath, "rb") as f: for byte_block in iter(lambda: f.read(4096), b""): sha256_hash.update(byte_block) return sha256_hash.hexdigest() # Module-level singleton registration CMDLET = Download_Media()