"""Download media from url using yt-dlp (streaming sites only).
Focused cmdlet for video/audio downloads from yt-dlp-supported sites:
- YouTube, Twitch, Dailymotion, Vimeo, etc.
- No direct file downloads (use download-file for that)
- Playlist detection with item selection
- Clip extraction (time ranges)
- Format selection and audio/video modes
- Tags extraction and metadata integration
"""
from __future__ import annotations
import glob  # noqa: F401
import hashlib
import json  # noqa: F401
import random
import re
import string
import subprocess
import sys
import tempfile
import time
import traceback
from contextlib import AbstractContextManager, nullcontext
from pathlib import Path
from typing import Any, Dict, Iterator, List, Optional, Sequence, cast
from urllib.parse import urlparse

from SYS.logger import log, debug
from SYS.pipeline_progress import PipelineProgress
from SYS.utils import sha256_file
from models import DownloadError, DownloadOptions, DownloadMediaResult, DebugLogger, ProgressBar
import pipeline as pipeline_context
from result_table import ResultTable
from rich.prompt import Confirm
from rich_display import stderr_console as get_stderr_console
from tool.ytdlp import YtDlpTool

from . import _shared as sh

QueryArg = sh.QueryArg
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
SharedArgs = sh.SharedArgs
create_pipe_object_result = sh.create_pipe_object_result
parse_cmdlet_args = sh.parse_cmdlet_args
register_url_with_local_library = sh.register_url_with_local_library
coerce_to_pipe_object = sh.coerce_to_pipe_object
get_field = sh.get_field
def _live_ui_and_pipe_index() -> tuple[Optional[Any], int]:
    ui = None
    try:
        ui = (
            pipeline_context.get_live_progress()
            if hasattr(pipeline_context, "get_live_progress")
            else None
        )
    except Exception:
        ui = None
    pipe_idx: int = 0
    try:
        stage_ctx = (
            pipeline_context.get_stage_context()
            if hasattr(pipeline_context, "get_stage_context")
            else None
        )
        maybe_idx = getattr(stage_ctx, "pipe_index", None) if stage_ctx is not None else None
        if isinstance(maybe_idx, int):
            pipe_idx = int(maybe_idx)
    except Exception:
        pipe_idx = 0
    return ui, pipe_idx
def _begin_live_steps(total_steps: int) -> None:
"""Declare the total number of steps for the current pipe."""
ui, pipe_idx = _live_ui_and_pipe_index()
if ui is None:
return
try:
begin = getattr(ui, "begin_pipe_steps", None)
if callable(begin):
begin(int(pipe_idx), total_steps=int(total_steps))
except Exception:
return
def _step(text: str) -> None:
"""Emit a *new* step (increments i/N and advances percent automatically)."""
ui, pipe_idx = _live_ui_and_pipe_index()
if ui is None:
return
try:
adv = getattr(ui, "advance_pipe_step", None)
if callable(adv):
adv(int(pipe_idx), str(text))
except Exception:
return
def _set_pipe_percent(percent: int) -> None:
"""Best-effort percent update without changing step text."""
ui, pipe_idx = _live_ui_and_pipe_index()
if ui is None:
return
try:
set_pct = getattr(ui, "set_pipe_percent", None)
if callable(set_pct):
set_pct(int(pipe_idx), int(percent))
except Exception:
return
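# Typical use in this cmdlet (illustrative sketch, not the only call pattern):
# declare the step count once, then advance through named steps while nudging
# the percentage for long-running downloads:
#   _begin_live_steps(2)
#   _step("downloading")       # step 1/2
#   _set_pipe_percent(75)      # finer-grained movement within the step
#   _step("finalizing")        # step 2/2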
def _print_table_suspended(table: Any) -> None:
"""Print a Rich table while pausing Live progress if active."""
suspend = getattr(pipeline_context, "suspend_live_progress", None)
cm: AbstractContextManager[Any] = nullcontext()
if callable(suspend):
try:
maybe_cm = suspend()
if maybe_cm is not None:
cm = maybe_cm # type: ignore[assignment]
except Exception:
cm = nullcontext()
with cm:
get_stderr_console().print(table)
# Minimal inlined helpers from helper/download.py (is_url_supported_by_ytdlp, list_formats)
try:
import yt_dlp # type: ignore
from yt_dlp.extractor import gen_extractors # type: ignore
except Exception as exc:
yt_dlp = None # type: ignore
gen_extractors = None # type: ignore
YTDLP_IMPORT_ERROR = exc
else:
YTDLP_IMPORT_ERROR = None
try:
from metadata import extract_ytdlp_tags
except ImportError:
extract_ytdlp_tags = None
_EXTRACTOR_CACHE: List[Any] | None = None
# Reused progress formatter for yt-dlp callbacks (stderr only).
_YTDLP_PROGRESS_BAR = ProgressBar()
_SUBTITLE_EXTS = (".vtt", ".srt", ".ass", ".ssa", ".lrc")
def _format_chapters_note(info: Dict[str, Any]) -> Optional[str]:
"""Format yt-dlp chapter metadata into a stable, note-friendly text.
Output is one chapter per line, e.g.:
00:00 Intro
01:23-02:10 Topic name
"""
try:
chapters = info.get("chapters")
except Exception:
chapters = None
if not isinstance(chapters, list) or not chapters:
return None
rows: List[tuple[int, Optional[int], str]] = []
max_t = 0
for ch in chapters:
if not isinstance(ch, dict):
continue
start_raw = ch.get("start_time")
end_raw = ch.get("end_time")
title_raw = ch.get("title") or ch.get("name") or ch.get("chapter")
try:
if start_raw is None:
continue
start_s = int(float(start_raw))
except Exception:
continue
end_s: Optional[int] = None
try:
if end_raw is not None:
end_s = int(float(end_raw))
except Exception:
end_s = None
title = str(title_raw).strip() if title_raw is not None else ""
rows.append((start_s, end_s, title))
try:
max_t = max(max_t, start_s, end_s or 0)
except Exception:
max_t = max(max_t, start_s)
if not rows:
return None
force_hours = bool(max_t >= 3600)
def _tc(seconds: int) -> str:
total = max(0, int(seconds))
minutes, secs = divmod(total, 60)
hours, minutes = divmod(minutes, 60)
if force_hours:
return f"{hours:02d}:{minutes:02d}:{secs:02d}"
return f"{minutes:02d}:{secs:02d}"
lines: List[str] = []
for start_s, end_s, title in sorted(
rows, key=lambda r: (r[0], r[1] if r[1] is not None else 10**9, r[2])
):
if end_s is not None and end_s > start_s:
prefix = f"{_tc(start_s)}-{_tc(end_s)}"
else:
prefix = _tc(start_s)
line = f"{prefix} {title}".strip()
if line:
lines.append(line)
text = "\n".join(lines).strip()
return text or None
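# Illustrative mapping: chapters such as
#   [{"start_time": 0, "title": "Intro"}, {"start_time": 83, "end_time": 130, "title": "Topic"}]
# are rendered as
#   00:00 Intro
#   01:23-02:10 Topic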
def _best_subtitle_sidecar(media_path: Path) -> Optional[Path]:
"""Find the most likely subtitle sidecar file for a downloaded media file."""
try:
base_dir = media_path.parent
stem = media_path.stem
if not stem:
return None
candidates: List[Path] = []
for p in base_dir.glob(stem + ".*"):
try:
if not p.is_file():
continue
except Exception:
continue
if p.suffix.lower() in _SUBTITLE_EXTS:
candidates.append(p)
# Prefer VTT then SRT then others.
preferred_order = [".vtt", ".srt", ".ass", ".ssa", ".lrc"]
for ext in preferred_order:
for p in candidates:
if p.suffix.lower() == ext:
return p
return candidates[0] if candidates else None
except Exception:
return None
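# Illustrative: for "clip.mkv" this prefers a sibling "clip.en.vtt" over
# "clip.srt" when both exist, following the preference order above.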
def _read_text_file(path: Path) -> Optional[str]:
    try:
        return path.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        return None
def _ensure_yt_dlp_ready() -> None:
    if YTDLP_IMPORT_ERROR is not None:
        raise DownloadError(f"yt-dlp import error: {YTDLP_IMPORT_ERROR}")
    if yt_dlp is None:
        raise DownloadError("yt-dlp is not available")
def _get_extractors() -> List[Any]:
global _EXTRACTOR_CACHE
if _EXTRACTOR_CACHE is not None:
return _EXTRACTOR_CACHE
_ensure_yt_dlp_ready()
assert gen_extractors is not None
try:
_EXTRACTOR_CACHE = list(gen_extractors())
except Exception:
_EXTRACTOR_CACHE = []
return _EXTRACTOR_CACHE
def is_url_supported_by_ytdlp(url: str) -> bool:
    if not url or not isinstance(url, str):
        return False
    if YTDLP_IMPORT_ERROR is not None:
        return False
    try:
        parsed = urlparse(url)
        if not parsed.scheme or not parsed.netloc:
            return False
    except Exception:
        return False
    try:
        for ie in _get_extractors():
            try:
                if ie.suitable(url) and ie.IE_NAME != "generic":
                    return True
            except Exception:
                continue
    except Exception:
        return False
    return False
def list_formats(
    url: str,
    *,
    no_playlist: bool = False,
    playlist_items: Optional[str] = None,
    cookiefile: Optional[str] = None,
) -> Optional[List[Dict[str, Any]]]:
    if not is_url_supported_by_ytdlp(url):
        return None
    _ensure_yt_dlp_ready()
    assert yt_dlp is not None
    ydl_opts: Dict[str, Any] = {
        "quiet": True,
        "no_warnings": True,
        "skip_download": True,
        "noprogress": True,
    }
    if cookiefile:
        ydl_opts["cookiefile"] = str(cookiefile)
    if no_playlist:
        ydl_opts["noplaylist"] = True
    if playlist_items:
        ydl_opts["playlist_items"] = str(playlist_items)
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:  # type: ignore[arg-type]
            info = ydl.extract_info(url, download=False)
    except Exception:
        return None
    if not isinstance(info, dict):
        return None
    formats = info.get("formats")
    if not isinstance(formats, list):
        return None
    out: List[Dict[str, Any]] = []
    for f in formats:
        if isinstance(f, dict):
            out.append(f)
    return out
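# Each entry is a raw yt-dlp format dict; the keys consumed elsewhere in this
# module are format_id, ext, format_note, vcodec and acodec, e.g. (illustrative):
#   {"format_id": "137", "ext": "mp4", "vcodec": "avc1.640028", "acodec": "none", ...}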
def _download_with_sections_via_cli(
    url: str,
    ytdl_options: Dict[str, Any],
    sections: List[str],
    quiet: bool = False,
) -> tuple[Optional[str], Dict[str, Any]]:
    sections_list = ytdl_options.get("download_sections", [])
    if not sections_list:
        return "", {}
    session_id = hashlib.md5(
        (url + str(time.time()) + "".join(random.choices(string.ascii_letters, k=10))).encode()
    ).hexdigest()[:12]
    first_section_info = None
    total_sections = len(sections_list)
    for section_idx, section in enumerate(sections_list, 1):
        # While step 1/2 is "downloading", keep the pipe bar moving for multi-section clips.
        # Map sections onto 50..99 so step 2/2 can still jump to 100.
        try:
            if total_sections > 0:
                pct = 50 + int(((section_idx - 1) / max(1, total_sections)) * 49)
                _set_pipe_percent(pct)
        except Exception:
            pass
base_outtmpl = ytdl_options.get("outtmpl", "%(title)s.%(ext)s")
output_dir_path = Path(base_outtmpl).parent
filename_tmpl = f"{session_id}_{section_idx}"
if base_outtmpl.endswith(".%(ext)s"):
filename_tmpl += ".%(ext)s"
section_outtmpl = str(output_dir_path / filename_tmpl)
if section_idx == 1:
metadata_cmd = ["yt-dlp", "--dump-json", "--skip-download"]
if ytdl_options.get("cookiefile"):
cookies_path = ytdl_options["cookiefile"].replace("\\", "/")
metadata_cmd.extend(["--cookies", cookies_path])
if ytdl_options.get("noplaylist"):
metadata_cmd.append("--no-playlist")
metadata_cmd.append(url)
try:
meta_result = subprocess.run(
metadata_cmd,
capture_output=True,
text=True
)
if meta_result.returncode == 0 and meta_result.stdout:
try:
info_dict = json.loads(meta_result.stdout.strip())
first_section_info = info_dict
if not quiet:
debug(
f"Extracted title from metadata: {info_dict.get('title')}"
)
except json.JSONDecodeError:
if not quiet:
debug("Could not parse JSON metadata")
except Exception as e:
if not quiet:
debug(f"Error extracting metadata: {e}")
cmd = ["yt-dlp"]
if quiet:
cmd.append("--quiet")
cmd.append("--no-warnings")
cmd.append("--no-progress")
# Keep ffmpeg/merger output from taking over the terminal.
cmd.extend(["--postprocessor-args", "ffmpeg:-hide_banner -loglevel error"])
if ytdl_options.get("ffmpeg_location"):
try:
cmd.extend(["--ffmpeg-location", str(ytdl_options["ffmpeg_location"])])
except Exception:
pass
if ytdl_options.get("format"):
cmd.extend(["-f", ytdl_options["format"]])
if ytdl_options.get("merge_output_format"):
cmd.extend(
["--merge-output-format",
str(ytdl_options["merge_output_format"])]
)
2025-12-16 23:23:43 -08:00
# For CLI downloads, infer chapter/metadata embedding from either legacy flags
# or explicit FFmpegMetadata postprocessor entries.
postprocessors = ytdl_options.get("postprocessors")
want_add_metadata = bool(ytdl_options.get("addmetadata"))
want_embed_chapters = bool(ytdl_options.get("embedchapters"))
if isinstance(postprocessors, list):
for pp in postprocessors:
if not isinstance(pp, dict):
continue
if str(pp.get("key") or "") == "FFmpegMetadata":
want_add_metadata = True
if bool(pp.get("add_chapters", True)):
want_embed_chapters = True
if want_add_metadata:
cmd.append("--add-metadata")
if want_embed_chapters:
cmd.append("--embed-chapters")
if ytdl_options.get("writesubtitles"):
cmd.append("--write-sub")
cmd.append("--write-auto-sub")
cmd.extend(["--sub-format", "vtt"])
if ytdl_options.get("force_keyframes_at_cuts"):
cmd.append("--force-keyframes-at-cuts")
cmd.extend(["-o", section_outtmpl])
if ytdl_options.get("cookiefile"):
cookies_path = ytdl_options["cookiefile"].replace("\\", "/")
cmd.extend(["--cookies", cookies_path])
if ytdl_options.get("noplaylist"):
cmd.append("--no-playlist")
# Apply clip/section selection
cmd.extend(["--download-sections", section])
cmd.append(url)
if not quiet:
debug(f"Running yt-dlp for section: {section}")
try:
if quiet:
subprocess.run(cmd, check=True, capture_output=True, text=True)
else:
subprocess.run(cmd, check=True)
except subprocess.CalledProcessError as exc:
stderr_text = exc.stderr or ""
tail = "\n".join(stderr_text.splitlines()[-12:]).strip()
details = f"\n{tail}" if tail else ""
raise DownloadError(
f"yt-dlp failed for section {section} (exit {exc.returncode}){details}"
) from exc
except Exception as exc:
raise DownloadError(f"yt-dlp failed for section {section}: {exc}") from exc
# Mark near-complete before returning so the runner can finalize cleanly.
try:
_set_pipe_percent(99)
except Exception:
pass
return session_id, first_section_info or {}
def _iter_download_entries(info: Dict[str, Any]) -> Iterator[Dict[str, Any]]:
queue: List[Dict[str, Any]] = [info]
seen: set[int] = set()
while queue:
current = queue.pop(0)
obj_id = id(current)
if obj_id in seen:
continue
seen.add(obj_id)
entries = current.get("entries")
if isinstance(entries, list):
for entry in entries:
queue.append(entry)
if current.get("requested_downloads") or not entries:
yield current
def _candidate_paths(entry: Dict[str, Any], output_dir: Path) -> Iterator[Path]:
requested = entry.get("requested_downloads")
if isinstance(requested, list):
for item in requested:
if isinstance(item, dict):
fp = item.get("filepath") or item.get("_filename")
if fp:
yield Path(fp)
for key in ("filepath", "_filename", "filename"):
value = entry.get(key)
if value:
yield Path(value)
if entry.get("filename"):
yield output_dir / entry["filename"]
def _resolve_entry_and_path(info: Dict[str, Any], output_dir: Path) -> tuple[Dict[str, Any], Path]:
for entry in _iter_download_entries(info):
for candidate in _candidate_paths(entry, output_dir):
if candidate.is_file():
return entry, candidate
if not candidate.is_absolute():
maybe = output_dir / candidate
if maybe.is_file():
return entry, maybe
raise FileNotFoundError("yt-dlp did not report a downloaded media file")
def _resolve_entries_and_paths(info: Dict[str, Any], output_dir: Path) -> List[tuple[Dict[str, Any], Path]]:
resolved: List[tuple[Dict[str, Any], Path]] = []
seen: set[str] = set()
for entry in _iter_download_entries(info):
chosen: Optional[Path] = None
for candidate in _candidate_paths(entry, output_dir):
if candidate.is_file():
chosen = candidate
break
if not candidate.is_absolute():
maybe = output_dir / candidate
if maybe.is_file():
chosen = maybe
break
if chosen is None:
continue
key = str(chosen.resolve())
if key in seen:
continue
seen.add(key)
resolved.append((entry, chosen))
return resolved
def _extract_sha256(info: Dict[str, Any]) -> Optional[str]:
for payload in [info] + info.get("entries", []):
if not isinstance(payload, dict):
continue
hashes = payload.get("hashes")
if isinstance(hashes, dict):
for key in ("sha256", "sha-256", "sha_256"):
if key in hashes and isinstance(hashes[key],
str) and hashes[key].strip():
return hashes[key].strip()
for key in ("sha256", "sha-256", "sha_256"):
value = payload.get(key)
if isinstance(value, str) and value.strip():
return value.strip()
return None
def _progress_callback(status: Dict[str, Any]) -> None:
"""Simple progress callback using logger."""
event = status.get("status")
if event == "downloading":
# Always print progress to stderr so piped stdout remains clean.
percent = status.get("_percent_str")
downloaded = status.get("downloaded_bytes")
total = status.get("total_bytes") or status.get("total_bytes_estimate")
speed = status.get("_speed_str")
eta = status.get("_eta_str")
_YTDLP_PROGRESS_BAR.update(
downloaded=int(downloaded) if downloaded is not None else None,
total=int(total) if total is not None else None,
label="download",
file=sys.stderr,
)
elif event == "finished":
_YTDLP_PROGRESS_BAR.finish()
elif event in ("postprocessing", "processing"):
return
def probe_url(
url: str,
no_playlist: bool = False,
timeout_seconds: int = 15,
*,
cookiefile: Optional[str] = None,
) -> Optional[Dict[str, Any]]:
"""Probe URL to extract metadata WITHOUT downloading.
Args:
url: URL to probe
no_playlist: If True, ignore playlists and probe only the single video
timeout_seconds: Max seconds to wait for probe (default 15s)
Returns:
Dict with keys: extractor, title, entries (if playlist), duration, etc.
Returns None if not supported by yt-dlp or on timeout.
"""
if not is_url_supported_by_ytdlp(url):
return None
# Wrap probe in timeout to prevent hanging on large playlists
import threading
from typing import cast
result_container: List[Optional[Any]] = [None, None] # [result, error]
def _do_probe() -> None:
try:
_ensure_yt_dlp_ready()
assert yt_dlp is not None
# Extract info without downloading
# Use extract_flat='in_playlist' to get full metadata for playlist items
ydl_opts = {
"quiet": True, # Suppress all output
"no_warnings": True,
"socket_timeout": 10,
"retries": 2, # Reduce retries for faster timeout
"skip_download": True, # Don't actually download
"extract_flat": "in_playlist", # Get playlist with metadata for each entry
"noprogress": True, # No progress bars
}
if cookiefile:
ydl_opts["cookiefile"] = str(cookiefile)
# Add no_playlist option if specified
if no_playlist:
ydl_opts["noplaylist"] = True
with yt_dlp.YoutubeDL(ydl_opts) as ydl: # type: ignore[arg-type]
info = ydl.extract_info(url, download=False)
if not isinstance(info, dict):
result_container[0] = None
return
# Extract relevant fields
            webpage_url = info.get("webpage_url") or info.get("original_url") or info.get("url")
2025-12-11 12:47:30 -08:00
result_container[0] = {
"extractor": info.get("extractor", ""),
"title": info.get("title", ""),
"entries": info.get("entries", []), # Will be populated if playlist
"duration": info.get("duration"),
"uploader": info.get("uploader"),
"description": info.get("description"),
# Keep both the requested and canonical URL forms; callers should prefer webpage_url.
"requested_url": url,
"webpage_url": webpage_url,
}
except Exception as exc:
log(f"Probe error for {url}: {exc}")
result_container[1] = exc
thread = threading.Thread(target=_do_probe, daemon=False)
thread.start()
thread.join(timeout=timeout_seconds)
if thread.is_alive():
        # Probe timed out - return None and let the caller decide (raise or fall back)
debug(
f"Probe timeout for {url} (>={timeout_seconds}s), proceeding with download"
)
return None
if result_container[1] is not None:
# Probe error - return None to proceed anyway
return None
return cast(Optional[Dict[str, Any]], result_container[0])
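# Illustrative: probe before committing to a download (cookiefile optional):
#   info = probe_url("https://youtu.be/VIDEO", no_playlist=True)
#   if info and info.get("entries"):
#       ...  # playlist detected; offer item selection before downloading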
def download_media(
opts: DownloadOptions,
*,
debug_logger: Optional[DebugLogger] = None,
) -> Any:
"""Download streaming media exclusively via yt-dlp.
Args:
opts: DownloadOptions with url, mode, output_dir, etc.
debug_logger: Optional debug logger for troubleshooting
Returns:
DownloadMediaResult with path, info, tags, hash
Raises:
DownloadError: If the URL is unsupported or yt-dlp detects no media
"""
# Handle GoFile shares before yt-dlp (they remain unsupported)
try:
netloc = urlparse(opts.url).netloc.lower()
except Exception:
netloc = ""
if "gofile.io" in netloc:
msg = "GoFile links are currently unsupported"
if not opts.quiet:
debug(msg)
if debug_logger is not None:
debug_logger.write_record("gofile-unsupported",
{
"url": opts.url
})
2025-12-11 12:47:30 -08:00
raise DownloadError(msg)
# Determine if yt-dlp should be used
ytdlp_supported = is_url_supported_by_ytdlp(opts.url)
if not ytdlp_supported:
msg = "URL not supported by yt-dlp; try download-file for manual downloads"
if not opts.quiet:
log(msg)
if debug_logger is not None:
debug_logger.write_record("ytdlp-unsupported",
{
"url": opts.url
})
2025-12-20 23:57:44 -08:00
raise DownloadError(msg)
# Skip probe for playlists with item selection (probe can hang on large playlists)
# Just proceed straight to download which will handle item selection
if opts.playlist_items:
        debug(
            f"Skipping probe for playlist (item selection: {opts.playlist_items}), proceeding with download"
        )
        probe_result = {"url": opts.url}  # Minimal probe result
else:
probe_cookiefile = None
try:
if opts.cookies_path and opts.cookies_path.is_file():
probe_cookiefile = str(opts.cookies_path)
except Exception:
probe_cookiefile = None
probe_result = probe_url(
opts.url,
no_playlist=opts.no_playlist,
timeout_seconds=15,
cookiefile=probe_cookiefile
)
if probe_result is None:
msg = "yt-dlp could not detect media for this URL; use download-file for direct downloads"
2025-12-11 12:47:30 -08:00
if not opts.quiet:
log(msg)
2025-12-11 12:47:30 -08:00
if debug_logger is not None:
debug_logger.write_record("ytdlp-skip-no-media",
{
"url": opts.url
})
raise DownloadError(msg)
_ensure_yt_dlp_ready()
ytdlp_tool = YtDlpTool()
ytdl_options = ytdlp_tool.build_ytdlp_options(opts)
hooks = ytdl_options.get("progress_hooks")
if not isinstance(hooks, list):
hooks = []
ytdl_options["progress_hooks"] = hooks
if _progress_callback not in hooks:
hooks.append(_progress_callback)
if not opts.quiet:
debug(f"Starting yt-dlp download: {opts.url}")
if debug_logger is not None:
debug_logger.write_record("ytdlp-start",
{
"url": opts.url
})
2025-12-11 12:47:30 -08:00
assert yt_dlp is not None
try:
# Debug: show what options we're using
if not opts.quiet:
if ytdl_options.get("download_sections"):
debug(
f"[yt-dlp] download_sections: {ytdl_options['download_sections']}"
)
debug(
f"[yt-dlp] force_keyframes_at_cuts: {ytdl_options.get('force_keyframes_at_cuts', False)}"
)
# Use subprocess when download_sections are present (Python API doesn't support them properly)
session_id = None
first_section_info = {}
if ytdl_options.get("download_sections"):
# For clip (download_sections), keep pipeline Live UI active and suppress
# yt-dlp/ffmpeg CLI spam when running in quiet/pipeline mode.
live_ui, _ = PipelineProgress(pipeline_context).ui_and_pipe_index()
quiet_sections = bool(opts.quiet) or (live_ui is not None)
session_id, first_section_info = _download_with_sections_via_cli(
opts.url,
ytdl_options,
ytdl_options.get("download_sections", []),
quiet=quiet_sections,
)
info = None
else:
with yt_dlp.YoutubeDL(ytdl_options) as ydl: # type: ignore[arg-type]
info = ydl.extract_info(opts.url, download=True)
except Exception as exc:
log(f"yt-dlp failed: {exc}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"exception",
{
"phase": "yt-dlp",
"error": str(exc),
"traceback": traceback.format_exc(),
},
)
raise DownloadError("yt-dlp download failed") from exc
# If we used subprocess, we need to find the file manually
if info is None:
# Find files created/modified during this download (after we started)
# Look for files matching the expected output template pattern
try:
import glob
import time
import re
# Get the expected filename pattern from outtmpl
# For sections: "C:\path\{session_id}.section_1_of_3.ext", etc.
# For non-sections: "C:\path\title.ext"
# Wait a moment to ensure files are fully written
time.sleep(0.5)
# List all files in output_dir, sorted by modification time
files = sorted(
opts.output_dir.iterdir(),
key=lambda p: p.stat().st_mtime,
reverse=True
)
if not files:
raise FileNotFoundError(f"No files found in {opts.output_dir}")
# If we downloaded sections, look for files with the session_id pattern
if opts.clip_sections and session_id:
# Pattern: "{session_id}_1.ext", "{session_id}_2.ext", etc.
# Also includes sidecars like "{session_id}_1.en.vtt".
section_pattern = re.compile(rf"^{re.escape(session_id)}_(\d+)")
matching_files = [f for f in files if section_pattern.search(f.name)]
if matching_files:
# Sort by section number to ensure correct order
def extract_section_num(path: Path) -> int:
match = section_pattern.search(path.name)
return int(match.group(1)) if match else 999
matching_files.sort(key=extract_section_num)
debug(
f"Found {len(matching_files)} section file(s) matching pattern"
)
# Now rename section *media* files to use hash-based names.
# Sidecars (subtitles) are renamed to match the media hash so they can be
# attached as notes later (and not emitted as separate pipeline items).
by_index: Dict[int,
List[Path]] = {}
for f in matching_files:
m = section_pattern.search(f.name)
if not m:
continue
try:
n = int(m.group(1))
except Exception:
continue
by_index.setdefault(n, []).append(f)
renamed_media_files: List[Path] = []
for sec_num in sorted(by_index.keys()):
group = by_index.get(sec_num) or []
if not group:
continue
def _is_subtitle(p: Path) -> bool:
try:
return p.suffix.lower() in _SUBTITLE_EXTS
except Exception:
return False
media_candidates = [p for p in group if not _is_subtitle(p)]
subtitle_candidates = [p for p in group if _is_subtitle(p)]
# Pick the primary media file for this section.
# Prefer non-json, non-info sidecars.
media_file: Optional[Path] = None
for cand in media_candidates:
try:
if cand.suffix.lower() in {".json",
".info.json"}:
continue
except Exception:
pass
media_file = cand
break
if media_file is None and media_candidates:
media_file = media_candidates[0]
if media_file is None:
# No media file found for this section; skip.
continue
try:
media_hash = sha256_file(media_file)
except Exception as e:
debug(
f"Failed to hash section media file {media_file.name}: {e}"
)
renamed_media_files.append(media_file)
continue
# Preserve any suffix tail after the section index so language tags survive.
# Example: <session>_1.en.vtt -> <hash>.en.vtt
prefix = f"{session_id}_{sec_num}"
def _tail(name: str) -> str:
try:
if name.startswith(prefix):
return name[len(prefix):]
except Exception:
pass
# Fallback: keep just the last suffix.
try:
return Path(name).suffix
except Exception:
return ""
# Rename media file to <hash><tail> (tail typically like .mkv).
try:
new_media_name = f"{media_hash}{_tail(media_file.name)}"
new_media_path = opts.output_dir / new_media_name
if new_media_path.exists() and new_media_path != media_file:
debug(
f"File with hash {media_hash} already exists, using existing file."
)
try:
media_file.unlink()
except OSError:
pass
else:
media_file.rename(new_media_path)
debug(
f"Renamed section file: {media_file.name} -> {new_media_name}"
)
renamed_media_files.append(new_media_path)
except Exception as e:
debug(
f"Failed to rename section media file {media_file.name}: {e}"
)
renamed_media_files.append(media_file)
new_media_path = media_file
# Rename subtitle sidecars to match media hash for later note attachment.
for sub_file in subtitle_candidates:
try:
new_sub_name = f"{media_hash}{_tail(sub_file.name)}"
new_sub_path = opts.output_dir / new_sub_name
if new_sub_path.exists() and new_sub_path != sub_file:
try:
sub_file.unlink()
except OSError:
pass
else:
sub_file.rename(new_sub_path)
debug(
f"Renamed section file: {sub_file.name} -> {new_sub_name}"
)
except Exception as e:
debug(
f"Failed to rename section subtitle file {sub_file.name}: {e}"
)
media_path = (
renamed_media_files[0]
if renamed_media_files else matching_files[0]
)
media_paths = renamed_media_files if renamed_media_files else None
if not opts.quiet:
count = len(media_paths) if isinstance(media_paths, list) else 1
debug(
f"✓ Downloaded {count} section media file(s) (session: {session_id})"
)
else:
# Fallback to most recent file if pattern not found
media_path = files[0]
media_paths = None
if not opts.quiet:
debug(
f"✓ Downloaded section file (pattern not found): {media_path.name}"
)
else:
# No sections, just take the most recent file
media_path = files[0]
media_paths = None
if not opts.quiet:
debug(f"✓ Downloaded: {media_path.name}")
if debug_logger is not None:
debug_logger.write_record(
"ytdlp-file-found",
{
"path": str(media_path)
}
)
except Exception as exc:
log(f"Error finding downloaded file: {exc}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"exception",
{
"phase": "find-file",
"error": str(exc)
},
)
raise DownloadError(str(exc)) from exc
# Create result with minimal data extracted from filename
file_hash = sha256_file(media_path)
# For section downloads, create tags with the title and build proper info dict
tags = []
title = ""
if first_section_info:
title = first_section_info.get("title", "")
if title:
tags.append(f"title:{title}")
debug(f"Added title tag for section download: {title}")
# Build info dict - always use extracted title if available, not hash
if first_section_info:
info_dict = first_section_info
else:
info_dict = {
"id": media_path.stem,
"title": title or media_path.stem,
"ext": media_path.suffix.lstrip("."),
}
return DownloadMediaResult(
path=media_path,
info=info_dict,
tag=tags,
source_url=opts.url,
hash_value=file_hash,
paths=media_paths, # Include all section files if present
)
if not isinstance(info, dict):
log(f"Unexpected yt-dlp response: {type(info)}", file=sys.stderr)
raise DownloadError("Unexpected yt-dlp response type")
info_dict: Dict[str, Any] = cast(Dict[str, Any], info)
if debug_logger is not None:
debug_logger.write_record(
"ytdlp-info",
{
"keys": sorted(info_dict.keys()),
"is_playlist": bool(info_dict.get("entries")),
},
)
# Playlist/album handling: resolve ALL downloaded entries and return multiple results.
# The cmdlet will emit one PipeObject per downloaded file.
if info_dict.get("entries") and not opts.no_playlist:
resolved = _resolve_entries_and_paths(info_dict, opts.output_dir)
if resolved:
results: List[DownloadMediaResult] = []
for entry, media_path in resolved:
hash_value = _extract_sha256(entry) or _extract_sha256(info_dict)
if not hash_value:
try:
hash_value = sha256_file(media_path)
except OSError:
hash_value = None
tags: List[str] = []
if extract_ytdlp_tags:
try:
tags = extract_ytdlp_tags(entry)
except Exception as e:
log(f"Error extracting tags: {e}", file=sys.stderr)
source_url = (
entry.get("webpage_url") or entry.get("original_url")
or entry.get("url") or opts.url
)
results.append(
DownloadMediaResult(
path=media_path,
info=entry,
tag=tags,
source_url=source_url,
hash_value=hash_value,
)
)
if not opts.quiet:
debug(f"✓ Downloaded playlist items: {len(results)}")
return results
try:
entry, media_path = _resolve_entry_and_path(info_dict, opts.output_dir)
except FileNotFoundError as exc:
log(f"Error: {exc}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"exception",
{
"phase": "resolve-path",
"error": str(exc)
},
)
raise DownloadError(str(exc)) from exc
if debug_logger is not None:
debug_logger.write_record(
"resolved-media",
{
"path": str(media_path),
"entry_keys": sorted(entry.keys())
},
)
# Extract hash from metadata or compute
hash_value = _extract_sha256(entry) or _extract_sha256(info_dict)
if not hash_value:
try:
hash_value = sha256_file(media_path)
except OSError as exc:
if debug_logger is not None:
debug_logger.write_record(
"hash-error",
{
"path": str(media_path),
"error": str(exc)
},
)
# Extract tags using metadata.py
tags = []
if extract_ytdlp_tags:
try:
tags = extract_ytdlp_tags(entry)
except Exception as e:
log(f"Error extracting tags: {e}", file=sys.stderr)
    source_url = entry.get("webpage_url") or entry.get("original_url") or entry.get("url")
if not opts.quiet:
debug(f"✓ Downloaded: {media_path.name} ({len(tags)} tags)")
if debug_logger is not None:
debug_logger.write_record(
"downloaded",
{
"path": str(media_path),
"tag_count": len(tags),
"source_url": source_url,
"sha256": hash_value,
},
)
return DownloadMediaResult(
path=media_path,
info=entry,
tag=tags,
source_url=source_url,
hash_value=hash_value,
)
# Timeout handler to prevent yt-dlp hangs
def _download_with_timeout(opts: DownloadOptions, timeout_seconds: int = 300) -> Any:
"""Download with timeout protection.
Args:
opts: DownloadOptions
timeout_seconds: Max seconds to wait (default 300s = 5 min)
Returns:
2025-12-13 00:18:30 -08:00
DownloadMediaResult or List[DownloadMediaResult]
Raises:
DownloadError: If timeout exceeded
"""
import threading
from typing import cast
result_container: List[Optional[Any]] = [None, None] # [result, error]
def _do_download() -> None:
try:
result_container[0] = download_media(opts)
except Exception as e:
result_container[1] = e
thread = threading.Thread(target=_do_download, daemon=False)
thread.start()
thread.join(timeout=timeout_seconds)
if thread.is_alive():
# Thread still running - timeout
raise DownloadError(
f"Download timeout after {timeout_seconds} seconds for {opts.url}"
)
if result_container[1] is not None:
raise cast(Exception, result_container[1])
if result_container[0] is None:
raise DownloadError(f"Download failed for {opts.url}")
return cast(Any, result_container[0])
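# Illustrative sketch (the exact DownloadOptions fields beyond url/output_dir
# are assumptions here): wrap a download in the timeout guard, 5 minutes default:
#   result = _download_with_timeout(DownloadOptions(url=..., output_dir=...), timeout_seconds=300)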
class Download_Media(Cmdlet):
"""Class-based download-media cmdlet - yt-dlp only, streaming sites."""
def __init__(self) -> None:
"""Initialize download-media cmdlet."""
super().__init__(
name="download-media",
summary="Download media from streaming sites (YouTube, Twitch, etc.)",
usage=
"download-media <url> [options] or search-file | download-media [options]",
alias=[""],
arg=[
SharedArgs.URL,
SharedArgs.QUERY,
CmdletArg(
name="audio",
type="flag",
alias="a",
description="Download audio only"
),
CmdletArg(
name="format",
type="string",
alias="fmt",
description="Explicit yt-dlp format selector",
),
QueryArg(
"clip",
key="clip",
aliases=["range",
"section",
"sections"],
type="string",
required=False,
description=(
"Clip time ranges via -query keyed fields (e.g. clip:1m-2m or clip:00:01-00:10). "
"Comma-separated values supported."
),
query_only=True,
),
CmdletArg(
name="item",
type="string",
description="Item selection for playlists/formats"
),
SharedArgs.PATH,
],
detail=[
"Download media from streaming sites using yt-dlp.",
"For direct file downloads, use download-file.",
],
exec=self.run,
)
self.register()
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"""Main execution method."""
stage_ctx = pipeline_context.get_stage_context()
        in_pipeline = stage_ctx is not None and getattr(stage_ctx, "total_stages", 1) > 1
if in_pipeline and isinstance(config, dict):
config["_quiet_background_output"] = True
return self._run_impl(result, args, config)
@staticmethod
def _normalize_urls(parsed: Dict[str, Any]) -> List[str]:
raw_url = parsed.get("url", [])
if isinstance(raw_url, str):
raw_url = [raw_url]
expanded_urls: List[str] = []
for u in raw_url or []:
if u is None:
continue
s = str(u).strip()
if not s:
continue
if "," in s:
parts = [p.strip() for p in s.split(",")]
expanded_urls.extend([p for p in parts if p])
else:
expanded_urls.append(s)
return expanded_urls
@staticmethod
def _append_urls_from_piped_result(raw_urls: List[str], result: Any) -> List[str]:
if raw_urls:
return raw_urls
if not result:
return raw_urls
results_to_check = result if isinstance(result, list) else [result]
for item in results_to_check:
try:
url = get_field(item, "url") or get_field(item, "target")
except Exception:
url = None
if url:
raw_urls.append(url)
return raw_urls
@staticmethod
def _filter_supported_urls(raw_urls: Sequence[str]) -> tuple[List[str], List[str]]:
supported = [url for url in (raw_urls or []) if is_url_supported_by_ytdlp(url)]
# Preserve original debug semantics: count unique unsupported URLs.
unsupported = list(set(raw_urls or []) - set(supported or []))
return supported, unsupported
    def _parse_query_keyed_spec(self, query_spec: Optional[str]) -> Dict[str, List[str]]:
if not query_spec:
return {}
try:
keyed = self._parse_keyed_csv_spec(str(query_spec), default_key="hash")
if not keyed:
return {}
# Normalize aliases so users can write shorter/alternate keys.
# Note: download-media uses a comma-separated keyed spec language inside -query.
def _alias(src: str, dest: str) -> None:
try:
values = keyed.get(src)
except Exception:
values = None
if not values:
return
try:
keyed.setdefault(dest, []).extend(list(values))
except Exception:
pass
try:
keyed.pop(src, None)
except Exception:
pass
for src in ("range", "ranges", "section", "sections"):
_alias(src, "clip")
for src in ("fmt", "f"):
_alias(src, "format")
for src in ("aud", "a"):
_alias(src, "audio")
return keyed
except Exception:
return {}
@staticmethod
    def _extract_hash_override(
        query_spec: Optional[str],
        query_keyed: Dict[str, List[str]],
    ) -> Optional[str]:
try:
            hash_values = query_keyed.get("hash", []) if isinstance(query_keyed, dict) else []
hash_candidate = hash_values[-1] if hash_values else None
if hash_candidate:
return sh.parse_single_hash_query(f"hash:{hash_candidate}")
# Backwards-compatible: treat a non-keyed query as a hash query.
# If the query uses keyed specs (e.g. format:, item:, clip:), do NOT attempt
# to interpret the whole string as a hash.
try:
has_non_hash_keys = bool(
query_keyed and isinstance(query_keyed,
dict)
and any(
k
for k in query_keyed.keys() if str(k).strip().lower() != "hash"
)
)
except Exception:
has_non_hash_keys = False
if has_non_hash_keys:
return None
return sh.parse_single_hash_query(str(query_spec)) if query_spec else None
except Exception:
return None
    def _parse_clip_ranges_and_apply_items(
        self,
        *,
        clip_spec: Optional[str],
        query_keyed: Dict[str, List[str]],
        parsed: Dict[str, Any],
        query_spec: Optional[str],
    ) -> tuple[Optional[List[tuple[int, int]]], bool, List[str]]:
clip_ranges: Optional[List[tuple[int, int]]] = None
clip_values: List[str] = []
item_values: List[str] = []
if clip_spec:
# Support keyed clip syntax:
# -query "clip:3m4s-3m14s,1h22m-1h33m,item:2-3"
keyed = self._parse_keyed_csv_spec(str(clip_spec), default_key="clip")
clip_values.extend(keyed.get("clip", []) or [])
item_values.extend(keyed.get("item", []) or [])
# Allow the same keyed spec language inside -query so users can do:
# download-media <url> -query "clip:1m-1m15s,2m1s-2m11s"
if query_keyed:
clip_values.extend(query_keyed.get("clip", []) or [])
item_values.extend(query_keyed.get("item", []) or [])
if item_values and not parsed.get("item"):
parsed["item"] = ",".join([v for v in item_values if v])
if clip_values:
clip_ranges = self._parse_time_ranges(
",".join([v for v in clip_values if v])
)
if not clip_ranges:
bad_spec = clip_spec or query_spec
log(f"Invalid clip format: {bad_spec}", file=sys.stderr)
return None, True, clip_values
return clip_ranges, False, clip_values
@staticmethod
def _init_storage(config: Dict[str, Any]) -> tuple[Optional[Any], bool]:
storage = None
hydrus_available = True
try:
from Store import Store
storage = Store(
config=config or {},
suppress_debug=True
)
from API.HydrusNetwork import is_hydrus_available
hydrus_available = bool(is_hydrus_available(config or {}))
except Exception:
storage = None
return storage, hydrus_available
@staticmethod
def _cookiefile_str(ytdlp_tool: YtDlpTool) -> Optional[str]:
try:
cookie_path = ytdlp_tool.resolve_cookiefile()
if cookie_path is not None and cookie_path.is_file():
return str(cookie_path)
except Exception:
pass
return None
    def _list_formats_cached(
        self,
        u: str,
        *,
        playlist_items_value: Optional[str],
        formats_cache: Dict[str, Optional[List[Dict[str, Any]]]],
        ytdlp_tool: YtDlpTool,
    ) -> Optional[List[Dict[str, Any]]]:
key = f"{u}||{playlist_items_value or ''}"
if key in formats_cache:
return formats_cache[key]
fmts = list_formats(
u,
no_playlist=False,
playlist_items=playlist_items_value,
cookiefile=self._cookiefile_str(ytdlp_tool),
)
formats_cache[key] = fmts
return fmts
def _is_browseable_format(self, fmt: Any) -> bool:
"""Return True for formats that are sensible to show in the format table."""
if not isinstance(fmt, dict):
return False
format_id = str(fmt.get("format_id") or "").strip()
if not format_id:
return False
ext = str(fmt.get("ext") or "").strip().lower()
if ext in {"mhtml",
"json"}:
return False
note = str(fmt.get("format_note") or "").lower()
if "storyboard" in note:
return False
if format_id.lower().startswith("sb"):
return False
vcodec = str(fmt.get("vcodec", "none"))
acodec = str(fmt.get("acodec", "none"))
# Keep anything with at least one stream.
return not (vcodec == "none" and acodec == "none")
def _format_id_for_query_index(
self,
query_format: str,
url: str,
formats_cache: Dict[str,
Optional[List[Dict[str,
Any]]]],
ytdlp_tool: YtDlpTool,
) -> Optional[str]:
"""Resolve a numeric 'format:N' query into an actual yt-dlp format selector.
Acceptable forms: '7', '#7', ' 7 ' (whitespace allowed). Uses the same
browseable filtering rules as the interactive table and selects the
1-based index. Returns a yt-dlp format string (possibly with +ba added
for video-only formats). Raises ValueError when the index is invalid or
formats cannot be listed.
"""
import re
if not query_format or not re.match(r"^\s*#?\d+\s*$", str(query_format)):
return None
try:
idx = int(str(query_format).lstrip("#").strip())
except Exception:
raise ValueError(f"Invalid format index: {query_format}")
fmts = self._list_formats_cached(
url,
playlist_items_value=None,
formats_cache=formats_cache,
ytdlp_tool=ytdlp_tool,
)
if not fmts:
raise ValueError(
"Unable to list formats for the URL; cannot resolve numeric format index"
)
candidate_formats = [f for f in fmts if self._is_browseable_format(f)]
filtered_formats = candidate_formats if candidate_formats else list(fmts)
if not filtered_formats:
raise ValueError("No formats available for selection")
if idx <= 0 or idx > len(filtered_formats):
raise ValueError(
f"Format index {idx} out of range (1..{len(filtered_formats)})"
)
chosen = filtered_formats[idx - 1]
selection_format_id = str(chosen.get("format_id") or "").strip()
if not selection_format_id:
raise ValueError("Selected format has no format_id")
try:
vcodec = str(chosen.get("vcodec", "none"))
acodec = str(chosen.get("acodec", "none"))
if vcodec != "none" and acodec == "none":
selection_format_id = f"{selection_format_id}+ba"
except Exception:
pass
return selection_format_id
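    # Illustrative: with browseable formats shown as rows 1..N in the format table,
    # a query of "format:7" resolves to the format_id of row 7 (with "+ba" appended
    # when that row is video-only), e.g. "137+ba".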
@staticmethod
def _format_selector_for_query_height(query_format: str) -> Optional[str]:
"""Translate a query value like '720p' into a yt-dlp -f selector.
Returns a selector that chooses the best video at or under the requested
height and always pairs it with audio.
Example: '640p' -> 'bv*[height<=640]+ba'
Notes:
- Only the '<digits>p' form is treated as a height cap to avoid
ambiguity with numeric format IDs and numeric index selection.
"""
import re
if query_format is None:
return None
s = str(query_format).strip().lower()
m = re.match(r"^(\d{2,5})p$", s)
if not m:
return None
try:
height = int(m.group(1))
except Exception:
return None
if height <= 0:
raise ValueError(f"Invalid height selection: {query_format}")
return f"bv*[height<={height}]+ba"
@staticmethod
def _canonicalize_url_for_storage(
*,
requested_url: str,
ytdlp_tool: YtDlpTool,
playlist_items: Optional[str]
) -> str:
2025-12-22 02:11:53 -08:00
# Prefer yt-dlp's canonical webpage URL (e.g. strips timestamps/redirects).
# Fall back to the requested URL if probing fails.
# Important: when playlist item selection is used, avoid probing (can hang on large playlists).
if playlist_items:
return str(requested_url)
try:
cf = None
try:
cookie_path = ytdlp_tool.resolve_cookiefile()
if cookie_path is not None and cookie_path.is_file():
cf = str(cookie_path)
except Exception:
cf = None
pr = probe_url(
requested_url,
no_playlist=False,
timeout_seconds=15,
cookiefile=cf
)
if isinstance(pr, dict):
for key in ("webpage_url", "original_url", "url", "requested_url"):
value = pr.get(key)
if isinstance(value, str) and value.strip():
return value.strip()
except Exception:
pass
return str(requested_url)
def _preflight_url_duplicate(
self,
*,
storage: Any,
hydrus_available: bool,
final_output_dir: Path,
candidate_url: str,
extra_urls: Optional[Sequence[str]] = None,
) -> bool:
# NOTE: download-media sets _quiet_background_output=True when running in a pipeline to
# reduce background noise. URL de-dup is interactive and must still run in pipelines.
if storage is None:
debug("Preflight URL check skipped: storage unavailable")
return True
debug(f"Preflight URL check: candidate={candidate_url}")
try:
from metadata import normalize_urls
except Exception:
normalize_urls = None # type: ignore[assignment]
needles: List[str] = []
if normalize_urls is not None:
for raw in [candidate_url, *(list(extra_urls) if extra_urls else [])]:
try:
needles.extend(normalize_urls(raw))
except Exception:
continue
# Fallback: always have at least one needle
if not needles:
needles = [str(candidate_url)]
# Deduplicate needles (preserve order)
seen_needles: List[str] = []
for needle in needles:
if needle and needle not in seen_needles:
seen_needles.append(needle)
needles = seen_needles
try:
debug(f"Preflight URL needles: {needles}")
except Exception:
pass
url_matches: List[Dict[str, Any]] = []
try:
from Store.HydrusNetwork import HydrusNetwork
# Avoid searching the temp/download directory backend during dedup.
# We only want to warn about duplicates in real stores.
backend_names_all = storage.list_searchable_backends()
backend_names: List[str] = []
skipped: List[str] = []
for backend_name in backend_names_all:
try:
backend = storage[backend_name]
except Exception:
continue
try:
if str(backend_name).strip().lower() == "temp":
skipped.append(backend_name)
continue
except Exception:
pass
# Heuristic: if a Folder backend points at the configured temp output dir, skip it.
try:
backend_location = getattr(backend, "_location", None)
if backend_location and final_output_dir:
backend_path = Path(str(backend_location)
).expanduser().resolve()
temp_path = Path(str(final_output_dir)).expanduser().resolve()
if backend_path == temp_path:
skipped.append(backend_name)
continue
except Exception:
pass
backend_names.append(backend_name)
try:
if skipped:
debug(
f"Preflight backends: {backend_names} (skipped temp: {skipped})"
)
else:
debug(f"Preflight backends: {backend_names}")
except Exception:
pass
2025-12-22 02:11:53 -08:00
for backend_name in backend_names:
backend = storage[backend_name]
if isinstance(backend, HydrusNetwork) and not hydrus_available:
continue
2025-12-22 02:11:53 -08:00
backend_hits: List[Dict[str, Any]] = []
for needle in needles:
try:
backend_hits = backend.search(f"url:{needle}", limit=25) or []
if backend_hits:
break
except Exception:
continue
if backend_hits:
url_matches.extend(
[
dict(x) if isinstance(x,
dict) else {
"title": str(x)
} for x in backend_hits
]
)
2025-12-22 02:11:53 -08:00
if len(url_matches) >= 25:
url_matches = url_matches[:25]
break
except Exception:
url_matches = []
2025-12-22 02:11:53 -08:00
if not url_matches:
debug("Preflight URL check: no matches")
return True
# If the user already answered the duplicate URL prompt for this pipeline/command,
# respect that decision and don't re-prompt for every item.
try:
current_cmd_text = pipeline_context.get_current_command_text("")
except Exception:
current_cmd_text = ""
try:
stage_ctx = pipeline_context.get_stage_context()
except Exception:
stage_ctx = None
in_pipeline = bool(
stage_ctx is not None or ("|" in str(current_cmd_text or ""))
)
if in_pipeline:
try:
2025-12-29 17:05:03 -08:00
cached_cmd = pipeline_context.load_value(
"preflight.url_duplicates.command",
default=""
2025-12-29 17:05:03 -08:00
)
cached_decision = pipeline_context.load_value(
"preflight.url_duplicates.continue",
default=None
2025-12-29 17:05:03 -08:00
)
2025-12-25 04:49:22 -08:00
except Exception:
cached_cmd = ""
cached_decision = None
if cached_decision is not None and str(cached_cmd or "") == str(
current_cmd_text or ""):
2025-12-25 04:49:22 -08:00
if bool(cached_decision):
return True
try:
2025-12-29 17:05:03 -08:00
pipeline_context.request_pipeline_stop(
reason="duplicate-url declined",
exit_code=0
2025-12-29 17:05:03 -08:00
)
2025-12-25 04:49:22 -08:00
except Exception:
pass
return False
2025-12-22 02:11:53 -08:00
table = ResultTable(f"URL already exists ({len(url_matches)} match(es))")
results_list: List[Dict[str, Any]] = []
for item in url_matches:
if "title" not in item:
item["title"] = (
item.get("name") or item.get("target") or item.get("path") or "Result"
)

# Keep the full payload for history/inspection, but display a focused table.
# Use shared extractors so Ext/Size/Store/Hash remain consistent everywhere.
try:
from result_table import build_display_row
except Exception:
build_display_row = None  # type: ignore

if callable(build_display_row):
display_row = build_display_row(
item, keys=["title", "store", "hash", "ext", "size"]
)
else:
display_row = {
"title": item.get("title"),
"store": item.get("store"),
"hash": item.get("hash") or item.get("file_hash") or item.get("sha256"),
"ext": str(item.get("ext") or ""),
"size": item.get("size") or item.get("size_bytes"),
}
table.add_result(display_row)
results_list.append(item)
pipeline_context.set_current_stage_table(table)
pipeline_context.set_last_result_table(table, results_list)
suspend = getattr(pipeline_context, "suspend_live_progress", None)
used_suspend = False
cm: AbstractContextManager[Any] = nullcontext()
if callable(suspend):
try:
maybe_cm = suspend()
if maybe_cm is not None:
cm = maybe_cm # type: ignore[assignment]
used_suspend = True
except Exception:
cm = nullcontext()
used_suspend = False
with cm:
get_stderr_console().print(table)
setattr(table, "_rendered_by_cmdlet", True)
answered_yes = bool(
Confirm.ask("Continue anyway?", default=False, console=get_stderr_console())
)
# Cache decision for the duration of this pipeline/command.
if in_pipeline:
try:
existing = pipeline_context.load_value("preflight", default=None)
except Exception:
existing = None
preflight_cache: Dict[str, Any] = existing if isinstance(existing, dict) else {}
url_dup_cache = preflight_cache.get("url_duplicates")
if not isinstance(url_dup_cache, dict):
url_dup_cache = {}
url_dup_cache["command"] = str(current_cmd_text or "")
url_dup_cache["continue"] = bool(answered_yes)
preflight_cache["url_duplicates"] = url_dup_cache
try:
pipeline_context.store_value("preflight", preflight_cache)
except Exception:
pass
if not answered_yes:
if in_pipeline and used_suspend:
try:
pipeline_context.request_pipeline_stop(
reason="duplicate-url declined", exit_code=0
)
except Exception:
pass
return False
return True
def _preflight_url_duplicates_bulk(
self,
*,
storage: Any,
hydrus_available: bool,
final_output_dir: Path,
urls: Sequence[str],
) -> bool:
"""Preflight URL de-dup for a batch of URLs.
Purpose:
- Avoid per-item interactive URL checks inside a playlist loop.
- Let the user see ALL duplicates up front, before any downloads start.
"""
if storage is None:
debug("Bulk URL preflight skipped: storage unavailable")
return True
# Honor any prior duplicate URL decision for this pipeline/command.
try:
current_cmd_text = pipeline_context.get_current_command_text("")
except Exception:
current_cmd_text = ""
try:
stage_ctx = pipeline_context.get_stage_context()
except Exception:
stage_ctx = None
in_pipeline = bool(
stage_ctx is not None or ("|" in str(current_cmd_text or ""))
)
if in_pipeline:
try:
cached_cmd = pipeline_context.load_value(
"preflight.url_duplicates.command", default=""
)
cached_decision = pipeline_context.load_value(
"preflight.url_duplicates.continue", default=None
)
except Exception:
cached_cmd = ""
cached_decision = None
if cached_decision is not None and str(cached_cmd or "") == str(current_cmd_text or ""):
if bool(cached_decision):
return True
try:
pipeline_context.request_pipeline_stop(
reason="duplicate-url declined", exit_code=0
)
except Exception:
pass
return False

unique_urls: List[str] = []
for u in urls or []:
s = str(u or "").strip()
if s and s not in unique_urls:
unique_urls.append(s)
if len(unique_urls) <= 1:
return True
try:
from metadata import normalize_urls
except Exception:
normalize_urls = None # type: ignore[assignment]
def _httpish(value: str) -> bool:
try:
return bool(value) and (
value.startswith("http://") or value.startswith("https://")
)
except Exception:
return False

url_needles: Dict[str, List[str]] = {}
for u in unique_urls:
needles: List[str] = []
if normalize_urls is not None:
try:
needles.extend(
[n for n in (normalize_urls(u) or []) if isinstance(n, str)]
)
except Exception:
needles = []
if not needles:
needles = [u]
# Prefer http(s) needles for store lookups.
filtered: List[str] = []
for n in needles:
n2 = str(n or "").strip()
if not n2:
continue
if not _httpish(n2):
continue
if n2 not in filtered:
filtered.append(n2)
url_needles[u] = filtered if filtered else [u]
# Determine backends once (same filtering as per-URL preflight).
backend_names: List[str] = []
try:
backend_names_all = storage.list_searchable_backends()
except Exception:
backend_names_all = []
for backend_name in backend_names_all:
try:
backend = storage[backend_name]
except Exception:
continue
try:
if str(backend_name).strip().lower() == "temp":
continue
except Exception:
pass
try:
backend_location = getattr(backend, "_location", None)
if backend_location and final_output_dir:
backend_path = Path(str(backend_location)).expanduser().resolve()
temp_path = Path(str(final_output_dir)).expanduser().resolve()
if backend_path == temp_path:
continue
except Exception:
pass
backend_names.append(backend_name)
if not backend_names:
debug("Bulk URL preflight skipped: no searchable backends")
return True
# Collect matches as display rows (cap to keep output reasonable)
seen_pairs: set[tuple[str, str]] = set()
matched_urls: set[str] = set()
match_rows: List[Dict[str, Any]] = []
max_rows = 200
try:
from Store.HydrusNetwork import HydrusNetwork
except Exception:
HydrusNetwork = None # type: ignore
for backend_name in backend_names:
if len(match_rows) >= max_rows:
break
try:
backend = storage[backend_name]
except Exception:
continue
if HydrusNetwork is not None and isinstance(backend, HydrusNetwork):
if not hydrus_available:
continue
client = getattr(backend, "_client", None)
if client is None:
continue
for original_url, needles in url_needles.items():
if len(match_rows) >= max_rows:
break
if (original_url, str(backend_name)) in seen_pairs:
continue
# Fast-path: ask Hydrus whether it already knows this URL.
found_hash: Optional[str] = None
found = False
for needle in (needles or [])[:3]:
if not _httpish(needle):
continue
try:
from API.HydrusNetwork import HydrusRequestSpec

spec = HydrusRequestSpec(
method="GET",
endpoint="/add_urls/get_url_files",
query={"url": needle},
)
response = client._perform_request(spec)  # type: ignore[attr-defined]
raw_hashes = None
if isinstance(response, dict):
raw_hashes = response.get("hashes") or response.get("file_hashes")
raw_ids = response.get("file_ids")
has_ids = isinstance(raw_ids, list) and len(raw_ids) > 0
has_hashes = isinstance(raw_hashes, list) and len(raw_hashes) > 0
if has_hashes:
try:
found_hash = str(raw_hashes[0]).strip()  # type: ignore[index]
except Exception:
found_hash = None
if has_ids or has_hashes:
found = True
break
except Exception:
continue
if not found:
continue
seen_pairs.add((original_url, str(backend_name)))
matched_urls.add(original_url)
display_row = {
"title": "(exists)",
"store": str(backend_name),
"hash": found_hash or "",
"url": original_url,
"columns": [
("Title", "(exists)"),
("Store", str(backend_name)),
("Hash", found_hash or ""),
("URL", original_url),
],
}
match_rows.append(display_row)
continue

# Generic backends: use the existing search() contract.
for original_url, needles in url_needles.items():
if len(match_rows) >= max_rows:
break
if (original_url, str(backend_name)) in seen_pairs:
continue
backend_hits: List[Dict[str, Any]] = []
for needle in (needles or [])[:3]:
try:
backend_hits = backend.search(f"url:{needle}", limit=1) or []
if backend_hits:
break
except Exception:
continue

if not backend_hits:
continue

seen_pairs.add((original_url, str(backend_name)))
matched_urls.add(original_url)
hit = backend_hits[0]
title = (
hit.get("title") or hit.get("name") or hit.get("target")
or hit.get("path") or "(exists)"
)
file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or ""
try:
from result_table import build_display_row
except Exception:
build_display_row = None  # type: ignore
extracted = {
"title": str(title),
"store": str(hit.get("store") or backend_name),
"hash": str(file_hash or ""),
"ext": "",
"size": None,
}
if callable(build_display_row):
try:
extracted = build_display_row(
hit, keys=["title", "store", "hash", "ext", "size"]
)
except Exception:
pass
# Ensure we still prefer the precomputed values for title/store/hash.
extracted["title"] = str(title)
extracted["store"] = str(hit.get("store") or backend_name)
extracted["hash"] = str(file_hash or "")
ext = extracted.get("ext")
size_val = extracted.get("size")
display_row = {
"title": str(title),
"store": str(hit.get("store") or backend_name),
"hash": str(file_hash or ""),
"ext": str(ext or ""),
"size": size_val,
"url": original_url,
"columns": [
("Title", str(title)),
("Store", str(hit.get("store") or backend_name)),
("Hash", str(file_hash or "")),
("Ext", str(ext or "")),
("Size", size_val),
("URL", original_url),
],
}
match_rows.append(display_row)

if not match_rows:
debug("Bulk URL preflight: no matches")
return True

# This table is non-interactive and intentionally wide (we want URL + ext/size).
table = ResultTable(
f"URL already exists ({len(matched_urls)} url(s))",
max_columns=10
)
table.set_no_choice(True)
try:
table.set_preserve_order(True)
except Exception:
pass
for row in match_rows:
table.add_result(row)
# Display as an overlay so we don't clobber the current selectable table/history.
try:
pipeline_context.set_last_result_table_overlay(table, match_rows)
except Exception:
pass
_print_table_suspended(table)
setattr(table, "_rendered_by_cmdlet", True)
suspend = getattr(pipeline_context, "suspend_live_progress", None)
cm: AbstractContextManager[Any] = nullcontext()
if callable(suspend):
try:
maybe_cm = suspend()
if maybe_cm is not None:
cm = maybe_cm # type: ignore[assignment]
except Exception:
cm = nullcontext()
with cm:
answered_yes = bool(
Confirm.ask("Continue anyway?", default=False, console=get_stderr_console())
)
# Cache decision for the duration of this pipeline/command.
if in_pipeline:
try:
existing = pipeline_context.load_value("preflight", default=None)
except Exception:
existing = None
preflight_cache: Dict[str, Any] = existing if isinstance(existing, dict) else {}
url_dup_cache = preflight_cache.get("url_duplicates")
if not isinstance(url_dup_cache, dict):
url_dup_cache = {}
url_dup_cache["command"] = str(current_cmd_text or "")
url_dup_cache["continue"] = bool(answered_yes)
preflight_cache["url_duplicates"] = url_dup_cache
try:
pipeline_context.store_value("preflight", preflight_cache)
except Exception:
pass
if not answered_yes:
if in_pipeline:
try:
pipeline_context.request_pipeline_stop(
reason="duplicate-url declined", exit_code=0
)
except Exception:
pass
return False
return True
def _maybe_show_playlist_table(self, *, url: str, ytdlp_tool: YtDlpTool) -> bool:
"""Show a normal selectable playlist table when URL yields multiple entries."""
try:
cf = self._cookiefile_str(ytdlp_tool)
pr = probe_url(url, no_playlist=False, timeout_seconds=15, cookiefile=cf)
except Exception:
pr = None
if not isinstance(pr, dict):
return False
entries = pr.get("entries")
if not isinstance(entries, list) or len(entries) <= 1:
return False
# Identify a stable table type so `@* | ...` pipelines can auto-insert the
# appropriate downloader stage (e.g., Bandcamp selections should insert
# `download-media` before `merge-file`).
extractor_name = ""
try:
extractor_name = (
str(pr.get("extractor") or pr.get("extractor_key") or "").strip().lower()
)
except Exception:
extractor_name = ""
table_type: Optional[str] = None
if "bandcamp" in extractor_name:
table_type = "bandcamp"
elif "youtube" in extractor_name:
table_type = "youtube"
# Display table (limit rows to keep output reasonable)
max_rows = 200
display_entries = entries[:max_rows]
def _entry_to_url(entry: Any) -> Optional[str]:
if not isinstance(entry, dict):
return None
# Prefer explicit absolute URLs when present
for key in ("webpage_url", "original_url", "url"):
v = entry.get(key)
if isinstance(v, str) and v.strip():
s = v.strip()
try:
if urlparse(s).scheme in {"http", "https"}:
return s
except Exception:
return s
# Best-effort YouTube fallback from id
entry_id = entry.get("id")
if isinstance(entry_id, str) and entry_id.strip():
extractor_name = str(pr.get("extractor") or pr.get("extractor_key") or "").lower()
if "youtube" in extractor_name:
return f"https://www.youtube.com/watch?v={entry_id.strip()}"
return None
table = ResultTable()
safe_url = str(url or "").strip()
table.title = f'download-media -url "{safe_url}"' if safe_url else "download-media"
if table_type:
try:
table.set_table(table_type)
except Exception:
table.table = table_type
table.set_source_command("download-media", [])
try:
table.set_preserve_order(True)
except Exception:
pass
results_list: List[Dict[str, Any]] = []
for idx, entry in enumerate(display_entries, 1):
title = None
uploader = None
duration = None
entry_url = _entry_to_url(entry)
try:
if isinstance(entry, dict):
title = entry.get("title")
uploader = entry.get("uploader") or pr.get("uploader")
duration = entry.get("duration")
except Exception:
pass
row: Dict[str, Any] = {
"table": "download-media",
"title": str(title or f"Item {idx}"),
"detail": str(uploader or ""),
"media_kind": "playlist-item",
"playlist_index": idx,
"_selection_args": (
["-url", str(entry_url)]
if entry_url else ["-url", str(url), "-item", str(idx)]
),
"url": entry_url,
"target": entry_url,
"columns": [
("#", str(idx)),
("Title", str(title or "")),
("Duration", str(duration or "")),
("Uploader", str(uploader or "")),
],
}
results_list.append(row)
table.add_result(row)
pipeline_context.set_current_stage_table(table)
pipeline_context.set_last_result_table(table, results_list)
_print_table_suspended(table)
setattr(table, "_rendered_by_cmdlet", True)
return True
def _maybe_show_format_table_for_single_url(
self,
*,
mode: str,
clip_spec: Any,
clip_values: Sequence[str],
playlist_items: Optional[str],
ytdl_format: Any,
supported_url: Sequence[str],
playlist_selection_handled: bool,
ytdlp_tool: YtDlpTool,
formats_cache: Dict[str, Optional[List[Dict[str, Any]]]],
storage: Any,
hydrus_available: bool,
final_output_dir: Path,
args: Sequence[str],
) -> Optional[int]:
# If no -item, no explicit -format specified, and single URL, show the format table.
# Do NOT stop to show formats when -audio is used (auto-pick) or when clip ranges are requested via -query.
if (mode != "audio" and not clip_spec and not clip_values and not playlist_items
and not ytdl_format and len(supported_url) == 1
and not playlist_selection_handled):
url = supported_url[0]
canonical_url = self._canonicalize_url_for_storage(
requested_url=url,
ytdlp_tool=ytdlp_tool,
playlist_items=playlist_items,
)
if not self._preflight_url_duplicate(
storage=storage,
hydrus_available=hydrus_available,
final_output_dir=final_output_dir,
candidate_url=canonical_url,
extra_urls=[url],
):
log(f"Skipping download: {url}", file=sys.stderr)
return 0
formats = self._list_formats_cached(
url,
playlist_items_value=None,
formats_cache=formats_cache,
ytdlp_tool=ytdlp_tool,
)
if formats and len(formats) > 1:
# Formatlist filtering
#
# Goal:
# - Keep the list useful (hide non-media entries like storyboards)
# - But NEVER filter down so far that the user can't browse/pick formats.
def _is_browseable_format(fmt: Any) -> bool:
if not isinstance(fmt, dict):
return False
format_id = str(fmt.get("format_id") or "").strip()
if not format_id:
return False
ext = str(fmt.get("ext") or "").strip().lower()
if ext in {"mhtml",
"json"}:
2025-12-22 02:11:53 -08:00
return False
note = str(fmt.get("format_note") or "").lower()
if "storyboard" in note:
return False
if format_id.lower().startswith("sb"):
return False
vcodec = str(fmt.get("vcodec", "none"))
acodec = str(fmt.get("acodec", "none"))
# Keep anything with at least one stream.
return not (vcodec == "none" and acodec == "none")
candidate_formats = [f for f in formats if _is_browseable_format(f)]
filtered_formats = candidate_formats if candidate_formats else list(formats)
debug(
f"Formatlist: showing {len(filtered_formats)} formats (raw={len(formats)})"
)
# Build the base command that will be replayed with @N selection
base_cmd = f'download-media "{url}"'
remaining_args = [
arg for arg in args if arg not in [url] and not arg.startswith("-")
]
if remaining_args:
base_cmd += " " + " ".join(remaining_args)
# Create result table for display
table = ResultTable(
title=f"Available formats for {url}", max_columns=10, preserve_order=True
)
table.set_table("ytdlp.formatlist")
table.set_source_command("download-media", [url])
results_list: List[Dict[str, Any]] = []
for idx, fmt in enumerate(filtered_formats, 1):
resolution = fmt.get("resolution", "")
ext = fmt.get("ext", "")
vcodec = fmt.get("vcodec", "none")
acodec = fmt.get("acodec", "none")
filesize = fmt.get("filesize")
filesize_approx = fmt.get("filesize_approx")
format_id = fmt.get("format_id", "")
# If the chosen format is video-only (no audio stream), automatically
# request best audio too so the resulting file has sound.
selection_format_id = format_id
try:
if vcodec != "none" and acodec == "none" and format_id:
selection_format_id = f"{format_id}+ba"
except Exception:
selection_format_id = format_id

size_str = ""
size_prefix = ""
size_bytes = filesize
if not size_bytes:
size_bytes = filesize_approx
if size_bytes:
size_prefix = "~"
try:
if isinstance(size_bytes, (int, float)) and size_bytes > 0:
size_mb = float(size_bytes) / (1024 * 1024)
size_str = f"{size_prefix}{size_mb:.1f}MB"
except Exception:
size_str = ""

desc_parts: List[str] = []
if resolution and resolution != "audio only":
desc_parts.append(resolution)
if ext:
desc_parts.append(str(ext).upper())
if vcodec != "none":
desc_parts.append(f"v:{vcodec}")
if acodec != "none":
desc_parts.append(f"a:{acodec}")
if size_str:
desc_parts.append(size_str)
format_desc = " | ".join(desc_parts)
format_dict = {
"table": "download-media",
"title": f"Format {format_id}",
"url": url,
"target": url,
"detail": format_desc,
"annotations": [ext, resolution] if resolution else [ext],
"media_kind": "format",
"cmd": base_cmd,
"columns": [
("ID", format_id),
("Resolution", resolution or "N/A"),
("Ext", ext),
("Size", size_str or ""),
("Video", vcodec),
("Audio", acodec),
],
"full_metadata": {
"format_id": format_id,
"url": url,
"item_selector": selection_format_id,
},
"_selection_args": None,
}

selection_args: List[str] = ["-format", selection_format_id]
try:
if (not clip_spec) and clip_values:
selection_args.extend(
["-query", f"clip:{','.join([v for v in clip_values if v])}"]
)
except Exception:
pass
format_dict["_selection_args"] = selection_args
results_list.append(format_dict)
table.add_result(format_dict)
try:
_print_table_suspended(table)
setattr(table, "_rendered_by_cmdlet", True)
except Exception:
pass
pipeline_context.set_current_stage_table(table)
pipeline_context.set_last_result_table(table, results_list)
log(f"", file=sys.stderr)
return 0
return None
def _download_supported_urls(
self,
*,
supported_url: Sequence[str],
ytdlp_tool: YtDlpTool,
args: Sequence[str],
config: Dict[str, Any],
final_output_dir: Path,
mode: str,
clip_spec: Any,
clip_ranges: Optional[List[tuple[int, int]]],
query_hash_override: Optional[str],
embed_chapters: bool,
write_sub: bool,
quiet_mode: bool,
playlist_items: Optional[str],
ytdl_format: Any,
skip_per_url_preflight: bool,
forced_single_format_id: Optional[str],
forced_single_format_for_batch: bool,
formats_cache: Dict[str, Optional[List[Dict[str, Any]]]],
storage: Any,
hydrus_available: bool,
) -> int:
downloaded_count = 0
downloaded_pipe_objects: List[Dict[str, Any]] = []
pipe_seq = 0
clip_sections_spec = self._build_clip_sections_spec(clip_ranges)
if clip_sections_spec:
try:
debug(f"Clip sections spec: {clip_sections_spec}")
except Exception:
pass
for url in supported_url:
try:
debug(f"Processing: {url}")
canonical_url = self._canonicalize_url_for_storage(
requested_url=url,
ytdlp_tool=ytdlp_tool,
playlist_items=playlist_items,
)
if not skip_per_url_preflight:
if not self._preflight_url_duplicate(
storage=storage,
hydrus_available=hydrus_available,
final_output_dir=final_output_dir,
candidate_url=canonical_url,
extra_urls=[url],
):
log(f"Skipping download: {url}", file=sys.stderr)
continue
PipelineProgress(pipeline_context).begin_steps(2)
actual_format = ytdl_format
actual_playlist_items = playlist_items
if playlist_items and not ytdl_format:
import re
if re.search(r"[^0-9,-]", playlist_items):
actual_format = playlist_items
actual_playlist_items = None
if mode == "audio" and not actual_format:
actual_format = "bestaudio"
if mode == "video" and not actual_format:
configured = (ytdlp_tool.default_format("video") or "").strip()
if configured and configured != "bestvideo+bestaudio/best":
actual_format = configured
forced_single_applied = False
if (forced_single_format_for_batch and forced_single_format_id
and not ytdl_format and not actual_playlist_items):
actual_format = forced_single_format_id
forced_single_applied = True
if (actual_format and isinstance(actual_format, str) and mode != "audio"
and "+" not in actual_format and "/" not in actual_format
and "[" not in actual_format
and actual_format not in {"best", "bv", "ba", "b"}
and not forced_single_applied):
try:
formats = self._list_formats_cached(
url,
playlist_items_value=actual_playlist_items,
formats_cache=formats_cache,
ytdlp_tool=ytdlp_tool,
)
if formats:
fmt_match = next(
(f for f in formats if str(f.get("format_id", "")) == actual_format),
None,
)
if fmt_match:
vcodec = str(fmt_match.get("vcodec", "none"))
acodec = str(fmt_match.get("acodec", "none"))
if vcodec != "none" and acodec == "none":
debug(
f"Selected video-only format {actual_format}; using {actual_format}+ba for audio"
)
actual_format = f"{actual_format}+ba"
except Exception:
pass
attempted_single_format_fallback = False
while True:
try:
opts = DownloadOptions(
url=url,
mode=mode,
output_dir=final_output_dir,
ytdl_format=actual_format,
cookies_path=ytdlp_tool.resolve_cookiefile(),
clip_sections=clip_sections_spec,
playlist_items=actual_playlist_items,
quiet=quiet_mode,
no_playlist=False,
embed_chapters=embed_chapters,
write_sub=write_sub,
)
PipelineProgress(pipeline_context).step("downloading")
debug(f"Starting download with 5-minute timeout...")
result_obj = _download_with_timeout(opts, timeout_seconds=300)
debug(f"Download completed, building pipe object...")
break
except DownloadError as e:
cause = getattr(e, "__cause__", None)
detail = ""
try:
detail = str(cause or "")
except Exception:
detail = ""

if ("requested format is not available"
in (detail or "").lower()) and mode != "audio":
if (forced_single_format_for_batch
and forced_single_format_id and not ytdl_format
and not actual_playlist_items
and not attempted_single_format_fallback):
attempted_single_format_fallback = True
actual_format = forced_single_format_id
debug(
f"Only one format available (playlist preflight); retrying with: {actual_format}"
)
continue
formats = self._list_formats_cached(
url,
playlist_items_value=actual_playlist_items,
formats_cache=formats_cache,
ytdlp_tool=ytdlp_tool,
)
if ((not attempted_single_format_fallback)
and isinstance(formats, list) and len(formats) == 1
and isinstance(formats[0], dict)):
only = formats[0]
fallback_format = str(only.get("format_id")
or "").strip()
2025-12-22 02:11:53 -08:00
selection_format_id = fallback_format
try:
vcodec = str(only.get("vcodec", "none"))
acodec = str(only.get("acodec", "none"))
if vcodec != "none" and acodec == "none" and fallback_format:
selection_format_id = f"{fallback_format}+ba"
except Exception:
selection_format_id = fallback_format
if selection_format_id:
attempted_single_format_fallback = True
actual_format = selection_format_id
debug(
f"Only one format available; retrying with: {actual_format}"
)
continue
if formats:
formats_to_show = formats
table = ResultTable(
title=f"Available formats for {url}", max_columns=10, preserve_order=True,
)
table.set_table("ytdlp.formatlist")
table.set_source_command("download-media", [url])
results_list: List[Dict[str, Any]] = []
for idx, fmt in enumerate(formats_to_show, 1):
resolution = fmt.get("resolution", "")
ext = fmt.get("ext", "")
vcodec = fmt.get("vcodec", "none")
acodec = fmt.get("acodec", "none")
filesize = fmt.get("filesize")
filesize_approx = fmt.get("filesize_approx")
format_id = fmt.get("format_id", "")
selection_format_id = format_id
try:
if vcodec != "none" and acodec == "none" and format_id:
selection_format_id = f"{format_id}+ba"
except Exception:
selection_format_id = format_id
size_str = ""
size_prefix = ""
size_bytes = filesize
if not size_bytes:
size_bytes = filesize_approx
if size_bytes:
size_prefix = "~"
try:
if isinstance(size_bytes, (int, float)) and size_bytes > 0:
size_mb = float(size_bytes) / (1024 * 1024)
size_str = f"{size_prefix}{size_mb:.1f}MB"
except Exception:
size_str = ""

desc_parts: List[str] = []
if resolution and resolution != "audio only":
desc_parts.append(str(resolution))
if ext:
desc_parts.append(str(ext).upper())
if vcodec != "none":
desc_parts.append(f"v:{vcodec}")
if acodec != "none":
desc_parts.append(f"a:{acodec}")
if size_str:
desc_parts.append(size_str)
format_desc = " | ".join(desc_parts)
format_dict: Dict[str, Any] = {
"table": "download-media",
"title": f"Format {format_id}",
"url": url,
"target": url,
"detail": format_desc,
"media_kind": "format",
"columns": [
("ID", format_id),
("Resolution", resolution or "N/A"),
("Ext", ext),
("Size", size_str or ""),
("Video", vcodec),
("Audio", acodec),
],
"full_metadata": {
"format_id": format_id,
"url": url,
"item_selector": selection_format_id,
},
"_selection_args": ["-format", selection_format_id],
}

results_list.append(format_dict)
table.add_result(format_dict)
2025-12-11 12:47:30 -08:00
2025-12-22 02:11:53 -08:00
pipeline_context.set_current_stage_table(table)
pipeline_context.set_last_result_table(table, results_list)

try:
_print_table_suspended(table)
setattr(table, "_rendered_by_cmdlet", True)
except Exception:
pass
2025-12-22 02:11:53 -08:00
PipelineProgress(pipeline_context).step("awaiting selection")
2025-12-29 17:05:03 -08:00
log(
"Requested format is not available; select a working format with @N",
file=sys.stderr,
)
2025-12-22 02:11:53 -08:00
return 0
raise
results_to_emit: List[Any] = []
if isinstance(result_obj, list):
results_to_emit = list(result_obj)
else:
paths = getattr(result_obj, "paths", None)
if isinstance(paths, list) and paths:
for p in paths:
try:
p_path = Path(p)
except Exception:
continue
try:
if p_path.suffix.lower() in _SUBTITLE_EXTS:
continue
except Exception:
pass
if not p_path.exists() or p_path.is_dir():
continue
try:
hv = sha256_file(p_path)
except Exception:
hv = None
results_to_emit.append(
DownloadMediaResult(
path=p_path,
info=getattr(result_obj, "info", {}) or {},
tag=list(getattr(result_obj, "tag", []) or []),
source_url=getattr(result_obj, "source_url", None) or opts.url,
hash_value=hv,
)
)
else:
results_to_emit = [result_obj]
pipe_objects: List[Dict[str, Any]] = []
for downloaded in results_to_emit:
po = self._build_pipe_object(downloaded, url, opts)
pipe_seq += 1
try:
po.setdefault("pipe_index", pipe_seq)
except Exception:
pass
try:
info = (
downloaded.info if isinstance(getattr(downloaded, "info", None), dict) else {}
)
except Exception:
info = {}
chapters_text = _format_chapters_note(info) if embed_chapters else None
if chapters_text:
notes = po.get("notes")
if not isinstance(notes, dict):
notes = {}
notes.setdefault("chapters", chapters_text)
po["notes"] = notes
if write_sub:
try:
media_path = Path(str(po.get("path") or ""))
except Exception:
media_path = None
if media_path is not None and media_path.exists() and media_path.is_file():
sub_path = _best_subtitle_sidecar(media_path)
if sub_path is not None:
sub_text = _read_text_file(sub_path)
if sub_text:
notes = po.get("notes")
if not isinstance(notes, dict):
notes = {}
notes["sub"] = sub_text
po["notes"] = notes
try:
sub_path.unlink()
except Exception:
pass
pipe_objects.append(po)
try:
if clip_ranges and len(pipe_objects) == len(clip_ranges):
source_hash = query_hash_override or self._find_existing_hash_for_url(
storage,
canonical_url,
hydrus_available=hydrus_available,
)
self._apply_clip_decorations(
pipe_objects, clip_ranges, source_king_hash=source_hash
)
except Exception:
pass
debug(f"Emitting {len(pipe_objects)} result(s) to pipeline...")
PipelineProgress(pipeline_context).step("finalized")
stage_ctx = pipeline_context.get_stage_context()
# Emit per downloaded item whenever we're running under the pipeline runner.
# Live progress advances on emit(), and suppressing emits for the last stage
# causes the pipe bar to stay at 0% even while downloads complete.
emit_enabled = bool(stage_ctx is not None)
for pipe_obj_dict in pipe_objects:
if emit_enabled:
pipeline_context.emit(pipe_obj_dict)
if pipe_obj_dict.get("url"):
pipe_obj = coerce_to_pipe_object(pipe_obj_dict)
register_url_with_local_library(pipe_obj, config)
try:
downloaded_pipe_objects.append(pipe_obj_dict)
except Exception:
pass

downloaded_count += len(pipe_objects)
debug("✓ Downloaded and emitted")
except DownloadError as e:
log(f"Download failed for {url}: {e}", file=sys.stderr)
except Exception as e:
log(f"Error processing {url}: {e}", file=sys.stderr)
if downloaded_count > 0:
debug(f"✓ Successfully processed {downloaded_count} URL(s)")
return 0
log("No downloads completed", file=sys.stderr)
return 1
def _run_impl(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"""Main download implementation for yt-dlp-supported url."""
try:
debug("Starting download-media")
2025-12-20 23:57:44 -08:00
2025-12-22 02:11:53 -08:00
ytdlp_tool = YtDlpTool(config)
2025-12-11 12:47:30 -08:00
2025-12-22 02:11:53 -08:00
# Parse arguments
parsed = parse_cmdlet_args(args, self)
2025-12-11 12:47:30 -08:00
2025-12-22 02:11:53 -08:00
raw_url = self._normalize_urls(parsed)
raw_url = self._append_urls_from_piped_result(raw_url, result)
2025-12-20 23:57:44 -08:00
2025-12-22 02:11:53 -08:00
supported_url, unsupported_list = self._filter_supported_urls(raw_url)
2025-12-11 12:47:30 -08:00
2025-12-22 02:11:53 -08:00
if not supported_url:
log("No yt-dlp-supported url to download", file=sys.stderr)
return 1
if unsupported_list:
debug(
f"Skipping {len(unsupported_list)} unsupported URL(s) (use download-file for direct downloads)"
)
# Get output directory
final_output_dir = self._resolve_output_dir(parsed, config)
if not final_output_dir:
return 1
debug(f"Output directory: {final_output_dir}")
# Get other options
clip_spec = parsed.get("clip")
query_spec = parsed.get("query")
# download-media supports a small keyed spec language inside -query.
# Examples:
# -query "hash:<sha256>"
# -query "clip:1m-1m15s,2m1s-2m11s"
# -query "hash:<sha256>,clip:1m-1m15s,item:2-3"
# -query "format:audio,item:1-3" (audio-only + playlist selection)
query_keyed = self._parse_query_keyed_spec(
str(query_spec) if query_spec is not None else None
)
2025-12-22 02:11:53 -08:00
# Optional: allow an explicit hash via -query "hash:<sha256>".
# This is used as the preferred king hash for multi-clip relationships.
query_hash_override = self._extract_hash_override(
str(query_spec) if query_spec is not None else None,
query_keyed,
)
# Always enable chapters + subtitles so downstream pipes (e.g. mpv) can consume them.
embed_chapters = True
write_sub = True
# QueryArgs:
# - format:audio => audio-only (highest quality audio)
# - format:<ytdlp-format> => equivalent to -format <ytdlp-format>
2025-12-29 17:05:03 -08:00
# - audio:true|false => audio-only mode toggle (preferred over format:audio when provided)
2025-12-23 16:36:39 -08:00
query_format: Optional[str] = None
try:
fmt_values = query_keyed.get("format",
[]) if isinstance(query_keyed,
dict) else []
2025-12-23 16:36:39 -08:00
fmt_candidate = fmt_values[-1] if fmt_values else None
if fmt_candidate is not None:
query_format = str(fmt_candidate).strip()
except Exception:
query_format = None
query_audio: Optional[bool] = None
try:
audio_values = query_keyed.get("audio",
[]) if isinstance(query_keyed,
dict) else []
2025-12-29 17:05:03 -08:00
audio_candidate = audio_values[-1] if audio_values else None
if audio_candidate is not None:
s = str(audio_candidate).strip().lower()
if s in {"1",
"true",
"t",
"yes",
"y",
"on"}:
2025-12-29 17:05:03 -08:00
query_audio = True
elif s in {"0",
"false",
"f",
"no",
"n",
"off"}:
2025-12-29 17:05:03 -08:00
query_audio = False
elif s:
# Any other non-empty value implies "on".
query_audio = True
except Exception:
query_audio = None
query_wants_audio = False
if query_format:
try:
query_wants_audio = str(query_format).strip().lower() == "audio"
except Exception:
query_wants_audio = False
# Explicit CLI flag wins; else query audio: can select audio mode.
# Back-compat: format:audio is still supported.
audio_flag = bool(parsed.get("audio") is True)
wants_audio = audio_flag
if query_audio is not None:
wants_audio = wants_audio or bool(query_audio)
else:
wants_audio = wants_audio or bool(query_wants_audio)
mode = "audio" if wants_audio else "video"
clip_ranges, clip_invalid, clip_values = self._parse_clip_ranges_and_apply_items(
clip_spec=str(clip_spec) if clip_spec is not None else None,
query_keyed=query_keyed,
parsed=parsed,
query_spec=str(query_spec) if query_spec is not None else None,
)
if clip_invalid:
return 1
if clip_ranges:
try:
debug(f"Clip ranges: {clip_ranges}")
except Exception:
pass
quiet_mode = (
bool(config.get("_quiet_background_output")) if isinstance(config, dict) else False
)

storage, hydrus_available = self._init_storage(
config if isinstance(config, dict) else {}
)
# Check if we need to show format selection
formats_cache: Dict[str, Optional[List[Dict[str, Any]]]] = {}
playlist_items = str(parsed.get("item")) if parsed.get("item") else None
ytdl_format = parsed.get("format")
# If user didn't pass -format, allow -query "format:<...>" to provide it.
# Supported query forms:
# - format:audio => audio-only mode (handled above)
# - format:720p => pick best video <= 720p and always include audio
# - format:<ytdlp -f> => treated as a raw yt-dlp selector (non-numeric)
# - format:<N> => treated as a 1-based index into the shown format list (resolved below)
if not ytdl_format and query_format and not query_wants_audio:
try:
height_selector = self._format_selector_for_query_height(
query_format
)
except ValueError as e:
log(f"Error parsing format selection: {e}", file=sys.stderr)
return 1
if height_selector:
ytdl_format = height_selector
else:
import re
# Preserve numeric index selection and avoid ambiguity with numeric format IDs.
if not re.match(r"^\s*#?\d+\s*$", str(query_format)):
ytdl_format = query_format
playlist_selection_handled = False
# Playlist/multi-entry detection: if the URL has multiple items and the user didn't
# specify -item or -format, show a normal selectable table and return.
if len(supported_url) == 1 and not playlist_items and not ytdl_format:
candidate_url = supported_url[0]
# Support numeric index selection via -query "format:<N>" where N is 1-based index
# into the filtered format list (e.g., -query "format:7" selects the 7th listed format).
# This allows non-interactive invocation from shells (PowerShell treats '@' specially).
if query_format and not query_wants_audio:
try:
idx_fmt = self._format_id_for_query_index(
query_format, candidate_url, formats_cache, ytdlp_tool
)
except ValueError as e:
log(f"Error parsing format selection: {e}", file=sys.stderr)
return 1
if idx_fmt:
debug(
f"Resolved numeric format selection '{query_format}' -> {idx_fmt}"
)
ytdl_format = idx_fmt
if not ytdl_format:
if self._maybe_show_playlist_table(url=candidate_url, ytdlp_tool=ytdlp_tool):
playlist_selection_handled = True
# Let the user pick items using the normal REPL prompt:
# @* | download-media ...
# If we printed a format table, give a quick hint for non-interactive selection.
try:
last_table = (
pipeline_context.get_last_result_table()
if hasattr(pipeline_context, "get_last_result_table") else None
)
if hasattr(last_table, "rows") and getattr(last_table, "rows", None):
# Build user-friendly examples from the selected URL and the first row of the table
sample_index = 1
sample_fmt_id = None
try:
sample_row = last_table.rows[0]
sample_fmt_id = (
sample_row._full_metadata.get("item_selector")
if getattr(sample_row, "_full_metadata", None) else None
)
except Exception:
sample_fmt_id = None
try:
# Use single quotes around the URL so PowerShell doesn't interpret the pipe character
base_cmd = f'download-media "{candidate_url}"'
sample_pipeline = base_cmd.replace(
f'"{candidate_url}"', f"'{candidate_url}'"
)
hint = (
"To select non-interactively, re-run with an explicit format: "
'e.g. mm "{pipeline} -format {fmt} | add-file -store <store>" or '
"mm \"{pipeline} -query 'format:{index}' | add-file -store <store>\""
).format(
pipeline=sample_pipeline,
fmt=sample_fmt_id or "<format_id>",
index=sample_index,
)
log(hint, file=sys.stderr)
except Exception:
pass
except Exception:
pass
return 0
# Bulk preflight for playlist selections (per-entry URLs): check all URLs once before downloading.
skip_per_url_preflight = False
if len(supported_url) > 1:
if not self._preflight_url_duplicates_bulk(
storage=storage,
hydrus_available=hydrus_available,
final_output_dir=final_output_dir,
urls=list(supported_url),
):
return 0
skip_per_url_preflight = True
# Playlist-level format preflight: if the batch has only one available format,
# discover it once and force it for every item. This avoids per-item failures
# and per-item --list-formats calls (e.g. Bandcamp albums).
2025-12-22 02:11:53 -08:00
forced_single_format_id: Optional[str] = None
forced_single_format_for_batch = False
if len(supported_url) > 1 and not playlist_items and not ytdl_format:
try:
sample_url = str(supported_url[0])
fmts = self._list_formats_cached(
sample_url,
playlist_items_value=None,
formats_cache=formats_cache,
ytdlp_tool=ytdlp_tool,
)
if isinstance(fmts, list) and len(fmts) == 1 and isinstance(fmts[0], dict):
only_id = str(fmts[0].get("format_id") or "").strip()
if only_id:
forced_single_format_id = only_id
forced_single_format_for_batch = True
debug(
f"Playlist format preflight: only one format available; using {forced_single_format_id} for all items"
)
except Exception:
forced_single_format_id = None
forced_single_format_for_batch = False
early_ret = self._maybe_show_format_table_for_single_url(
mode=mode,
clip_spec=clip_spec,
clip_values=clip_values,
playlist_items=playlist_items,
ytdl_format=ytdl_format,
supported_url=supported_url,
playlist_selection_handled=playlist_selection_handled,
ytdlp_tool=ytdlp_tool,
formats_cache=formats_cache,
storage=storage,
hydrus_available=hydrus_available,
final_output_dir=final_output_dir,
args=args,
)
if early_ret is not None:
return int(early_ret)
return self._download_supported_urls(
supported_url=supported_url,
ytdlp_tool=ytdlp_tool,
args=args,
config=config,
final_output_dir=final_output_dir,
mode=mode,
clip_spec=clip_spec,
clip_ranges=clip_ranges,
query_hash_override=query_hash_override,
embed_chapters=embed_chapters,
write_sub=write_sub,
quiet_mode=quiet_mode,
playlist_items=playlist_items,
ytdl_format=ytdl_format,
skip_per_url_preflight=skip_per_url_preflight,
forced_single_format_id=forced_single_format_id,
forced_single_format_for_batch=forced_single_format_for_batch,
formats_cache=formats_cache,
storage=storage,
hydrus_available=hydrus_available,
)
except Exception as e:
log(f"Error in download-media: {e}", file=sys.stderr)
return 1
def _resolve_output_dir(self, parsed: Dict[str, Any], config: Dict[str, Any]) -> Optional[Path]:
"""Resolve the output directory.
Rules:
- If user passes `-path`, use that directory (override).
- Otherwise default to a temp directory (config["temp"] if present, else OS temp).
"""
# Priority 1: explicit output directory override
path_override = parsed.get("path")
if path_override:
try:
2025-12-16 01:45:01 -08:00
candidate = Path(str(path_override)).expanduser()
# If user passed a file path, treat its parent as output dir.
if candidate.suffix:
candidate = candidate.parent
candidate.mkdir(parents=True, exist_ok=True)
debug(f"Using output directory override: {candidate}")
return candidate
2025-12-11 12:47:30 -08:00
except Exception as e:
2025-12-16 01:45:01 -08:00
log(f"Invalid -path output directory: {e}", file=sys.stderr)
return None
2025-12-16 01:45:01 -08:00
# Priority 2: config-provided temp/output directory
2025-12-13 00:18:30 -08:00
try:
temp_value = (config or {}).get("temp") if isinstance(config, dict) else None
except Exception:
temp_value = None
if temp_value:
try:
candidate = Path(str(temp_value)).expanduser()
candidate.mkdir(parents=True, exist_ok=True)
debug(f"Using config temp directory: {candidate}")
return candidate
except Exception as e:
log(
f"Cannot use configured temp directory '{temp_value}': {e}",
file=sys.stderr
)
return None
2025-12-11 12:47:30 -08:00
2025-12-16 01:45:01 -08:00
# Priority 3: OS temp fallback
2025-12-11 12:47:30 -08:00
try:
2025-12-16 01:45:01 -08:00
candidate = Path(tempfile.gettempdir()) / "Medios-Macina"
candidate.mkdir(parents=True, exist_ok=True)
debug(f"Using OS temp directory: {candidate}")
return candidate
2025-12-11 12:47:30 -08:00
except Exception as e:
2025-12-16 01:45:01 -08:00
log(f"Cannot create OS temp directory: {e}", file=sys.stderr)
2025-12-11 12:47:30 -08:00
return None
def _parse_time_ranges(self, spec: str) -> List[tuple[int, int]]:
"""Parse clip specs into a list of (start_seconds, end_seconds).
2025-12-11 12:47:30 -08:00
2025-12-16 01:45:01 -08:00
Supported inputs:
- "MM:SS-MM:SS"
- "HH:MM:SS-HH:MM:SS"
- seconds: "280-300"
- multiple ranges separated by commas: "4:40-5:00,5:15-5:25"
"""
def _to_seconds(ts: str) -> Optional[int]:
ts = str(ts).strip()
if not ts:
2025-12-11 12:47:30 -08:00
return None
2025-12-16 01:45:01 -08:00
2025-12-20 02:12:45 -08:00
# Support compact units like 3m4s, 1h22m, 1h2m3s
# (case-insensitive; seconds may be fractional but are truncated to int)
try:
unit_match = re.fullmatch(
r"(?i)\s*(?:(?P<h>\d+)h)?\s*(?:(?P<m>\d+)m)?\s*(?:(?P<s>\d+(?:\.\d+)?)s)?\s*",
ts,
)
except Exception:
unit_match = None
if (unit_match and unit_match.group(0).strip()
and any(unit_match.group(g) for g in ("h", "m", "s"))):
try:
hours = int(unit_match.group("h") or 0)
minutes = int(unit_match.group("m") or 0)
seconds = float(unit_match.group("s") or 0)
total = (hours * 3600) + (minutes * 60) + seconds
return int(total)
except Exception:
return None
2025-12-16 01:45:01 -08:00
if ":" in ts:
parts = [p.strip() for p in ts.split(":")]
if len(parts) == 2:
hh_s = "0"
mm_s, ss_s = parts
elif len(parts) == 3:
hh_s, mm_s, ss_s = parts
else:
return None
try:
hours = int(hh_s)
minutes = int(mm_s)
seconds = float(ss_s)
total = (hours * 3600) + (minutes * 60) + seconds
return int(total)
except Exception:
return None
try:
return int(float(ts))
except Exception:
return None
ranges: List[tuple[int, int]] = []
if not spec:
return ranges
for piece in str(spec).split(","):
piece = piece.strip()
if not piece:
continue
if "-" not in piece:
return []
start_s, end_s = [p.strip() for p in piece.split("-", 1)]
start = _to_seconds(start_s)
end = _to_seconds(end_s)
if start is None or end is None or start >= end:
return []
ranges.append((start, end))
return ranges
@staticmethod
def _parse_keyed_csv_spec(spec: str, *, default_key: str) -> Dict[str, List[str]]:
"""Parse comma-separated values with optional sticky `key:` prefixes.
Example:
clip:3m4s-3m14s,1h22m-1h33m,item:2-3
Rules:
- Items are split on commas.
- If an item begins with `key:` then key becomes active for subsequent items.
- If an item has no `key:` prefix, it belongs to the last active key.
- If no key has been set yet, values belong to default_key.
"""
out: Dict[str, List[str]] = {}
if not isinstance(spec, str):
spec = str(spec)
text = spec.strip()
if not text:
return out
active = (default_key or "").strip().lower() or "clip"
key_pattern = re.compile(r"^([A-Za-z_][A-Za-z0-9_-]*)\s*:\s*(.*)$")
for raw_piece in text.split(","):
piece = raw_piece.strip()
if not piece:
continue
m = key_pattern.match(piece)
if m:
active = (m.group(1) or "").strip().lower() or active
value = (m.group(2) or "").strip()
if value:
out.setdefault(active, []).append(value)
continue
out.setdefault(active, []).append(piece)
return out
def _build_clip_sections_spec(
self,
clip_ranges: Optional[List[tuple[int, int]]],
) -> Optional[str]:
"""Convert parsed clip ranges into the downloader's sections spec (seconds)."""
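# e.g. [(280, 300), (315, 325)] -> "280-300,315-325"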
ranges: List[str] = []
2025-12-16 01:45:01 -08:00
if clip_ranges:
for start_s, end_s in clip_ranges:
ranges.append(f"{start_s}-{end_s}")
2025-12-11 12:47:30 -08:00
return ",".join(ranges) if ranges else None
def _build_pipe_object(self, download_result: Any, url: str, opts: DownloadOptions) -> Dict[str, Any]:
"""Create a PipeObject-compatible dict from a DownloadMediaResult."""
info: Dict[str, Any] = (download_result.info if isinstance(download_result.info, dict) else {})
media_path = Path(download_result.path)
hash_value = download_result.hash_value or self._compute_file_hash(media_path)
title = info.get("title") or media_path.stem
tag = list(download_result.tag or [])

# Add title tag for searchability
if title and f"title:{title}" not in tag:
tag.insert(0, f"title:{title}")

# Store the canonical URL for de-dup/search purposes.
# Prefer yt-dlp's webpage_url, and do not mix in the raw requested URL (which may contain timestamps).
final_url = None
try:
page_url = info.get("webpage_url") or info.get("original_url") or info.get("url")
if page_url:
final_url = str(page_url)
except Exception:
final_url = None
if not final_url and url:
final_url = str(url)
# Construct canonical PipeObject dict: hash, store, path, url, title, tags
# Prefer explicit backend names (storage_name/storage_location). If none, default to PATH
# which indicates the file is available at a filesystem path and hasn't been added to a backend yet.
        return {
            "path": str(media_path),
            "hash": hash_value,
            "title": title,
            "url": final_url,
            "tag": tag,
            "action": "cmdlet:download-media",
            "is_temp": True,
            "ytdl_format": getattr(opts, "ytdl_format", None),
            # download_mode removed (deprecated); keep media_kind.
            "store": (
                getattr(opts, "storage_name", None)
                or getattr(opts, "storage_location", None)
                or "PATH"
            ),
            "media_kind": "video" if opts.mode == "video" else "audio",
        }
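
    # Minimal sketch of the emitted PipeObject shape (field values are illustrative
    # placeholders, not taken from a real download):
    #   {
    #       "path": "/tmp/example.mp4", "hash": "<sha256 hex>", "title": "Example",
    #       "url": "https://www.youtube.com/watch?v=...", "tag": ["title:Example"],
    #       "action": "cmdlet:download-media", "is_temp": True, "ytdl_format": None,
    #       "store": "PATH", "media_kind": "video",
    #   }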
@staticmethod
def _normalise_hash_hex(value: Optional[str]) -> Optional[str]:
if not value or not isinstance(value, str):
return None
candidate = value.strip().lower()
if len(candidate) == 64 and all(c in "0123456789abcdef" for c in candidate):
return candidate
return None
@classmethod
def _extract_hash_from_search_hit(cls, hit: Any) -> Optional[str]:
if not isinstance(hit, dict):
return None
for key in ("hash", "hash_hex", "file_hash", "hydrus_hash"):
v = hit.get(key)
normalized = cls._normalise_hash_hex(str(v) if v is not None else None)
if normalized:
return normalized
return None
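
    # For illustration (hex digests are placeholders): hits shaped like
    # {"hash": "AB" * 32} or {"file_hash": "ab" * 32} both normalise to the same
    # lowercase 64-character digest, while short or non-hex values are ignored.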
@classmethod
def _find_existing_hash_for_url(
cls,
storage: Any,
canonical_url: str,
*,
hydrus_available: bool,
) -> Optional[str]:
"""Best-effort lookup of an existing stored item hash by url:<canonical_url>.
Used to make the stored source video the king for multi-clip relationships.
"""
if storage is None or not canonical_url:
return None
try:
from Store.HydrusNetwork import HydrusNetwork
except Exception:
HydrusNetwork = None # type: ignore
try:
backend_names = list(storage.list_searchable_backends() or [])
except Exception:
backend_names = []
for backend_name in backend_names:
try:
backend = storage[backend_name]
except Exception:
continue
try:
if str(backend_name).strip().lower() == "temp":
continue
except Exception:
pass
try:
                if (
                    HydrusNetwork is not None
                    and isinstance(backend, HydrusNetwork)
                    and not hydrus_available
                ):
                    continue
except Exception:
pass
try:
hits = backend.search(f"url:{canonical_url}", limit=5) or []
except Exception:
hits = []
for hit in hits:
extracted = cls._extract_hash_from_search_hit(hit)
if extracted:
return extracted
return None
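
    # The lookup above is deliberately conservative: every searchable backend (except
    # "temp", and except Hydrus when it is unreachable) is queried with
    # backend.search(f"url:{canonical_url}", limit=5), the first hit that yields a
    # valid SHA-256 hex wins, and any backend error is swallowed so a missing store
    # never blocks the download itself.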
@staticmethod
def _format_timecode(seconds: int, *, force_hours: bool) -> str:
total = max(0, int(seconds))
minutes, secs = divmod(total, 60)
hours, minutes = divmod(minutes, 60)
if force_hours:
return f"{hours:02d}:{minutes:02d}:{secs:02d}"
return f"{minutes:02d}:{secs:02d}"
@classmethod
def _format_clip_range(cls, start_s: int, end_s: int) -> str:
force_hours = bool(start_s >= 3600 or end_s >= 3600)
        start_tc = cls._format_timecode(start_s, force_hours=force_hours)
        end_tc = cls._format_timecode(end_s, force_hours=force_hours)
        return f"{start_tc}-{end_tc}"
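
    # For illustration: _format_clip_range(75, 130) -> "01:15-02:10", while any range
    # reaching the one-hour mark switches both ends to HH:MM:SS, e.g.
    # _format_clip_range(3600, 3725) -> "01:00:00-01:02:05".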
@classmethod
def _apply_clip_decorations(
cls,
        pipe_objects: List[Dict[str, Any]],
        clip_ranges: List[tuple[int, int]],
*,
source_king_hash: Optional[str],
) -> None:
"""Apply clip:{range} tags/titles and relationship metadata for multi-clip downloads.

        - Sets the clip title (and title: tag) to exactly `clip:{range}`.
        - Adds a `clip:{range}` tag.
        - Sets `relationships` on each emitted item (king hash first, then alt hashes)
          so downstream can persist relationships into a DB/API without storing
          relationship tags.
        """
if not pipe_objects or len(pipe_objects) != len(clip_ranges):
return
# Always apply clip titles/tags (even for a single clip).
for po, (start_s, end_s) in zip(pipe_objects, clip_ranges):
clip_range = cls._format_clip_range(start_s, end_s)
clip_tag = f"clip:{clip_range}"
# Title: make it generic/consistent for clips.
po["title"] = clip_tag
tags = po.get("tag")
if not isinstance(tags, list):
tags = []
# Replace any existing title: tags with title:<clip_tag>
tags = [t for t in tags if not str(t).strip().lower().startswith("title:")]
# Relationships must not be stored as tags.
tags = [
t for t in tags
if not str(t).strip().lower().startswith("relationship:")
]
tags.insert(0, f"title:{clip_tag}")
# Ensure clip tag exists
if clip_tag not in tags:
tags.append(clip_tag)
po["tag"] = tags
# Relationship tagging only makes sense when multiple clips exist.
if len(pipe_objects) < 2:
return
hashes: List[str] = []
for po in pipe_objects:
h = cls._normalise_hash_hex(str(po.get("hash") or ""))
hashes.append(h or "")
# Determine king: prefer an existing source video hash if present; else first clip becomes king.
        king_hash = (
            cls._normalise_hash_hex(source_king_hash) if source_king_hash else None
        )
if not king_hash:
king_hash = hashes[0] if hashes and hashes[0] else None
if not king_hash:
return
alt_hashes: List[str] = [h for h in hashes if h and h != king_hash]
if not alt_hashes:
return
        # Carry relationship metadata through the pipeline without using tags.
        # Each item gets its own dict so later mutations do not leak between clips.
        for po in pipe_objects:
            po["relationships"] = {
                "king": [king_hash],
                "alt": list(alt_hashes),
            }
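
    # Sketch of the resulting decoration for two clips (hashes are placeholders): each
    # PipeObject gets a title and tag of the form "clip:01:15-02:10", plus a shared
    # relationships payload such as
    #   {"king": ["<source or first-clip sha256>"], "alt": ["<other clip sha256>"]}
    # so downstream stages can persist the grouping without relationship tags.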
    def _compute_file_hash(self, filepath: Path) -> str:
        """Compute the SHA-256 hash of a file, reading it in 4 KiB chunks."""
        # hashlib is already imported at module level; no local import needed.
        sha256_hash = hashlib.sha256()
        with open(filepath, "rb") as f:
            for byte_block in iter(lambda: f.read(4096), b""):
                sha256_hash.update(byte_block)
        return sha256_hash.hexdigest()


# Module-level singleton registration
CMDLET = Download_Media()