"""Download media files using yt-dlp with support for direct file downloads.
Lean, focused downloader without event infrastructure overhead.
- yt-dlp integration for streaming sites
- Direct file download fallback for PDFs, images, documents
- Tag extraction via metadata.extract_ytdlp_tags()
- Logging via helper.logger.log()
"""
from __future__ import annotations
import glob # noqa: F401
import hashlib
import json
import random
import re
import string
import subprocess
import sys
import time
import traceback
from pathlib import Path
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urljoin, urlparse
import httpx
from SYS.logger import log, debug
from SYS.utils import ensure_directory, sha256_file
from API.HTTP import HTTPClient
from models import DownloadError, DownloadOptions, DownloadMediaResult, DebugLogger, ProgressBar
try:
import yt_dlp # type: ignore
from yt_dlp.extractor import gen_extractors # type: ignore
except Exception as exc:
yt_dlp = None # type: ignore
YTDLP_IMPORT_ERROR = exc
else:
YTDLP_IMPORT_ERROR = None
try:
from metadata import extract_ytdlp_tags
except ImportError:
extract_ytdlp_tags = None
_EXTRACTOR_CACHE: List[Any] | None = None
def _ensure_yt_dlp_ready() -> None:
"""Verify yt-dlp is available, raise if not."""
if yt_dlp is not None:
return
detail = str(YTDLP_IMPORT_ERROR or "yt-dlp is not installed")
raise DownloadError(f"yt-dlp module not available: {detail}")
def _progress_callback(status: Dict[str, Any]) -> None:
"""Simple progress callback using logger."""
event = status.get("status")
if event == "downloading":
percent = status.get("_percent_str", "?")
speed = status.get("_speed_str", "?")
eta = status.get("_eta_str", "?")
sys.stdout.write(f"\r[download] {percent} at {speed} ETA {eta} ")
sys.stdout.flush()
elif event == "finished":
sys.stdout.write("\r" + " " * 70 + "\r")
sys.stdout.flush()
debug(f"✓ Download finished: {status.get('filename')}")
elif event in ("postprocessing", "processing"):
debug(f"Post-processing: {status.get('postprocessor')}")
def is_url_supported_by_ytdlp(url: str) -> bool:
"""Check if URL is supported by yt-dlp."""
if yt_dlp is None:
return False
global _EXTRACTOR_CACHE
if _EXTRACTOR_CACHE is None:
try:
            _EXTRACTOR_CACHE = list(gen_extractors())
except Exception:
_EXTRACTOR_CACHE = []
for extractor in _EXTRACTOR_CACHE:
try:
if not extractor.suitable(url):
continue
except Exception:
continue
name = getattr(extractor, "IE_NAME", "")
if name.lower() == "generic":
continue
return True
return False
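# Illustrative behaviour:
#   is_url_supported_by_ytdlp("https://www.youtube.com/watch?v=dQw4w9WgXcQ")  # True (if yt-dlp is installed)
#   is_url_supported_by_ytdlp("https://example.com/file.pdf")  # False: only the generic extractor matches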
def list_formats(url: str, no_playlist: bool = False, playlist_items: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
"""Get list of available formats for a URL using yt-dlp."""
_ensure_yt_dlp_ready()
try:
ydl_opts = {
"quiet": True,
"no_warnings": True,
"socket_timeout": 30,
}
if no_playlist:
ydl_opts["noplaylist"] = True
if playlist_items:
ydl_opts["playlist_items"] = playlist_items
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
debug(f"Fetching format list for: {url}")
            info = ydl.extract_info(url, download=False)
            formats = (info or {}).get("formats", [])
if not formats:
log("No formats available", file=sys.stderr)
return None
result_formats = []
for fmt in formats:
result_formats.append({
"format_id": fmt.get("format_id", ""),
"format": fmt.get("format", ""),
"ext": fmt.get("ext", ""),
"resolution": fmt.get("resolution", ""),
"width": fmt.get("width"),
"height": fmt.get("height"),
"fps": fmt.get("fps"),
"vcodec": fmt.get("vcodec", "none"),
"acodec": fmt.get("acodec", "none"),
"filesize": fmt.get("filesize"),
"tbr": fmt.get("tbr"),
})
debug(f"Found {len(result_formats)} available formats")
return result_formats
except Exception as e:
log(f"✗ Error fetching formats: {e}", file=sys.stderr)
return None
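# Each returned entry mirrors yt-dlp's format dict, e.g. (illustrative values):
#   {"format_id": "137", "format": "137 - 1920x1080 (1080p)", "ext": "mp4",
#    "resolution": "1920x1080", "width": 1920, "height": 1080, "fps": 30,
#    "vcodec": "avc1.640028", "acodec": "none", "filesize": 12345678, "tbr": 4485.0}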
def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sections: List[str], quiet: bool = False) -> tuple[str, Dict[str, Any]]:
"""Download each section separately so merge-file can combine them.
    yt-dlp merges multiple --download-sections args into one file, but merge-file
    needs separate files, so we download each section individually.
Uses hash-based filenames for sections (not title-based) to prevent yt-dlp from
thinking sections are already downloaded. The title is extracted and stored in tags.
Returns:
(session_id, first_section_info_dict) - session_id for finding files, info dict for metadata extraction
"""
    # Prefer the explicit sections argument; fall back to the options dict.
    sections_list = sections or ytdl_options.get("download_sections", [])
if not sections_list:
return "", {}
# Generate a unique hash-based ID for this download session
# This ensures different videos/downloads don't have filename collisions
session_id = hashlib.md5(
(url + str(time.time()) + ''.join(random.choices(string.ascii_letters, k=10))).encode()
).hexdigest()[:12]
first_section_info = None
title_from_first = None
# Download each section separately with unique output template using session ID
for section_idx, section in enumerate(sections_list, 1):
# Build unique output template for this section using session-based filename
# e.g., "{session_id}_{section_idx}.ext" - simple and unique per section
base_outtmpl = ytdl_options.get("outtmpl", "%(title)s.%(ext)s")
output_dir_path = Path(base_outtmpl).parent
# Use session_id + section index for temp filename
# e.g., "/path/{session_id}_1.%(ext)s"
filename_tmpl = f"{session_id}_{section_idx}"
if base_outtmpl.endswith(".%(ext)s"):
filename_tmpl += ".%(ext)s"
# Use Path to handle separators correctly for the OS
section_outtmpl = str(output_dir_path / filename_tmpl)
# For the first section, extract metadata first (separate call)
if section_idx == 1:
metadata_cmd = ["yt-dlp", "--dump-json", "--skip-download"]
if ytdl_options.get("cookiefile"):
cookies_path = ytdl_options["cookiefile"].replace("\\", "/")
metadata_cmd.extend(["--cookies", cookies_path])
if ytdl_options.get("noplaylist"):
metadata_cmd.append("--no-playlist")
metadata_cmd.append(url)
try:
meta_result = subprocess.run(metadata_cmd, capture_output=True, text=True)
if meta_result.returncode == 0 and meta_result.stdout:
try:
info_dict = json.loads(meta_result.stdout.strip())
first_section_info = info_dict
title_from_first = info_dict.get('title')
if not quiet:
debug(f"Extracted title from metadata: {title_from_first}")
except json.JSONDecodeError:
if not quiet:
debug("Could not parse JSON metadata")
except Exception as e:
if not quiet:
debug(f"Error extracting metadata: {e}")
# Build yt-dlp command for downloading this section
cmd = ["yt-dlp"]
# Add format
if ytdl_options.get("format"):
cmd.extend(["-f", ytdl_options["format"]])
# Add ONLY this section (not all sections)
cmd.extend(["--download-sections", section])
# Add force-keyframes-at-cuts if specified
if ytdl_options.get("force_keyframes_at_cuts"):
cmd.append("--force-keyframes-at-cuts")
# Add output template for this section
cmd.extend(["-o", section_outtmpl])
# Add cookies file if present
if ytdl_options.get("cookiefile"):
# Convert backslashes to forward slashes for better compatibility
cookies_path = ytdl_options["cookiefile"].replace("\\", "/")
cmd.extend(["--cookies", cookies_path])
# Add no-playlist if specified
if ytdl_options.get("noplaylist"):
cmd.append("--no-playlist")
# Add the URL
cmd.append(url)
if not quiet:
debug(f"Running yt-dlp for section {section_idx}/{len(sections_list)}: {section}")
debug(f"Command: {' '.join(cmd)}")
# Run the subprocess - don't capture output so progress is shown
try:
result = subprocess.run(cmd)
if result.returncode != 0:
raise DownloadError(f"yt-dlp subprocess failed for section {section_idx} with code {result.returncode}")
except Exception as exc:
raise DownloadError(f"yt-dlp subprocess error for section {section_idx}: {exc}") from exc
return session_id, first_section_info or {}
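# Illustrative outcome: a session_id of "a1b2c3d4e5f6" with two sections yields
#   <output_dir>/a1b2c3d4e5f6_1.mp4 and <output_dir>/a1b2c3d4e5f6_2.mp4
# which the caller can locate by session_id and hand to merge-file.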
def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]:
"""Build yt-dlp download options."""
ensure_directory(opts.output_dir)
    # Build output template.
    # When sections are requested, _download_with_sections_via_cli replaces this with
    # a session-based template ("{session_id}_{n}.%(ext)s") for each section.
outtmpl = str((opts.output_dir / "%(title)s.%(ext)s").resolve())
base_options: Dict[str, Any] = {
"outtmpl": outtmpl,
"quiet": True,
"no_warnings": True,
"noprogress": True,
"socket_timeout": 30,
"retries": 10,
"fragment_retries": 10,
"http_chunk_size": 10_485_760,
"restrictfilenames": True,
"progress_hooks": [] if opts.quiet else [_progress_callback],
}
if opts.cookies_path and opts.cookies_path.is_file():
base_options["cookiefile"] = str(opts.cookies_path)
else:
# Fallback to browser cookies
base_options["cookiesfrombrowser"] = ("chrome",)
# Add no-playlist option if specified (for single video from playlist url)
if opts.no_playlist:
base_options["noplaylist"] = True
# Configure based on mode
if opts.mode == "audio":
base_options["format"] = opts.ytdl_format or "251/140/bestaudio"
base_options["postprocessors"] = [{"key": "FFmpegExtractAudio"}]
else: # video
base_options["format"] = opts.ytdl_format or "bestvideo+bestaudio/best"
base_options["format_sort"] = [
"res:4320", "res:2880", "res:2160", "res:1440", "res:1080", "res:720", "res"
]
# Add clip sections if provided (yt-dlp will download only these sections)
if opts.clip_sections:
        # Parse section ranges like "48-65,120-152,196-205" (seconds) and convert
        # each to yt-dlp's download-sections syntax: "*HH:MM:SS-HH:MM:SS".
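        # e.g. "48-65,120-152" -> ["*00:00:48-00:01:05", "*00:02:00-00:02:32"]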
        def sec_to_hhmmss(seconds: float) -> str:
            # Convert seconds to HH:MM:SS for yt-dlp's section syntax.
            hours = int(seconds // 3600)
            minutes = int((seconds % 3600) // 60)
            secs = int(seconds % 60)
            return f"{hours:02d}:{minutes:02d}:{secs:02d}"
        sections = []
        for section_range in opts.clip_sections.split(','):
            try:
                start_str, end_str = section_range.strip().split('-')
                start_time = sec_to_hhmmss(float(start_str))
                end_time = sec_to_hhmmss(float(end_str))
                sections.append(f"*{start_time}-{end_time}")
            except (ValueError, AttributeError):
                debug(f"Skipping malformed clip section: {section_range!r}")
if sections:
# Pass each section as a separate element in the list (yt-dlp expects multiple --download-sections args)
base_options["download_sections"] = sections
debug(f"Download sections configured: {', '.join(sections)}")
# Note: Not using --force-keyframes-at-cuts to avoid re-encoding
# This may result in less precise cuts but faster downloads
# Add playlist items selection if provided
if opts.playlist_items:
base_options["playlist_items"] = opts.playlist_items
if not opts.quiet:
debug(f"yt-dlp: mode={opts.mode}, format={base_options.get('format')}")
return base_options
def _iter_download_entries(info: Dict[str, Any]) -> Iterator[Dict[str, Any]]:
"""Iterate through download entries, handling playlists."""
queue: List[Dict[str, Any]] = [info]
seen: set[int] = set()
while queue:
current = queue.pop(0)
obj_id = id(current)
if obj_id in seen:
continue
seen.add(obj_id)
entries = current.get("entries")
if isinstance(entries, list):
for entry in entries:
if isinstance(entry, dict):
queue.append(entry)
if current.get("requested_downloads") or not entries:
yield current
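# A playlist info dict nests its entries, e.g. (illustrative shape):
#   {"title": "My playlist", "entries": [
#       {"title": "Clip 1", "requested_downloads": [{"filepath": "/out/clip1.mp4"}]},
#       {"title": "Clip 2", "requested_downloads": [{"filepath": "/out/clip2.mp4"}]}]}
# _iter_download_entries yields only the two leaf entries; the wrapper is skipped
# because it has "entries" but no "requested_downloads".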
def _candidate_paths(entry: Dict[str, Any], output_dir: Path) -> Iterator[Path]:
"""Get candidate file paths for downloaded media."""
requested = entry.get("requested_downloads")
if isinstance(requested, list):
for item in requested:
if isinstance(item, dict):
for key in ("filepath", "_filename", "filename"):
value = item.get(key)
if value:
yield Path(value)
for key in ("filepath", "_filename", "filename"):
value = entry.get(key)
if value:
yield Path(value)
if entry.get("filename"):
yield output_dir / entry["filename"]
def _resolve_entry_and_path(info: Dict[str, Any], output_dir: Path) -> tuple[Dict[str, Any], Path]:
"""Find downloaded file in yt-dlp metadata."""
for entry in _iter_download_entries(info):
for candidate in _candidate_paths(entry, output_dir):
if candidate.is_file():
return entry, candidate
if not candidate.is_absolute():
resolved = output_dir / candidate
if resolved.is_file():
return entry, resolved
raise FileNotFoundError("yt-dlp did not report a downloaded media file")
def _extract_sha256(info: Dict[str, Any]) -> Optional[str]:
"""Extract SHA256 hash from yt-dlp metadata."""
    for payload in [info] + list(info.get("entries") or []):
if not isinstance(payload, dict):
continue
hashes = payload.get("hashes")
if isinstance(hashes, dict):
for key in ("sha256", "sha-256", "sha_256"):
value = hashes.get(key)
if isinstance(value, str) and value.strip():
return value.strip().lower()
for key in ("sha256", "sha-256", "sha_256"):
value = payload.get(key)
if isinstance(value, str) and value.strip():
return value.strip().lower()
return None
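# Both shapes are accepted (illustrative):
#   {"hashes": {"sha256": "deadbeef..."}}  # nested under "hashes"
#   {"sha256": "deadbeef..."}              # top-level on info or any playlist entry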
def _get_libgen_download_url(libgen_url: str) -> Optional[str]:
"""Extract the actual download link from LibGen redirect URL.
    A LibGen URL like https://libgen.gl/file.php?id=123456 redirects to an
    actual mirror URL. This follows the redirect chain to get the real file.
Args:
libgen_url: LibGen file.php URL
Returns:
Actual download URL or None if extraction fails
"""
try:
        import requests  # local import: requests is only needed for LibGen resolution
# Check if this is a LibGen URL
parsed = urlparse(libgen_url)
if 'libgen' not in parsed.netloc.lower():
return None
if '/file.php' not in parsed.path.lower():
return None
# LibGen redirects to actual mirrors, follow redirects to get final URL
session = requests.Session()
session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
})
debug(f"Following LibGen redirect chain for: {libgen_url}")
# First, get the page and look for direct download link
try:
response = session.get(libgen_url, timeout=10, allow_redirects=True)
final_url = response.url
# Try to find actual download link in the page
try:
try:
from lxml import html as lxml_html
except ImportError:
lxml_html = None
if lxml_html is not None:
doc = lxml_html.fromstring(response.content)
for a in doc.xpath("//a[@href]"):
href = str(a.get("href") or "").strip()
if not href:
continue
href_lower = href.lower()
if "get.php" in href_lower or href_lower.endswith((".pdf", ".epub", ".djvu", ".mobi")):
download_url = href if href.startswith("http") else urljoin(final_url, href)
debug(f"Found download link: {download_url}")
return download_url
else:
# Regex fallback
for m in re.finditer(
r"href=[\"\']([^\"\']+)[\"\']",
response.text or "",
flags=re.IGNORECASE,
):
href = str(m.group(1) or "").strip()
if not href or href.lower().startswith("javascript:"):
continue
href_lower = href.lower()
if "get.php" in href_lower or href_lower.endswith((".pdf", ".epub", ".djvu", ".mobi")):
download_url = href if href.startswith("http") else urljoin(final_url, href)
debug(f"Found download link: {download_url}")
return download_url
except Exception:
pass
# If we followed redirects successfully, return the final URL
# This handles cases where libgen redirects to a direct download mirror
if final_url != libgen_url:
debug(f"LibGen resolved to mirror: {final_url}")
return final_url
except requests.RequestException as e:
log(f"Error following LibGen redirects: {e}", file=sys.stderr)
# Try head request as fallback
try:
response = session.head(libgen_url, allow_redirects=True, timeout=10)
if response.url != libgen_url:
debug(f"LibGen HEAD resolved to: {response.url}")
return response.url
            except Exception:
pass
return None
except Exception as e:
log(f"Error resolving LibGen URL: {e}", file=sys.stderr)
return None
def _download_direct_file(
url: str,
output_dir: Path,
debug_logger: Optional[DebugLogger] = None,
quiet: bool = False,
suggested_filename: Optional[str] = None,
) -> DownloadMediaResult:
"""Download a direct file (PDF, image, document, etc.) without yt-dlp."""
ensure_directory(output_dir)
    from urllib.parse import unquote, parse_qs  # urlparse and re are already imported at module level
def _sanitize_filename(name: str) -> str:
# Windows-safe filename sanitization.
# Keep it simple: strip path parts, drop invalid chars, collapse whitespace.
text = str(name or "").strip()
if not text:
return ""
# Remove any path components
text = text.replace("/", "\\")
text = text.split("\\")[-1]
invalid = set('<>:"/\\|?*')
cleaned_chars: List[str] = []
for ch in text:
o = ord(ch)
if o < 32:
cleaned_chars.append(" ")
continue
if ch in invalid:
cleaned_chars.append(" ")
continue
cleaned_chars.append(ch)
cleaned = " ".join("".join(cleaned_chars).split()).strip()
# Avoid trailing dots/spaces on Windows
cleaned = cleaned.rstrip(" .")
return cleaned
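    # e.g. _sanitize_filename('report: v2*final.pdf') -> 'report v2 final.pdf'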
def _unique_path(path: Path) -> Path:
if not path.exists():
return path
stem = path.stem
suffix = path.suffix
parent = path.parent
for i in range(1, 10_000):
candidate = parent / f"{stem} ({i}){suffix}"
if not candidate.exists():
return candidate
return parent / f"{stem} ({int(time.time())}){suffix}"
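    # e.g. if "report.pdf" exists, returns "report (1).pdf", then "report (2).pdf", ...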
# Extract filename from URL
parsed_url = urlparse(url)
url_path = parsed_url.path
# Try to get filename from query parameters first (for LibGen and similar services)
# e.g., ?filename=Book+Title.pdf or &download=filename.pdf
filename = None
if parsed_url.query:
query_params = parse_qs(parsed_url.query)
for param_name in ('filename', 'download', 'file', 'name'):
if param_name in query_params and query_params[param_name]:
filename = query_params[param_name][0]
filename = unquote(filename)
break
# If not found in query params, extract from URL path
if not filename or not filename.strip():
filename = url_path.split("/")[-1] if url_path else ""
filename = unquote(filename)
# Remove query strings from filename if any
if "?" in filename:
filename = filename.split("?")[0]
# Try to get real filename from Content-Disposition header (HEAD request)
try:
with HTTPClient(timeout=10.0) as client:
response = client._request("HEAD", url, follow_redirects=True)
content_disposition = response.headers.get("content-disposition", "")
if content_disposition:
# Extract filename from Content-Disposition header
# Format: attachment; filename="filename.pdf" or filename=filename.pdf
match = re.search(r'filename\*?=(?:"([^"]*)"|([^;\s]*))', content_disposition)
if match:
extracted_name = match.group(1) or match.group(2)
if extracted_name:
filename = unquote(extracted_name)
if not quiet:
debug(f"Filename from Content-Disposition: {filename}")
except Exception as e:
if not quiet:
log(f"Could not get filename from headers: {e}", file=sys.stderr)
# Apply suggested filename (from provider title) if given.
suggested = _sanitize_filename(suggested_filename) if suggested_filename else ""
if suggested:
# Preserve extension from suggested name if present; otherwise borrow from detected filename.
suggested_path = Path(suggested)
if suggested_path.suffix:
filename = suggested
else:
detected_ext = ""
try:
detected_ext = Path(str(filename)).suffix
except Exception:
detected_ext = ""
if detected_ext:
filename = suggested + detected_ext
else:
filename = suggested
# Final fallback if we still don't have a good filename
if not filename or "." not in filename:
filename = "downloaded_file.bin"
file_path = _unique_path(output_dir / filename)
progress_bar = ProgressBar()
if not quiet:
debug(f"Direct download: {filename}")
try:
start_time = time.time()
downloaded_bytes = [0]
total_bytes = [0]
last_progress_time = [start_time]
def progress_callback(bytes_downloaded: int, content_length: int) -> None:
downloaded_bytes[0] = bytes_downloaded
total_bytes[0] = content_length
now = time.time()
if now - last_progress_time[0] < 0.5:
return
elapsed = now - start_time
percent = (bytes_downloaded / content_length) * 100 if content_length > 0 else 0
speed = bytes_downloaded / elapsed if elapsed > 0 else 0
eta_str: Optional[str] = None
if content_length > 0 and speed > 0:
try:
eta_seconds = max(0.0, float(content_length - bytes_downloaded) / float(speed))
minutes, seconds = divmod(int(eta_seconds), 60)
hours, minutes = divmod(minutes, 60)
eta_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
except Exception:
eta_str = None
speed_str = progress_bar.format_bytes(speed) + "/s"
progress_line = progress_bar.format_progress(
percent_str=f"{percent:.1f}%",
downloaded=bytes_downloaded,
total=content_length if content_length > 0 else None,
speed_str=speed_str,
eta_str=eta_str,
)
if not quiet:
try:
if getattr(sys.stderr, "isatty", lambda: False)():
sys.stderr.write("\r" + progress_line + " ")
sys.stderr.flush()
else:
# Non-interactive: print occasional progress lines.
log(progress_line, file=sys.stderr)
except Exception:
pass
last_progress_time[0] = now
with HTTPClient(timeout=30.0) as client:
client.download(url, str(file_path), progress_callback=progress_callback)
elapsed = time.time() - start_time
# Clear in-place progress bar.
if not quiet:
try:
if getattr(sys.stderr, "isatty", lambda: False)():
sys.stderr.write("\r" + (" " * 140) + "\r")
sys.stderr.flush()
except Exception:
pass
avg_speed_str = progress_bar.format_bytes(downloaded_bytes[0] / elapsed if elapsed > 0 else 0) + "/s"
if not quiet:
debug(f"✓ Downloaded in {elapsed:.1f}s at {avg_speed_str}")
# For direct file downloads, create minimal info dict without filename as title
# This prevents creating duplicate title: tags when filename gets auto-generated
# We'll add title back later only if we couldn't extract meaningful tags
info = {
"id": filename.rsplit(".", 1)[0],
"ext": filename.rsplit(".", 1)[1] if "." in filename else "bin",
"webpage_url": url,
}
hash_value = None
try:
hash_value = sha256_file(file_path)
except Exception:
pass
tags = []
if extract_ytdlp_tags:
try:
tags = extract_ytdlp_tags(info)
except Exception as e:
log(f"Error extracting tags: {e}", file=sys.stderr)
# Only use filename as a title tag if we couldn't extract any meaningful tags
# This prevents duplicate title: tags when the filename could be mistaken for metadata
if not any(t.startswith('title:') for t in tags):
# Re-extract tags with filename as title only if needed
info['title'] = filename
tags = []
if extract_ytdlp_tags:
try:
tags = extract_ytdlp_tags(info)
except Exception as e:
log(f"Error extracting tags with filename: {e}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"direct-file-downloaded",
{"url": url, "path": str(file_path), "hash": hash_value},
)
return DownloadMediaResult(
path=file_path,
info=info,
tag=tags,
source_url=url,
hash_value=hash_value,
)
    except httpx.HTTPError as exc:  # base class of httpx.RequestError and HTTPStatusError
log(f"Download error: {exc}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"exception",
{"phase": "direct-file", "url": url, "error": str(exc)},
)
raise DownloadError(f"Failed to download {url}: {exc}") from exc
except Exception as exc:
log(f"Error downloading file: {exc}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"exception",
{
"phase": "direct-file",
"url": url,
"error": str(exc),
"traceback": traceback.format_exc(),
},
)
raise DownloadError(f"Error downloading file: {exc}") from exc
def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) -> Optional[Dict[str, Any]]:
"""Probe URL to extract metadata WITHOUT downloading.
Args:
url: URL to probe
no_playlist: If True, ignore playlists and probe only the single video
timeout_seconds: Max seconds to wait for probe (default 15s)
Returns:
Dict with keys: extractor, title, entries (if playlist), duration, etc.
Returns None if not supported by yt-dlp or on timeout.
"""
if not is_url_supported_by_ytdlp(url):
return None
# Wrap probe in timeout to prevent hanging on large playlists
import threading
from typing import cast
result_container: List[Optional[Any]] = [None, None] # [result, error]
def _do_probe() -> None:
try:
_ensure_yt_dlp_ready()
assert yt_dlp is not None
# Extract info without downloading
# Use extract_flat='in_playlist' to get full metadata for playlist items
ydl_opts = {
"quiet": True, # Suppress all output
"no_warnings": True,
"socket_timeout": 10,
"retries": 2, # Reduce retries for faster timeout
"skip_download": True, # Don't actually download
"extract_flat": "in_playlist", # Get playlist with metadata for each entry
"noprogress": True, # No progress bars
}
# Cookies are optional for probing; callers should pass cookiefile via DownloadOptions when needed.
# Add no_playlist option if specified
if no_playlist:
ydl_opts["noplaylist"] = True
with yt_dlp.YoutubeDL(ydl_opts) as ydl: # type: ignore[arg-type]
info = ydl.extract_info(url, download=False)
if not isinstance(info, dict):
result_container[0] = None
return
# Extract relevant fields
result_container[0] = {
"extractor": info.get("extractor", ""),
"title": info.get("title", ""),
"entries": info.get("entries", []), # Will be populated if playlist
"duration": info.get("duration"),
"uploader": info.get("uploader"),
"description": info.get("description"),
"url": url,
}
except Exception as exc:
log(f"Probe error for {url}: {exc}")
result_container[1] = exc
    # Daemon thread so an abandoned (timed-out) probe cannot block interpreter exit.
    thread = threading.Thread(target=_do_probe, daemon=True)
thread.start()
thread.join(timeout=timeout_seconds)
if thread.is_alive():
# Probe timed out - return None to fall back to direct download
debug(f"Probe timeout for {url} (>={timeout_seconds}s), proceeding with download")
return None
if result_container[1] is not None:
# Probe error - return None to proceed anyway
return None
return cast(Optional[Dict[str, Any]], result_container[0])
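# Illustrative probe flow (URL hypothetical):
#   info = probe_url("https://www.youtube.com/playlist?list=...", timeout_seconds=15)
#   if info is None:
#       ...  # unsupported URL, probe error, or timeout: fall back to direct download
#   elif info["entries"]:
#       ...  # playlist: entries carry flat metadata (extract_flat="in_playlist")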
__all__ = [
"is_url_supported_by_ytdlp",
"list_formats",
"probe_url",
"DownloadError",
"DownloadOptions",
"DownloadMediaResult",
]