# Medios-Macina/SYS/download.py
"""Download media files using yt-dlp with support for direct file downloads.
Lean, focused downloader without event infrastructure overhead.
- yt-dlp integration for streaming sites
- Direct file download fallback for PDFs, images, documents
- Tag extraction via metadata.extract_ytdlp_tags()
- Logging via helper.logger.log()
"""
from __future__ import annotations
import glob  # noqa: F401
import hashlib
import json
import random
import re
import string
import subprocess
import sys
import time
import traceback
from pathlib import Path
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urljoin, urlparse
import httpx
from SYS.logger import log, debug
from SYS.utils import ensure_directory, sha256_file
from API.HTTP import HTTPClient
from SYS.models import DownloadError, DownloadOptions, DownloadMediaResult, DebugLogger, ProgressBar
try:
import yt_dlp # type: ignore
from yt_dlp.extractor import gen_extractors # type: ignore
except Exception as exc:
yt_dlp = None # type: ignore
YTDLP_IMPORT_ERROR = exc
else:
YTDLP_IMPORT_ERROR = None
try:
from SYS.metadata import extract_ytdlp_tags
except ImportError:
extract_ytdlp_tags = None
_EXTRACTOR_CACHE: List[Any] | None = None
_YTDLP_PROGRESS = ProgressBar()
def _ensure_yt_dlp_ready() -> None:
"""Verify yt-dlp is available, raise if not."""
if yt_dlp is not None:
return
detail = str(YTDLP_IMPORT_ERROR or "yt-dlp is not installed")
raise DownloadError(f"yt-dlp module not available: {detail}")
def _progress_callback(status: Dict[str, Any]) -> None:
"""Simple progress callback using logger."""
event = status.get("status")
if event == "downloading":
downloaded = status.get("downloaded_bytes")
total = status.get("total_bytes") or status.get("total_bytes_estimate")
_YTDLP_PROGRESS.update(
downloaded=int(downloaded or 0),
total=int(total) if total else None,
label="download",
file=sys.stderr,
)
elif event == "finished":
_YTDLP_PROGRESS.finish()
debug(f"✓ Download finished: {status.get('filename')}")
elif event in ("postprocessing", "processing"):
debug(f"Post-processing: {status.get('postprocessor')}")
def is_url_supported_by_ytdlp(url: str) -> bool:
"""Check if URL is supported by yt-dlp."""
if yt_dlp is None:
return False
global _EXTRACTOR_CACHE
if _EXTRACTOR_CACHE is None:
try:
            _EXTRACTOR_CACHE = list(gen_extractors())  # type: ignore[arg-type]
except Exception:
_EXTRACTOR_CACHE = []
for extractor in _EXTRACTOR_CACHE:
try:
if not extractor.suitable(url):
continue
except Exception:
continue
name = getattr(extractor, "IE_NAME", "")
if name.lower() == "generic":
continue
return True
return False
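
# Illustrative behavior of the check above (hypothetical URLs): a YouTube watch
# page matches a dedicated extractor, so it returns True; a bare PDF link only
# matches the "generic" extractor, which is skipped, so it returns False and the
# caller falls through to the direct-file download path.
#
#     is_url_supported_by_ytdlp("https://www.youtube.com/watch?v=abc123xyz")  # True
#     is_url_supported_by_ytdlp("https://example.com/paper.pdf")              # False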
def list_formats(
    url: str,
    no_playlist: bool = False,
    playlist_items: Optional[str] = None,
) -> Optional[List[Dict[str, Any]]]:
    """Get the list of available formats for a URL using yt-dlp."""
_ensure_yt_dlp_ready()
try:
ydl_opts = {
"quiet": True,
"no_warnings": True,
"socket_timeout": 30,
}
if no_playlist:
ydl_opts["noplaylist"] = True
if playlist_items:
ydl_opts["playlist_items"] = playlist_items
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
debug(f"Fetching format list for: {url}")
info = ydl.extract_info(url, download=False)
formats = info.get("formats", [])
if not formats:
log("No formats available", file=sys.stderr)
return None
result_formats = []
for fmt in formats:
                result_formats.append({
                    "format_id": fmt.get("format_id", ""),
                    "format": fmt.get("format", ""),
                    "ext": fmt.get("ext", ""),
                    "resolution": fmt.get("resolution", ""),
                    "width": fmt.get("width"),
                    "height": fmt.get("height"),
                    "fps": fmt.get("fps"),
                    "vcodec": fmt.get("vcodec", "none"),
                    "acodec": fmt.get("acodec", "none"),
                    "filesize": fmt.get("filesize"),
                    "tbr": fmt.get("tbr"),
                })
debug(f"Found {len(result_formats)} available formats")
return result_formats
except Exception as e:
log(f"✗ Error fetching formats: {e}", file=sys.stderr)
return None
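
# Usage sketch for list_formats (illustrative; actual formats depend on the site):
#
#     fmts = list_formats("https://www.youtube.com/watch?v=abc123xyz") or []
#     for f in fmts:
#         print(f["format_id"], f["ext"], f["resolution"], f["tbr"])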
def _download_with_sections_via_cli(
    url: str,
    ytdl_options: Dict[str, Any],
    sections: List[str],
    quiet: bool = False,
) -> tuple[Optional[str], Dict[str, Any]]:
    """Download each section separately so merge-file can combine them.

yt-dlp with multiple --download-sections args merges them into one file.
We need separate files for merge-file, so download each section individually.
Uses hash-based filenames for sections (not title-based) to prevent yt-dlp from
thinking sections are already downloaded. The title is extracted and stored in tags.
Returns:
(session_id, first_section_info_dict) - session_id for finding files, info dict for metadata extraction
"""
sections_list = ytdl_options.get("download_sections", [])
if not sections_list:
return "", {}
# Generate a unique hash-based ID for this download session
# This ensures different videos/downloads don't have filename collisions
    session_id = hashlib.md5(
        (url + str(time.time()) + "".join(random.choices(string.ascii_letters, k=10))).encode()
    ).hexdigest()[:12]
first_section_info = None
title_from_first = None
# Download each section separately with unique output template using session ID
for section_idx, section in enumerate(sections_list, 1):
# Build unique output template for this section using session-based filename
# e.g., "{session_id}_{section_idx}.ext" - simple and unique per section
base_outtmpl = ytdl_options.get("outtmpl", "%(title)s.%(ext)s")
output_dir_path = Path(base_outtmpl).parent
# Use session_id + section index for temp filename
# e.g., "/path/{session_id}_1.%(ext)s"
filename_tmpl = f"{session_id}_{section_idx}"
if base_outtmpl.endswith(".%(ext)s"):
filename_tmpl += ".%(ext)s"
# Use Path to handle separators correctly for the OS
section_outtmpl = str(output_dir_path / filename_tmpl)
# For the first section, extract metadata first (separate call)
if section_idx == 1:
metadata_cmd = ["yt-dlp", "--dump-json", "--skip-download"]
if ytdl_options.get("cookiefile"):
cookies_path = ytdl_options["cookiefile"].replace("\\", "/")
metadata_cmd.extend(["--cookies", cookies_path])
if ytdl_options.get("noplaylist"):
metadata_cmd.append("--no-playlist")
metadata_cmd.append(url)
try:
meta_result = subprocess.run(
metadata_cmd,
capture_output=True,
text=True
)
if meta_result.returncode == 0 and meta_result.stdout:
try:
info_dict = json.loads(meta_result.stdout.strip())
first_section_info = info_dict
title_from_first = info_dict.get("title")
if not quiet:
debug(f"Extracted title from metadata: {title_from_first}")
except json.JSONDecodeError:
if not quiet:
debug("Could not parse JSON metadata")
except Exception as e:
if not quiet:
debug(f"Error extracting metadata: {e}")
# Build yt-dlp command for downloading this section
cmd = ["yt-dlp"]
# Add format
if ytdl_options.get("format"):
cmd.extend(["-f", ytdl_options["format"]])
# Add ONLY this section (not all sections)
cmd.extend(["--download-sections", section])
# Add force-keyframes-at-cuts if specified
if ytdl_options.get("force_keyframes_at_cuts"):
cmd.append("--force-keyframes-at-cuts")
# Add output template for this section
cmd.extend(["-o", section_outtmpl])
# Add cookies file if present
if ytdl_options.get("cookiefile"):
# Convert backslashes to forward slashes for better compatibility
cookies_path = ytdl_options["cookiefile"].replace("\\", "/")
cmd.extend(["--cookies", cookies_path])
# Add no-playlist if specified
if ytdl_options.get("noplaylist"):
cmd.append("--no-playlist")
# Add the URL
cmd.append(url)
if not quiet:
debug(
f"Running yt-dlp for section {section_idx}/{len(sections_list)}: {section}"
)
debug(f"Command: {' '.join(cmd)}")
# Run the subprocess - don't capture output so progress is shown
try:
result = subprocess.run(cmd)
if result.returncode != 0:
raise DownloadError(
f"yt-dlp subprocess failed for section {section_idx} with code {result.returncode}"
)
except Exception as exc:
raise DownloadError(
f"yt-dlp subprocess error for section {section_idx}: {exc}"
) from exc
return session_id, first_section_info or {}
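
# Illustrative on-disk result: with session_id "ab12cd34ef56" and two sections,
# the loop above leaves files such as
#     {output_dir}/ab12cd34ef56_1.mp4
#     {output_dir}/ab12cd34ef56_2.mp4
# which a downstream merge step can locate by globbing on the returned session_id.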
def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]:
"""Build yt-dlp download options."""
ensure_directory(opts.output_dir)
    # Build the output template.
    # When downloading sections, _download_with_sections_via_cli replaces this with
    # per-section "{session_id}_{section_idx}.%(ext)s" filenames.
outtmpl = str((opts.output_dir / "%(title)s.%(ext)s").resolve())
    base_options: Dict[str, Any] = {
"outtmpl": outtmpl,
"quiet": True,
"no_warnings": True,
"noprogress": True,
"socket_timeout": 30,
"retries": 10,
"fragment_retries": 10,
"http_chunk_size": 10_485_760,
"restrictfilenames": True,
"progress_hooks": [] if opts.quiet else [_progress_callback],
}
if opts.cookies_path and opts.cookies_path.is_file():
base_options["cookiefile"] = str(opts.cookies_path)
else:
        # Fall back to browser cookies
        base_options["cookiesfrombrowser"] = ("chrome",)
# Add no-playlist option if specified (for single video from playlist url)
if opts.no_playlist:
base_options["noplaylist"] = True
# Configure based on mode
if opts.mode == "audio":
base_options["format"] = opts.ytdl_format or "251/140/bestaudio"
        base_options["postprocessors"] = [{"key": "FFmpegExtractAudio"}]
else: # video
base_options["format"] = opts.ytdl_format or "bestvideo+bestaudio/best"
        base_options["format_sort"] = [
            "res:4320",
            "res:2880",
            "res:2160",
            "res:1440",
            "res:1080",
            "res:720",
            "res",
        ]
# Add clip sections if provided (yt-dlp will download only these sections)
if opts.clip_sections:
# Parse section ranges like "48-65,120-152,196-205" (seconds)
# and convert to yt-dlp format: "*HH:MM:SS-HH:MM:SS,*HH:MM:SS-HH:MM:SS"
sections = []
for section_range in opts.clip_sections.split(","):
try:
start_str, end_str = section_range.strip().split("-")
start_sec = float(start_str)
end_sec = float(end_str)
# Convert seconds to HH:MM:SS format
def sec_to_hhmmss(seconds):
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
secs = int(seconds % 60)
return f"{hours:02d}:{minutes:02d}:{secs:02d}"
start_time = sec_to_hhmmss(start_sec)
end_time = sec_to_hhmmss(end_sec)
sections.append(f"*{start_time}-{end_time}")
except (ValueError, AttributeError):
pass
if sections:
# Pass each section as a separate element in the list (yt-dlp expects multiple --download-sections args)
base_options["download_sections"] = sections
debug(f"Download sections configured: {', '.join(sections)}")
# Note: Not using --force-keyframes-at-cuts to avoid re-encoding
# This may result in less precise cuts but faster downloads
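            # Worked example (assuming opts.clip_sections == "48-65,120-152"):
            #     "48-65"   -> "*00:00:48-00:01:05"
            #     "120-152" -> "*00:02:00-00:02:32"
            # i.e. download_sections == ["*00:00:48-00:01:05", "*00:02:00-00:02:32"].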
# Add playlist items selection if provided
if opts.playlist_items:
base_options["playlist_items"] = opts.playlist_items
if not opts.quiet:
debug(f"yt-dlp: mode={opts.mode}, format={base_options.get('format')}")
return base_options
def _iter_download_entries(info: Dict[str, Any]) -> Iterator[Dict[str, Any]]:
"""Iterate through download entries, handling playlists."""
queue: List[Dict[str, Any]] = [info]
seen: set[int] = set()
while queue:
current = queue.pop(0)
obj_id = id(current)
if obj_id in seen:
continue
seen.add(obj_id)
entries = current.get("entries")
if isinstance(entries, list):
for entry in entries:
if isinstance(entry, dict):
queue.append(entry)
if current.get("requested_downloads") or not entries:
yield current
def _candidate_paths(entry: Dict[str, Any], output_dir: Path) -> Iterator[Path]:
"""Get candidate file paths for downloaded media."""
requested = entry.get("requested_downloads")
if isinstance(requested, list):
for item in requested:
if isinstance(item, dict):
for key in ("filepath", "_filename", "filename"):
value = item.get(key)
if value:
yield Path(value)
for key in ("filepath", "_filename", "filename"):
value = entry.get(key)
if value:
yield Path(value)
if entry.get("filename"):
yield output_dir / entry["filename"]
def _resolve_entry_and_path(
    info: Dict[str, Any],
    output_dir: Path,
) -> tuple[Dict[str, Any], Path]:
"""Find downloaded file in yt-dlp metadata."""
for entry in _iter_download_entries(info):
for candidate in _candidate_paths(entry, output_dir):
if candidate.is_file():
return entry, candidate
if not candidate.is_absolute():
resolved = output_dir / candidate
if resolved.is_file():
return entry, resolved
raise FileNotFoundError("yt-dlp did not report a downloaded media file")
def _extract_sha256(info: Dict[str, Any]) -> Optional[str]:
"""Extract SHA256 hash from yt-dlp metadata."""
    for payload in [info] + (info.get("entries") or []):
if not isinstance(payload, dict):
continue
hashes = payload.get("hashes")
if isinstance(hashes, dict):
for key in ("sha256", "sha-256", "sha_256"):
value = hashes.get(key)
if isinstance(value, str) and value.strip():
return value.strip().lower()
for key in ("sha256", "sha-256", "sha_256"):
value = payload.get(key)
if isinstance(value, str) and value.strip():
return value.strip().lower()
return None
def _get_libgen_download_url(libgen_url: str) -> Optional[str]:
"""Extract the actual download link from LibGen redirect URL.

    LibGen URLs like https://libgen.gl/file.php?id=123456 redirect to an
    actual mirror URL. This follows the redirect chain to get the real file.

Args:
libgen_url: LibGen file.php URL
Returns:
Actual download URL or None if extraction fails
"""
try:
import requests
from urllib.parse import urlparse
# Check if this is a LibGen URL
parsed = urlparse(libgen_url)
if "libgen" not in parsed.netloc.lower():
return None
if "/file.php" not in parsed.path.lower():
return None
# LibGen redirects to actual mirrors, follow redirects to get final URL
session = requests.Session()
        session.headers.update(
            {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
        )
debug(f"Following LibGen redirect chain for: {libgen_url}")
# First, get the page and look for direct download link
try:
response = session.get(libgen_url, timeout=10, allow_redirects=True)
final_url = response.url
# Try to find actual download link in the page
try:
try:
from lxml import html as lxml_html
except ImportError:
lxml_html = None
if lxml_html is not None:
doc = lxml_html.fromstring(response.content)
for a in doc.xpath("//a[@href]"):
href = str(a.get("href") or "").strip()
if not href:
continue
href_lower = href.lower()
                        if "get.php" in href_lower or href_lower.endswith((".pdf", ".epub", ".djvu", ".mobi")):
                            download_url = href if href.startswith("http") else urljoin(final_url, href)
debug(f"Found download link: {download_url}")
return download_url
else:
# Regex fallback
for m in re.finditer(
r"href=[\"\']([^\"\']+)[\"\']",
response.text or "",
flags=re.IGNORECASE,
):
href = str(m.group(1) or "").strip()
if not href or href.lower().startswith("javascript:"):
continue
href_lower = href.lower()
                        if "get.php" in href_lower or href_lower.endswith((".pdf", ".epub", ".djvu", ".mobi")):
                            download_url = href if href.startswith("http") else urljoin(final_url, href)
debug(f"Found download link: {download_url}")
return download_url
except Exception:
pass
# If we followed redirects successfully, return the final URL
# This handles cases where libgen redirects to a direct download mirror
if final_url != libgen_url:
debug(f"LibGen resolved to mirror: {final_url}")
return final_url
except requests.RequestException as e:
log(f"Error following LibGen redirects: {e}", file=sys.stderr)
# Try head request as fallback
try:
response = session.head(libgen_url, allow_redirects=True, timeout=10)
if response.url != libgen_url:
debug(f"LibGen HEAD resolved to: {response.url}")
return response.url
            except Exception:
pass
return None
except Exception as e:
log(f"Error resolving LibGen URL: {e}", file=sys.stderr)
return None
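
# Usage sketch (illustrative ID; the resolved mirror depends on LibGen's rotation):
#
#     direct = _get_libgen_download_url("https://libgen.gl/file.php?id=123456")
#     if direct:
#         result = _download_direct_file(direct, Path("downloads"))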
def _download_direct_file(
url: str,
output_dir: Path,
debug_logger: Optional[DebugLogger] = None,
quiet: bool = False,
suggested_filename: Optional[str] = None,
pipeline_progress: Optional[Any] = None,
) -> DownloadMediaResult:
"""Download a direct file (PDF, image, document, etc.) without yt-dlp."""
ensure_directory(output_dir)
    from urllib.parse import parse_qs, unquote, urlparse
def _sanitize_filename(name: str) -> str:
# Windows-safe filename sanitization.
# Keep it simple: strip path parts, drop invalid chars, collapse whitespace.
text = str(name or "").strip()
if not text:
return ""
# Remove any path components
text = text.replace("/", "\\")
text = text.split("\\")[-1]
invalid = set('<>:"/\\|?*')
cleaned_chars: List[str] = []
for ch in text:
o = ord(ch)
if o < 32:
cleaned_chars.append(" ")
continue
if ch in invalid:
cleaned_chars.append(" ")
continue
cleaned_chars.append(ch)
cleaned = " ".join("".join(cleaned_chars).split()).strip()
# Avoid trailing dots/spaces on Windows
cleaned = cleaned.rstrip(" .")
return cleaned
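    # Illustrative: _sanitize_filename("notes/Final Report.pdf") -> "Final Report.pdf"
    #               _sanitize_filename('A "quoted" name. ')      -> "A quoted name"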
def _unique_path(path: Path) -> Path:
if not path.exists():
return path
stem = path.stem
suffix = path.suffix
parent = path.parent
for i in range(1, 10_000):
candidate = parent / f"{stem} ({i}){suffix}"
if not candidate.exists():
return candidate
return parent / f"{stem} ({int(time.time())}){suffix}"
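    # Illustrative: if "out/report.pdf" already exists, _unique_path yields
    # "out/report (1).pdf", then "out/report (2).pdf", and so on.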
# Extract filename from URL
parsed_url = urlparse(url)
url_path = parsed_url.path
# Try to get filename from query parameters first (for LibGen and similar services)
# e.g., ?filename=Book+Title.pdf or &download=filename.pdf
filename = None
if parsed_url.query:
query_params = parse_qs(parsed_url.query)
        for param_name in ("filename", "download", "file", "name"):
if param_name in query_params and query_params[param_name]:
filename = query_params[param_name][0]
filename = unquote(filename)
break
# If not found in query params, extract from URL path
if not filename or not filename.strip():
filename = url_path.split("/")[-1] if url_path else ""
filename = unquote(filename)
# Remove query strings from filename if any
if "?" in filename:
filename = filename.split("?")[0]
# Try to get real filename from Content-Disposition header (HEAD request)
content_type = ""
try:
with HTTPClient(timeout=10.0) as client:
response = client._request("HEAD", url, follow_redirects=True)
content_disposition = response.headers.get("content-disposition", "")
            try:
                content_type = str(response.headers.get("content-type", "") or "").strip().lower()
            except Exception:
                content_type = ""
if content_disposition:
# Extract filename from Content-Disposition header
# Format: attachment; filename="filename.pdf" or filename=filename.pdf
match = re.search(
r'filename\*?=(?:"([^"]*)"|([^;\s]*))',
content_disposition
)
if match:
extracted_name = match.group(1) or match.group(2)
if extracted_name:
filename = unquote(extracted_name)
if not quiet:
debug(f"Filename from Content-Disposition: {filename}")
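                        # e.g. 'attachment; filename="My Paper.pdf"' -> "My Paper.pdf".
                        # Note: this simple regex does not decode RFC 5987
                        # "filename*=" values; any charset prefix is kept as-is.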
except Exception as e:
if not quiet:
log(f"Could not get filename from headers: {e}", file=sys.stderr)
# Guardrail: never treat HTML landing pages as downloadable files.
# We explicitly probe with GET for page-like endpoints (e.g. *.php) since some
# servers block/lie on HEAD, and a URL path like `edition.php` would otherwise
# be saved as a bogus file.
try:
        page_like_exts = {".php", ".asp", ".aspx", ".jsp", ".cgi"}
ext = ""
try:
ext = Path(str(filename or "")).suffix.lower()
except Exception:
ext = ""
ct0 = (content_type or "").split(";", 1)[0].strip().lower()
must_probe = bool(ct0.startswith("text/html") or ext in page_like_exts)
if must_probe:
with HTTPClient(timeout=10.0) as client:
with client._request_stream("GET", url, follow_redirects=True) as resp:
resp.raise_for_status()
                    ct = str(resp.headers.get("content-type", "") or "").split(";", 1)[0].strip().lower()
if ct.startswith("text/html"):
raise DownloadError(
"URL appears to be an HTML page, not a direct file"
)
except DownloadError:
raise
except Exception:
# If we can't probe, keep going; later logic may still infer a safe extension.
pass
# Apply suggested filename (from provider title) if given.
suggested = _sanitize_filename(suggested_filename) if suggested_filename else ""
if suggested:
# Preserve extension from suggested name if present; otherwise borrow from detected filename.
suggested_path = Path(suggested)
if suggested_path.suffix:
filename = suggested
else:
detected_ext = ""
try:
detected_ext = Path(str(filename)).suffix
except Exception:
detected_ext = ""
if detected_ext:
filename = suggested + detected_ext
else:
filename = suggested
# If we still don't have an extension, try to infer one from Content-Type.
# Never fall back to a generic `.bin` extension.
try:
has_ext = bool(filename and Path(str(filename)).suffix)
except Exception:
has_ext = False
if filename and (not has_ext):
ct = (content_type or "").split(";")[0].strip().lower()
ext_by_ct = {
"application/pdf": ".pdf",
"application/epub+zip": ".epub",
"application/x-mobipocket-ebook": ".mobi",
"image/jpeg": ".jpg",
"image/png": ".png",
"image/webp": ".webp",
"image/gif": ".gif",
"text/plain": ".txt",
"application/zip": ".zip",
}
if ct in ext_by_ct:
filename = f"{filename}{ext_by_ct[ct]}"
elif ct.startswith("text/html"):
# Guardrail: HTML landing pages should not be downloaded as opaque files.
raise DownloadError("URL appears to be an HTML page, not a direct file")
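    # Illustrative: a response with Content-Type "application/pdf; charset=binary"
    # and a bare filename "SomeTitle" becomes "SomeTitle.pdf" here.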
# Final guardrail: if filename is empty, refuse rather than inventing `download.bin`.
if not filename or not str(filename).strip():
raise DownloadError(
"Could not determine filename for URL (no Content-Disposition and no path filename)"
)
file_path = _unique_path(output_dir / filename)
# Prefer pipeline transfer bars when a Live UI is active.
use_pipeline_transfer = False
try:
        if pipeline_progress is not None and hasattr(pipeline_progress, "update_transfer"):
ui = None
if hasattr(pipeline_progress, "ui_and_pipe_index"):
ui, _ = pipeline_progress.ui_and_pipe_index() # type: ignore[attr-defined]
use_pipeline_transfer = ui is not None
except Exception:
use_pipeline_transfer = False
progress_bar: Optional[ProgressBar] = None
if (not quiet) and (not use_pipeline_transfer):
progress_bar = ProgressBar()
if not quiet:
debug(f"Direct download: {filename}")
try:
start_time = time.time()
downloaded_bytes = [0]
total_bytes = [0]
last_progress_time = [start_time]
rendered_once = [False]
transfer_started = [False]
def _maybe_begin_transfer(content_length: int) -> None:
if pipeline_progress is None:
return
if transfer_started[0]:
return
try:
                total_val: Optional[int] = (
                    int(content_length)
                    if isinstance(content_length, int) and content_length > 0
                    else None
                )
except Exception:
total_val = None
try:
if hasattr(pipeline_progress, "begin_transfer"):
                    pipeline_progress.begin_transfer(label=str(filename or "download"), total=total_val)
transfer_started[0] = True
except Exception:
return
def progress_callback(bytes_downloaded: int, content_length: int) -> None:
downloaded_bytes[0] = bytes_downloaded
total_bytes[0] = content_length
# Update pipeline transfer bar when present.
try:
                if pipeline_progress is not None and hasattr(pipeline_progress, "update_transfer"):
                    _maybe_begin_transfer(content_length)
                    total_val: Optional[int] = (
                        int(content_length)
                        if isinstance(content_length, int) and content_length > 0
                        else None
                    )
                    pipeline_progress.update_transfer(
                        label=str(filename or "download"),
                        completed=int(bytes_downloaded) if bytes_downloaded is not None else None,
                        total=total_val,
                    )
except Exception:
pass
now = time.time()
            is_final = bool(content_length > 0 and bytes_downloaded >= content_length)
            # Throttle redraws to twice a second, but always render the first
            # and final updates.
            if rendered_once[0] and not is_final and now - last_progress_time[0] < 0.5:
                return
            elapsed = now - start_time
            percent = (bytes_downloaded / content_length) * 100 if content_length > 0 else 0
speed = bytes_downloaded / elapsed if elapsed > 0 else 0
eta_str: Optional[str] = None
if content_length > 0 and speed > 0:
try:
eta_seconds = max(
0.0,
float(content_length - bytes_downloaded) / float(speed)
)
minutes, seconds = divmod(int(eta_seconds), 60)
hours, minutes = divmod(minutes, 60)
eta_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
except Exception:
eta_str = None
if progress_bar is not None:
progress_bar.update(
downloaded=bytes_downloaded,
total=content_length if content_length > 0 else None,
label=str(filename or "download"),
file=sys.stderr,
)
rendered_once[0] = True
last_progress_time[0] = now
with HTTPClient(timeout=30.0) as client:
client.download(url, str(file_path), progress_callback=progress_callback)
elapsed = time.time() - start_time
try:
if progress_bar is not None:
progress_bar.finish()
except Exception:
pass
try:
            if (pipeline_progress is not None and transfer_started[0]
                    and hasattr(pipeline_progress, "finish_transfer")):
pipeline_progress.finish_transfer(label=str(filename or "download"))
except Exception:
pass
try:
if progress_bar is not None:
                avg_speed = downloaded_bytes[0] / elapsed if elapsed > 0 else 0
                avg_speed_str = progress_bar.format_bytes(avg_speed) + "/s"
else:
avg_speed_str = f"{(downloaded_bytes[0] / elapsed if elapsed > 0 else 0):.1f} B/s"
except Exception:
avg_speed_str = ""
if not quiet:
debug(f"✓ Downloaded in {elapsed:.1f}s at {avg_speed_str}")
# For direct file downloads, create minimal info dict without filename as title
# This prevents creating duplicate title: tags when filename gets auto-generated
# We'll add title back later only if we couldn't extract meaningful tags
ext = ""
try:
ext = Path(str(filename)).suffix.lstrip(".")
except Exception:
ext = ""
        info = {
            "id": str(filename).rsplit(".", 1)[0] if "." in str(filename) else str(filename),
            "ext": ext,
"webpage_url": url,
}
hash_value = None
try:
hash_value = sha256_file(file_path)
except Exception:
pass
tags = []
if extract_ytdlp_tags:
try:
tags = extract_ytdlp_tags(info)
except Exception as e:
log(f"Error extracting tags: {e}", file=sys.stderr)
# Only use filename as a title tag if we couldn't extract any meaningful tags
# This prevents duplicate title: tags when the filename could be mistaken for metadata
        if not any(t.startswith("title:") for t in tags):
            # Re-extract tags with filename as title only if needed
            info["title"] = filename
tags = []
if extract_ytdlp_tags:
try:
tags = extract_ytdlp_tags(info)
except Exception as e:
log(f"Error extracting tags with filename: {e}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"direct-file-downloaded",
{
"url": url,
"path": str(file_path),
"hash": hash_value
},
)
return DownloadMediaResult(
path=file_path,
info=info,
tag=tags,
source_url=url,
hash_value=hash_value,
)
except (httpx.HTTPError, httpx.RequestError) as exc:
try:
if progress_bar is not None:
progress_bar.finish()
except Exception:
pass
try:
            if (pipeline_progress is not None and transfer_started[0]
                    and hasattr(pipeline_progress, "finish_transfer")):
                pipeline_progress.finish_transfer(label=str(filename or "download"))
except Exception:
pass
log(f"Download error: {exc}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"exception",
{
"phase": "direct-file",
"url": url,
"error": str(exc)
},
)
raise DownloadError(f"Failed to download {url}: {exc}") from exc
except Exception as exc:
try:
if progress_bar is not None:
progress_bar.finish()
except Exception:
pass
try:
            if (pipeline_progress is not None and transfer_started[0]
                    and hasattr(pipeline_progress, "finish_transfer")):
                pipeline_progress.finish_transfer(label=str(filename or "download"))
except Exception:
pass
log(f"Error downloading file: {exc}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"exception",
{
"phase": "direct-file",
"url": url,
"error": str(exc),
"traceback": traceback.format_exc(),
},
)
raise DownloadError(f"Error downloading file: {exc}") from exc
def probe_url(
    url: str,
    no_playlist: bool = False,
    timeout_seconds: int = 15,
) -> Optional[Dict[str, Any]]:
    """Probe URL to extract metadata WITHOUT downloading.

Args:
url: URL to probe
no_playlist: If True, ignore playlists and probe only the single video
timeout_seconds: Max seconds to wait for probe (default 15s)
Returns:
Dict with keys: extractor, title, entries (if playlist), duration, etc.
Returns None if not supported by yt-dlp or on timeout.
"""
if not is_url_supported_by_ytdlp(url):
return None
# Wrap probe in timeout to prevent hanging on large playlists
import threading
from typing import cast
result_container: List[Optional[Any]] = [None, None] # [result, error]
def _do_probe() -> None:
try:
_ensure_yt_dlp_ready()
assert yt_dlp is not None
# Extract info without downloading
# Use extract_flat='in_playlist' to get full metadata for playlist items
ydl_opts = {
"quiet": True, # Suppress all output
"no_warnings": True,
"socket_timeout": 10,
"retries": 2, # Reduce retries for faster timeout
"skip_download": True, # Don't actually download
"extract_flat": "in_playlist", # Get playlist with metadata for each entry
"noprogress": True, # No progress bars
}
# Cookies are optional for probing; callers should pass cookiefile via DownloadOptions when needed.
# Add no_playlist option if specified
if no_playlist:
ydl_opts["noplaylist"] = True
with yt_dlp.YoutubeDL(ydl_opts) as ydl: # type: ignore[arg-type]
info = ydl.extract_info(url, download=False)
if not isinstance(info, dict):
result_container[0] = None
return
# Extract relevant fields
result_container[0] = {
"extractor": info.get("extractor", ""),
"title": info.get("title", ""),
"entries": info.get("entries", []), # Will be populated if playlist
"duration": info.get("duration"),
"uploader": info.get("uploader"),
"description": info.get("description"),
"url": url,
}
except Exception as exc:
log(f"Probe error for {url}: {exc}")
result_container[1] = exc
    # Daemon thread so a probe that hangs past the timeout cannot block
    # interpreter shutdown.
    thread = threading.Thread(target=_do_probe, daemon=True)
thread.start()
thread.join(timeout=timeout_seconds)
if thread.is_alive():
# Probe timed out - return None to fall back to direct download
debug(
f"Probe timeout for {url} (>={timeout_seconds}s), proceeding with download"
)
return None
if result_container[1] is not None:
# Probe error - return None to proceed anyway
return None
return cast(Optional[Dict[str, Any]], result_container[0])
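
# Usage sketch (illustrative URL):
#
#     info = probe_url("https://www.youtube.com/playlist?list=PLxyz", timeout_seconds=10)
#     if info is None:
#         ...  # unsupported, probe error, or timeout: fall back to direct download
#     elif info["entries"]:
#         ...  # playlist: per-entry metadata is available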
__all__ = [
"is_url_supported_by_ytdlp",
"list_formats",
"probe_url",
"DownloadError",
"DownloadOptions",
"DownloadMediaResult",
]
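

# Minimal manual smoke test. A sketch only: it assumes this module can be run
# as a script in your environment; it is not part of the module API.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Probe a URL with the helpers above")
    parser.add_argument("url")
    args = parser.parse_args()

    print("yt-dlp supported:", is_url_supported_by_ytdlp(args.url))
    probed = probe_url(args.url)
    if probed is not None:
        print("title:", probed.get("title"))
        print("extractor:", probed.get("extractor"))
        fmts = list_formats(args.url) or []
        print(f"{len(fmts)} formats available")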