"""Download media files using yt-dlp with support for direct file downloads.

Lean, focused downloader without event infrastructure overhead.

- yt-dlp integration for streaming sites
- Direct file download fallback for PDFs, images, documents
- Tag extraction via metadata.extract_ytdlp_tags()
- Logging via SYS.logger.log()
"""

from __future__ import annotations

import glob  # noqa: F401
import hashlib
import json
import random
import re
import string
import subprocess
import sys
import time
import traceback
from pathlib import Path
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urljoin, urlparse

import httpx

from SYS.logger import log, debug
from SYS.utils import ensure_directory, sha256_file
from API.HTTP import HTTPClient
from models import DownloadError, DownloadOptions, DownloadMediaResult, DebugLogger, ProgressBar

try:
    import yt_dlp  # type: ignore
    from yt_dlp.extractor import gen_extractors  # type: ignore
except Exception as exc:
    yt_dlp = None  # type: ignore
    YTDLP_IMPORT_ERROR = exc
else:
    YTDLP_IMPORT_ERROR = None

try:
    from metadata import extract_ytdlp_tags
except ImportError:
    extract_ytdlp_tags = None

_EXTRACTOR_CACHE: List[Any] | None = None

# Shared progress bar reused across yt-dlp progress hook invocations.
_YTDLP_PROGRESS = ProgressBar()


def _ensure_yt_dlp_ready() -> None:
    """Verify yt-dlp is available, raise if not."""
    if yt_dlp is not None:
        return
    detail = str(YTDLP_IMPORT_ERROR or "yt-dlp is not installed")
    raise DownloadError(f"yt-dlp module not available: {detail}")


def _progress_callback(status: Dict[str, Any]) -> None:
    """Progress hook for yt-dlp that drives the shared ProgressBar."""
    event = status.get("status")
    if event == "downloading":
        downloaded = status.get("downloaded_bytes")
        total = status.get("total_bytes") or status.get("total_bytes_estimate")
        _YTDLP_PROGRESS.update(
            downloaded=int(downloaded or 0),
            total=int(total) if total else None,
            label="download",
            file=sys.stderr,
        )
    elif event == "finished":
        _YTDLP_PROGRESS.finish()
        debug(f"✓ Download finished: {status.get('filename')}")
    elif event in ("postprocessing", "processing"):
        debug(f"Post-processing: {status.get('postprocessor')}")


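# Shape sketch of the status dicts yt-dlp feeds to progress hooks (keys per
# yt-dlp's hook payload; the byte counts and filename below are made up):
#
#   _progress_callback({"status": "downloading",
#                       "downloaded_bytes": 1_048_576, "total_bytes": 4_194_304})
#   _progress_callback({"status": "finished", "filename": "clip.mp4"})

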
def is_url_supported_by_ytdlp(url: str) -> bool:
    """Check if URL is supported by a dedicated (non-generic) yt-dlp extractor."""
    if yt_dlp is None:
        return False
    global _EXTRACTOR_CACHE
    if _EXTRACTOR_CACHE is None:
        try:
            _EXTRACTOR_CACHE = list(gen_extractors())  # type: ignore[arg-type]
        except Exception:
            _EXTRACTOR_CACHE = []
    for extractor in _EXTRACTOR_CACHE:
        try:
            if not extractor.suitable(url):
                continue
        except Exception:
            continue
        name = getattr(extractor, "IE_NAME", "")
        if name.lower() == "generic":
            continue
        return True
    return False


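# Usage sketch (placeholder URL): gate the yt-dlp path on extractor support
# and fall back to the direct-file path otherwise.
#
#   if is_url_supported_by_ytdlp(url):
#       result = ...  # yt-dlp download path
#   else:
#       result = _download_direct_file(url, output_dir)

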
def list_formats(url: str, no_playlist: bool = False, playlist_items: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
    """Get list of available formats for a URL using yt-dlp."""
    _ensure_yt_dlp_ready()

    try:
        ydl_opts = {
            "quiet": True,
            "no_warnings": True,
            "socket_timeout": 30,
        }

        if no_playlist:
            ydl_opts["noplaylist"] = True

        if playlist_items:
            ydl_opts["playlist_items"] = playlist_items

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            debug(f"Fetching format list for: {url}")
            info = ydl.extract_info(url, download=False)

            formats = info.get("formats", [])
            if not formats:
                log("No formats available", file=sys.stderr)
                return None

            result_formats = []
            for fmt in formats:
                result_formats.append({
                    "format_id": fmt.get("format_id", ""),
                    "format": fmt.get("format", ""),
                    "ext": fmt.get("ext", ""),
                    "resolution": fmt.get("resolution", ""),
                    "width": fmt.get("width"),
                    "height": fmt.get("height"),
                    "fps": fmt.get("fps"),
                    "vcodec": fmt.get("vcodec", "none"),
                    "acodec": fmt.get("acodec", "none"),
                    "filesize": fmt.get("filesize"),
                    "tbr": fmt.get("tbr"),
                })

            debug(f"Found {len(result_formats)} available formats")
            return result_formats

    except Exception as e:
        log(f"✗ Error fetching formats: {e}", file=sys.stderr)
        return None


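# Usage sketch (placeholder URL): print a quick format table for selection.
#
#   for f in list_formats("https://example.com/video", no_playlist=True) or []:
#       print(f'{f["format_id"]:>8}  {f["ext"]:<5}  {f["resolution"]:<10}  tbr={f["tbr"]}')

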
def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sections: List[str], quiet: bool = False) -> tuple[str, Dict[str, Any]]:
    """Download each section separately so merge-file can combine them.

    yt-dlp with multiple --download-sections args merges them into one file.
    We need separate files for merge-file, so download each section individually.

    Uses hash-based filenames for sections (not title-based) to prevent yt-dlp from
    thinking sections are already downloaded. The title is extracted and stored in tags.

    Returns:
        (session_id, first_section_info_dict) - session_id for finding files, info dict for metadata extraction
    """
    sections_list = sections or ytdl_options.get("download_sections", [])
    if not sections_list:
        return "", {}

    # Generate a unique hash-based ID for this download session.
    # This ensures different videos/downloads don't have filename collisions.
    session_id = hashlib.md5(
        (url + str(time.time()) + ''.join(random.choices(string.ascii_letters, k=10))).encode()
    ).hexdigest()[:12]

    first_section_info = None
    title_from_first = None

    # Download each section separately with a unique output template using the session ID.
    for section_idx, section in enumerate(sections_list, 1):
        # Build a unique output template for this section using a session-based filename,
        # e.g. "{session_id}_{section_idx}.ext" - simple and unique per section.
        base_outtmpl = ytdl_options.get("outtmpl", "%(title)s.%(ext)s")
        output_dir_path = Path(base_outtmpl).parent

        # Use session_id + section index for the temp filename,
        # e.g. "/path/{session_id}_1.%(ext)s".
        filename_tmpl = f"{session_id}_{section_idx}"
        if base_outtmpl.endswith(".%(ext)s"):
            filename_tmpl += ".%(ext)s"

        # Use Path to handle separators correctly for the OS.
        section_outtmpl = str(output_dir_path / filename_tmpl)

        # For the first section, extract metadata first (separate call).
        if section_idx == 1:
            metadata_cmd = ["yt-dlp", "--dump-json", "--skip-download"]
            if ytdl_options.get("cookiefile"):
                cookies_path = ytdl_options["cookiefile"].replace("\\", "/")
                metadata_cmd.extend(["--cookies", cookies_path])
            if ytdl_options.get("noplaylist"):
                metadata_cmd.append("--no-playlist")
            metadata_cmd.append(url)

            try:
                meta_result = subprocess.run(metadata_cmd, capture_output=True, text=True)
                if meta_result.returncode == 0 and meta_result.stdout:
                    try:
                        info_dict = json.loads(meta_result.stdout.strip())
                        first_section_info = info_dict
                        title_from_first = info_dict.get('title')
                        if not quiet:
                            debug(f"Extracted title from metadata: {title_from_first}")
                    except json.JSONDecodeError:
                        if not quiet:
                            debug("Could not parse JSON metadata")
            except Exception as e:
                if not quiet:
                    debug(f"Error extracting metadata: {e}")

        # Build the yt-dlp command for downloading this section.
        cmd = ["yt-dlp"]

        # Add format
        if ytdl_options.get("format"):
            cmd.extend(["-f", ytdl_options["format"]])

        # Add ONLY this section (not all sections)
        cmd.extend(["--download-sections", section])

        # Add force-keyframes-at-cuts if specified
        if ytdl_options.get("force_keyframes_at_cuts"):
            cmd.append("--force-keyframes-at-cuts")

        # Add output template for this section
        cmd.extend(["-o", section_outtmpl])

        # Add cookies file if present
        if ytdl_options.get("cookiefile"):
            # Convert backslashes to forward slashes for better compatibility
            cookies_path = ytdl_options["cookiefile"].replace("\\", "/")
            cmd.extend(["--cookies", cookies_path])

        # Add no-playlist if specified
        if ytdl_options.get("noplaylist"):
            cmd.append("--no-playlist")

        # Add the URL
        cmd.append(url)

        if not quiet:
            debug(f"Running yt-dlp for section {section_idx}/{len(sections_list)}: {section}")
            debug(f"Command: {' '.join(cmd)}")

        # Run the subprocess - don't capture output so progress is shown.
        try:
            result = subprocess.run(cmd)

            if result.returncode != 0:
                raise DownloadError(f"yt-dlp subprocess failed for section {section_idx} with code {result.returncode}")
        except DownloadError:
            raise
        except Exception as exc:
            raise DownloadError(f"yt-dlp subprocess error for section {section_idx}: {exc}") from exc

    return session_id, first_section_info or {}


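# Sketch of how a caller might gather the per-section files afterwards
# (hypothetical caller code; `output_dir` is illustrative, but the glob
# pattern follows the session-based template built above):
#
#   session_id, info = _download_with_sections_via_cli(url, ytdl_options, sections)
#   section_files = sorted(Path(output_dir).glob(f"{session_id}_*"))
#   # e.g. [".../3f9c2a1b7d0e_1.mp4", ".../3f9c2a1b7d0e_2.mp4"]

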
def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]:
    """Build yt-dlp download options."""
    ensure_directory(opts.output_dir)

    # Build the output template.
    # When downloading sections, _download_with_sections_via_cli replaces this
    # title-based template with session-based per-section filenames.
    outtmpl = str((opts.output_dir / "%(title)s.%(ext)s").resolve())

    base_options: Dict[str, Any] = {
        "outtmpl": outtmpl,
        "quiet": True,
        "no_warnings": True,
        "noprogress": True,
        "socket_timeout": 30,
        "retries": 10,
        "fragment_retries": 10,
        "http_chunk_size": 10_485_760,
        "restrictfilenames": True,
        "progress_hooks": [] if opts.quiet else [_progress_callback],
    }

    if opts.cookies_path and opts.cookies_path.is_file():
        base_options["cookiefile"] = str(opts.cookies_path)
    else:
        # Fall back to browser cookies
        base_options["cookiesfrombrowser"] = ("chrome",)

    # Add no-playlist option if specified (for a single video from a playlist URL)
    if opts.no_playlist:
        base_options["noplaylist"] = True

    # Configure based on mode
    if opts.mode == "audio":
        base_options["format"] = opts.ytdl_format or "251/140/bestaudio"
        base_options["postprocessors"] = [{"key": "FFmpegExtractAudio"}]
    else:  # video
        base_options["format"] = opts.ytdl_format or "bestvideo+bestaudio/best"
        base_options["format_sort"] = [
            "res:4320", "res:2880", "res:2160", "res:1440", "res:1080", "res:720", "res"
        ]

    # Add clip sections if provided (yt-dlp will download only these sections)
    if opts.clip_sections:
        # Parse section ranges like "48-65,120-152,196-205" (seconds)
        # and convert to yt-dlp format: "*HH:MM:SS-HH:MM:SS,*HH:MM:SS-HH:MM:SS"

        def sec_to_hhmmss(seconds: float) -> str:
            """Convert seconds to an HH:MM:SS timestamp."""
            hours = int(seconds // 3600)
            minutes = int((seconds % 3600) // 60)
            secs = int(seconds % 60)
            return f"{hours:02d}:{minutes:02d}:{secs:02d}"

        sections = []
        for section_range in opts.clip_sections.split(','):
            try:
                start_str, end_str = section_range.strip().split('-')
                start_sec = float(start_str)
                end_sec = float(end_str)

                start_time = sec_to_hhmmss(start_sec)
                end_time = sec_to_hhmmss(end_sec)
                sections.append(f"*{start_time}-{end_time}")
            except (ValueError, AttributeError):
                pass

        if sections:
            # Pass each section as a separate element in the list (yt-dlp expects multiple --download-sections args)
            base_options["download_sections"] = sections
            debug(f"Download sections configured: {', '.join(sections)}")
            # Note: Not using --force-keyframes-at-cuts to avoid re-encoding.
            # This may result in less precise cuts but faster downloads.

    # Add playlist items selection if provided
    if opts.playlist_items:
        base_options["playlist_items"] = opts.playlist_items

    if not opts.quiet:
        debug(f"yt-dlp: mode={opts.mode}, format={base_options.get('format')}")
    return base_options


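# Worked example of the clip-section conversion above:
#
#   opts.clip_sections = "48-65,120-152,196-205"
#   # -> download_sections == ["*00:00:48-00:01:05",
#   #                          "*00:02:00-00:02:32",
#   #                          "*00:03:16-00:03:25"]

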
def _iter_download_entries(info: Dict[str, Any]) -> Iterator[Dict[str, Any]]:
    """Iterate through download entries, handling playlists."""
    queue: List[Dict[str, Any]] = [info]
    seen: set[int] = set()
    while queue:
        current = queue.pop(0)
        obj_id = id(current)
        if obj_id in seen:
            continue
        seen.add(obj_id)
        entries = current.get("entries")
        if isinstance(entries, list):
            for entry in entries:
                if isinstance(entry, dict):
                    queue.append(entry)
        if current.get("requested_downloads") or not entries:
            yield current


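# Behavior sketch (minimal dicts; real yt-dlp info dicts carry many more keys):
# a playlist yields only its leaf entries, a single video yields itself.
#
#   playlist = {"entries": [{"id": "a"}, {"id": "b"}]}
#   [e["id"] for e in _iter_download_entries(playlist)]  # -> ["a", "b"]

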
def _candidate_paths(entry: Dict[str, Any], output_dir: Path) -> Iterator[Path]:
    """Get candidate file paths for downloaded media."""
    requested = entry.get("requested_downloads")
    if isinstance(requested, list):
        for item in requested:
            if isinstance(item, dict):
                for key in ("filepath", "_filename", "filename"):
                    value = item.get(key)
                    if value:
                        yield Path(value)
    for key in ("filepath", "_filename", "filename"):
        value = entry.get(key)
        if value:
            yield Path(value)
    if entry.get("filename"):
        yield output_dir / entry["filename"]


def _resolve_entry_and_path(info: Dict[str, Any], output_dir: Path) -> tuple[Dict[str, Any], Path]:
    """Find downloaded file in yt-dlp metadata."""
    for entry in _iter_download_entries(info):
        for candidate in _candidate_paths(entry, output_dir):
            if candidate.is_file():
                return entry, candidate
            if not candidate.is_absolute():
                resolved = output_dir / candidate
                if resolved.is_file():
                    return entry, resolved
    raise FileNotFoundError("yt-dlp did not report a downloaded media file")


def _extract_sha256(info: Dict[str, Any]) -> Optional[str]:
    """Extract a SHA256 hash from yt-dlp metadata, if one was reported."""
    # `entries` may be absent or explicitly None; guard before concatenating.
    for payload in [info] + (info.get("entries") or []):
        if not isinstance(payload, dict):
            continue
        hashes = payload.get("hashes")
        if isinstance(hashes, dict):
            for key in ("sha256", "sha-256", "sha_256"):
                value = hashes.get(key)
                if isinstance(value, str) and value.strip():
                    return value.strip().lower()
        for key in ("sha256", "sha-256", "sha_256"):
            value = payload.get(key)
            if isinstance(value, str) and value.strip():
                return value.strip().lower()
    return None


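# Accepted metadata shapes (illustrative values):
#
#   _extract_sha256({"hashes": {"sha256": "ABC123"}})       # -> "abc123"
#   _extract_sha256({"entries": [{"sha-256": "DEF456"}]})   # -> "def456"
#   _extract_sha256({"entries": None})                      # -> None

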
def _get_libgen_download_url(libgen_url: str) -> Optional[str]:
    """Extract the actual download link from a LibGen redirect URL.

    LibGen URLs like https://libgen.gl/file.php?id=123456 redirect to an
    actual mirror URL. This follows the redirect chain to get the real file.

    Args:
        libgen_url: LibGen file.php URL

    Returns:
        Actual download URL or None if extraction fails
    """
    try:
        import requests

        # Check if this is a LibGen URL
        parsed = urlparse(libgen_url)
        if 'libgen' not in parsed.netloc.lower():
            return None

        if '/file.php' not in parsed.path.lower():
            return None

        # LibGen redirects to actual mirrors; follow redirects to get the final URL
        session = requests.Session()
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        debug(f"Following LibGen redirect chain for: {libgen_url}")

        # First, get the page and look for a direct download link
        try:
            response = session.get(libgen_url, timeout=10, allow_redirects=True)
            final_url = response.url

            # Try to find the actual download link in the page
            try:
                try:
                    from lxml import html as lxml_html
                except ImportError:
                    lxml_html = None

                if lxml_html is not None:
                    doc = lxml_html.fromstring(response.content)
                    for a in doc.xpath("//a[@href]"):
                        href = str(a.get("href") or "").strip()
                        if not href:
                            continue

                        href_lower = href.lower()
                        if "get.php" in href_lower or href_lower.endswith((".pdf", ".epub", ".djvu", ".mobi")):
                            download_url = href if href.startswith("http") else urljoin(final_url, href)
                            debug(f"Found download link: {download_url}")
                            return download_url
                else:
                    # Regex fallback
                    for m in re.finditer(
                        r"href=[\"\']([^\"\']+)[\"\']",
                        response.text or "",
                        flags=re.IGNORECASE,
                    ):
                        href = str(m.group(1) or "").strip()
                        if not href or href.lower().startswith("javascript:"):
                            continue
                        href_lower = href.lower()
                        if "get.php" in href_lower or href_lower.endswith((".pdf", ".epub", ".djvu", ".mobi")):
                            download_url = href if href.startswith("http") else urljoin(final_url, href)
                            debug(f"Found download link: {download_url}")
                            return download_url
            except Exception:
                pass

            # If we followed redirects successfully, return the final URL.
            # This handles cases where LibGen redirects to a direct download mirror.
            if final_url != libgen_url:
                debug(f"LibGen resolved to mirror: {final_url}")
                return final_url

        except requests.RequestException as e:
            log(f"Error following LibGen redirects: {e}", file=sys.stderr)
            # Try a HEAD request as a fallback
            try:
                response = session.head(libgen_url, allow_redirects=True, timeout=10)
                if response.url != libgen_url:
                    debug(f"LibGen HEAD resolved to: {response.url}")
                    return response.url
            except Exception:
                pass

        return None

    except Exception as e:
        log(f"Error resolving LibGen URL: {e}", file=sys.stderr)
        return None


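# Usage sketch (placeholder id): resolve LibGen pages before the direct
# download; non-LibGen URLs come back as None and the original URL is kept.
#
#   resolved = _get_libgen_download_url("https://libgen.gl/file.php?id=123456")
#   target_url = resolved or url  # `url` being whatever the caller started with

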
def _download_direct_file(
    url: str,
    output_dir: Path,
    debug_logger: Optional[DebugLogger] = None,
    quiet: bool = False,
    suggested_filename: Optional[str] = None,
) -> DownloadMediaResult:
    """Download a direct file (PDF, image, document, etc.) without yt-dlp."""
    ensure_directory(output_dir)

    from urllib.parse import unquote, parse_qs

    def _sanitize_filename(name: str) -> str:
        # Windows-safe filename sanitization.
        # Keep it simple: strip path parts, drop invalid chars, collapse whitespace.
        text = str(name or "").strip()
        if not text:
            return ""
        # Remove any path components
        text = text.replace("/", "\\")
        text = text.split("\\")[-1]

        invalid = set('<>:"/\\|?*')
        cleaned_chars: List[str] = []
        for ch in text:
            o = ord(ch)
            if o < 32:
                cleaned_chars.append(" ")
                continue
            if ch in invalid:
                cleaned_chars.append(" ")
                continue
            cleaned_chars.append(ch)
        cleaned = " ".join("".join(cleaned_chars).split()).strip()
        # Avoid trailing dots/spaces on Windows
        cleaned = cleaned.rstrip(" .")
        return cleaned

    def _unique_path(path: Path) -> Path:
        if not path.exists():
            return path
        stem = path.stem
        suffix = path.suffix
        parent = path.parent
        for i in range(1, 10_000):
            candidate = parent / f"{stem} ({i}){suffix}"
            if not candidate.exists():
                return candidate
        return parent / f"{stem} ({int(time.time())}){suffix}"

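    # Behavior sketch for the helpers above (illustrative inputs):
    #
    #   _sanitize_filename('dir/My: "Book"?.pdf')  # -> 'My Book .pdf'
    #   _unique_path(Path("out/report.pdf"))       # -> out/report.pdf,
    #                                              #    or "report (1).pdf" if taken
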
    # Extract filename from URL
    parsed_url = urlparse(url)
    url_path = parsed_url.path

    # Try to get the filename from query parameters first (for LibGen and similar
    # services), e.g. ?filename=Book+Title.pdf or &download=filename.pdf
    filename = None
    if parsed_url.query:
        query_params = parse_qs(parsed_url.query)
        for param_name in ('filename', 'download', 'file', 'name'):
            if param_name in query_params and query_params[param_name]:
                filename = query_params[param_name][0]
                filename = unquote(filename)
                break

    # If not found in query params, extract from the URL path
    if not filename or not filename.strip():
        filename = url_path.split("/")[-1] if url_path else ""
        filename = unquote(filename)

    # Remove query strings from the filename if any
    if "?" in filename:
        filename = filename.split("?")[0]

    # Try to get the real filename from the Content-Disposition header (HEAD request)
    content_type = ""
    try:
        with HTTPClient(timeout=10.0) as client:
            response = client._request("HEAD", url, follow_redirects=True)
            content_disposition = response.headers.get("content-disposition", "")
            try:
                content_type = str(response.headers.get("content-type", "") or "").strip().lower()
            except Exception:
                content_type = ""
            if content_disposition:
                # Extract filename from the Content-Disposition header.
                # Format: attachment; filename="filename.pdf" or filename=filename.pdf
                match = re.search(r'filename\*?=(?:"([^"]*)"|([^;\s]*))', content_disposition)
                if match:
                    extracted_name = match.group(1) or match.group(2)
                    if extracted_name:
                        filename = unquote(extracted_name)
                        if not quiet:
                            debug(f"Filename from Content-Disposition: {filename}")
    except Exception as e:
        if not quiet:
            log(f"Could not get filename from headers: {e}", file=sys.stderr)

    # Guardrail: never treat HTML landing pages as downloadable files.
    # We explicitly probe with GET for page-like endpoints (e.g. *.php) since some
    # servers block/lie on HEAD, and a URL path like `edition.php` would otherwise
    # be saved as a bogus file.
    try:
        page_like_exts = {".php", ".asp", ".aspx", ".jsp", ".cgi"}
        ext = ""
        try:
            ext = Path(str(filename or "")).suffix.lower()
        except Exception:
            ext = ""

        ct0 = (content_type or "").split(";", 1)[0].strip().lower()
        must_probe = bool(ct0.startswith("text/html") or ext in page_like_exts)

        if must_probe:
            with HTTPClient(timeout=10.0) as client:
                with client._request_stream("GET", url, follow_redirects=True) as resp:
                    resp.raise_for_status()
                    ct = str(resp.headers.get("content-type", "") or "").split(";", 1)[0].strip().lower()
                    if ct.startswith("text/html"):
                        raise DownloadError("URL appears to be an HTML page, not a direct file")
    except DownloadError:
        raise
    except Exception:
        # If we can't probe, keep going; later logic may still infer a safe extension.
        pass

    # Apply the suggested filename (from provider title) if given.
    suggested = _sanitize_filename(suggested_filename) if suggested_filename else ""
    if suggested:
        # Preserve the extension from the suggested name if present;
        # otherwise borrow from the detected filename.
        suggested_path = Path(suggested)
        if suggested_path.suffix:
            filename = suggested
        else:
            detected_ext = ""
            try:
                detected_ext = Path(str(filename)).suffix
            except Exception:
                detected_ext = ""
            if detected_ext:
                filename = suggested + detected_ext
            else:
                filename = suggested

    # If we still don't have an extension, try to infer one from Content-Type.
    # Never fall back to a generic `.bin` extension.
    try:
        has_ext = bool(filename and Path(str(filename)).suffix)
    except Exception:
        has_ext = False

    if filename and not has_ext:
        ct = (content_type or "").split(";")[0].strip().lower()
        ext_by_ct = {
            "application/pdf": ".pdf",
            "application/epub+zip": ".epub",
            "application/x-mobipocket-ebook": ".mobi",
            "image/jpeg": ".jpg",
            "image/png": ".png",
            "image/webp": ".webp",
            "image/gif": ".gif",
            "text/plain": ".txt",
            "application/zip": ".zip",
        }

        if ct in ext_by_ct:
            filename = f"{filename}{ext_by_ct[ct]}"
        elif ct.startswith("text/html"):
            # Guardrail: HTML landing pages should not be downloaded as opaque files.
            raise DownloadError("URL appears to be an HTML page, not a direct file")

    # Final guardrail: if the filename is empty, refuse rather than inventing `download.bin`.
    if not filename or not str(filename).strip():
        raise DownloadError("Could not determine filename for URL (no Content-Disposition and no path filename)")

    file_path = _unique_path(output_dir / filename)
    progress_bar = ProgressBar()

    if not quiet:
        debug(f"Direct download: {filename}")

    try:
        start_time = time.time()
        downloaded_bytes = [0]
        total_bytes = [0]
        last_progress_time = [start_time]
        rendered_once = [False]

        def progress_callback(bytes_downloaded: int, content_length: int) -> None:
            downloaded_bytes[0] = bytes_downloaded
            total_bytes[0] = content_length

            now = time.time()
            # Always render the first and final updates; otherwise throttle
            # re-renders to at most one every 0.5s.
            is_final = bool(content_length > 0 and bytes_downloaded >= content_length)
            if (not rendered_once[0]) or is_final:
                pass
            elif now - last_progress_time[0] < 0.5:
                return

            progress_bar.update(
                downloaded=bytes_downloaded,
                total=content_length if content_length > 0 else None,
                label=str(filename or "download"),
                file=sys.stderr,
            )

            rendered_once[0] = True
            last_progress_time[0] = now

        with HTTPClient(timeout=30.0) as client:
            client.download(url, str(file_path), progress_callback=progress_callback)

        elapsed = time.time() - start_time
        progress_bar.finish()
        avg_speed_str = progress_bar.format_bytes(downloaded_bytes[0] / elapsed if elapsed > 0 else 0) + "/s"
        if not quiet:
            debug(f"✓ Downloaded in {elapsed:.1f}s at {avg_speed_str}")

        # For direct file downloads, create a minimal info dict without the filename
        # as title. This prevents creating duplicate title: tags when the filename
        # gets auto-generated. We'll add the title back later only if we couldn't
        # extract meaningful tags.
        ext = ""
        try:
            ext = Path(str(filename)).suffix.lstrip(".")
        except Exception:
            ext = ""

        info = {
            "id": str(filename).rsplit(".", 1)[0] if "." in str(filename) else str(filename),
            "ext": ext,
            "webpage_url": url,
        }

        hash_value = None
        try:
            hash_value = sha256_file(file_path)
        except Exception:
            pass

        tags = []
        if extract_ytdlp_tags:
            try:
                tags = extract_ytdlp_tags(info)
            except Exception as e:
                log(f"Error extracting tags: {e}", file=sys.stderr)

        # Only use the filename as a title tag if we couldn't extract any meaningful
        # tags. This prevents duplicate title: tags when the filename could be
        # mistaken for metadata.
        if not any(t.startswith('title:') for t in tags):
            # Re-extract tags with the filename as title only if needed
            info['title'] = filename
            tags = []
            if extract_ytdlp_tags:
                try:
                    tags = extract_ytdlp_tags(info)
                except Exception as e:
                    log(f"Error extracting tags with filename: {e}", file=sys.stderr)

        if debug_logger is not None:
            debug_logger.write_record(
                "direct-file-downloaded",
                {"url": url, "path": str(file_path), "hash": hash_value},
            )

        return DownloadMediaResult(
            path=file_path,
            info=info,
            tag=tags,
            source_url=url,
            hash_value=hash_value,
        )

    except (httpx.HTTPError, httpx.RequestError) as exc:
        try:
            progress_bar.finish()
        except Exception:
            pass
        log(f"Download error: {exc}", file=sys.stderr)
        if debug_logger is not None:
            debug_logger.write_record(
                "exception",
                {"phase": "direct-file", "url": url, "error": str(exc)},
            )
        raise DownloadError(f"Failed to download {url}: {exc}") from exc
    except Exception as exc:
        try:
            progress_bar.finish()
        except Exception:
            pass
        log(f"Error downloading file: {exc}", file=sys.stderr)
        if debug_logger is not None:
            debug_logger.write_record(
                "exception",
                {
                    "phase": "direct-file",
                    "url": url,
                    "error": str(exc),
                    "traceback": traceback.format_exc(),
                },
            )
        raise DownloadError(f"Error downloading file: {exc}") from exc


def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) -> Optional[Dict[str, Any]]:
    """Probe URL to extract metadata WITHOUT downloading.

    Args:
        url: URL to probe
        no_playlist: If True, ignore playlists and probe only the single video
        timeout_seconds: Max seconds to wait for the probe (default 15s)

    Returns:
        Dict with keys: extractor, title, entries (if playlist), duration, etc.
        Returns None if not supported by yt-dlp or on timeout.
    """
    if not is_url_supported_by_ytdlp(url):
        return None

    # Wrap the probe in a timeout to prevent hanging on large playlists
    import threading
    from typing import cast

    result_container: List[Optional[Any]] = [None, None]  # [result, error]

    def _do_probe() -> None:
        try:
            _ensure_yt_dlp_ready()

            assert yt_dlp is not None
            # Extract info without downloading.
            # Use extract_flat='in_playlist' to get full metadata for playlist items.
            ydl_opts = {
                "quiet": True,  # Suppress all output
                "no_warnings": True,
                "socket_timeout": 10,
                "retries": 2,  # Reduce retries for faster timeout
                "skip_download": True,  # Don't actually download
                "extract_flat": "in_playlist",  # Get playlist with metadata for each entry
                "noprogress": True,  # No progress bars
            }

            # Cookies are optional for probing; callers should pass cookiefile
            # via DownloadOptions when needed.

            # Add the no_playlist option if specified
            if no_playlist:
                ydl_opts["noplaylist"] = True

            with yt_dlp.YoutubeDL(ydl_opts) as ydl:  # type: ignore[arg-type]
                info = ydl.extract_info(url, download=False)

            if not isinstance(info, dict):
                result_container[0] = None
                return

            # Extract the relevant fields
            result_container[0] = {
                "extractor": info.get("extractor", ""),
                "title": info.get("title", ""),
                "entries": info.get("entries", []),  # Populated if this is a playlist
                "duration": info.get("duration"),
                "uploader": info.get("uploader"),
                "description": info.get("description"),
                "url": url,
            }
        except Exception as exc:
            log(f"Probe error for {url}: {exc}", file=sys.stderr)
            result_container[1] = exc

    # Daemon thread so an abandoned (timed-out) probe cannot block interpreter exit.
    thread = threading.Thread(target=_do_probe, daemon=True)
    thread.start()
    thread.join(timeout=timeout_seconds)

    if thread.is_alive():
        # Probe timed out - return None to fall back to direct download
        debug(f"Probe timeout for {url} (>={timeout_seconds}s), proceeding with download")
        return None

    if result_container[1] is not None:
        # Probe error - return None to proceed anyway
        return None

    return cast(Optional[Dict[str, Any]], result_container[0])


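# Usage sketch (placeholder URL): probe first to decide whether the target is
# a playlist before committing to a download.
#
#   meta = probe_url("https://example.com/playlist?list=abc", timeout_seconds=10)
#   if meta and meta["entries"]:
#       ...  # offer playlist-item selection before downloading

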
__all__ = [
    "is_url_supported_by_ytdlp",
    "list_formats",
    "probe_url",
    "DownloadError",
    "DownloadOptions",
    "DownloadMediaResult",
]