AST
This commit is contained in:
730
helper/download.py
Normal file
730
helper/download.py
Normal file
@@ -0,0 +1,730 @@
|
||||
"""Download media files using yt-dlp with support for direct file downloads.
|
||||
|
||||
Lean, focused downloader without event infrastructure overhead.
|
||||
- yt-dlp integration for streaming sites
|
||||
- Direct file download fallback for PDFs, images, documents
|
||||
- Tag extraction via metadata.extract_ytdlp_tags()
|
||||
- Logging via helper.logger.log()
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re # noqa: F401
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterator, List, Optional
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import httpx
|
||||
|
||||
from helper.logger import log, debug
|
||||
from .utils import ensure_directory, sha256_file
|
||||
from .http_client import HTTPClient
|
||||
from models import DownloadError, DownloadOptions, DownloadMediaResult, DebugLogger, ProgressBar
|
||||
|
||||
try:
|
||||
import yt_dlp # type: ignore
|
||||
from yt_dlp.extractor import gen_extractors # type: ignore
|
||||
except Exception as exc:
|
||||
yt_dlp = None # type: ignore
|
||||
YTDLP_IMPORT_ERROR = exc
|
||||
else:
|
||||
YTDLP_IMPORT_ERROR = None
|
||||
|
||||
try:
|
||||
from metadata import extract_ytdlp_tags
|
||||
except ImportError:
|
||||
extract_ytdlp_tags = None
|
||||
|
||||
_EXTRACTOR_CACHE: List[Any] | None = None
|
||||
|
||||
|
||||
def _ensure_yt_dlp_ready() -> None:
    """Raise ``DownloadError`` unless the optional yt-dlp import succeeded.

    The error message carries the text of the import failure recorded at
    module load time, or a generic "not installed" note when none was recorded.
    """
    if yt_dlp is None:
        reason = str(YTDLP_IMPORT_ERROR or "yt-dlp is not installed")
        raise DownloadError(f"yt-dlp module not available: {reason}")
|
||||
|
||||
|
||||
def _progress_callback(status: Dict[str, Any]) -> None:
    """Progress hook handed to yt-dlp; forwards a one-line summary to ``debug``.

    Args:
        status: Hook payload. Only the ``status`` key plus a few optional
            display keys are read; unrecognised status values are ignored.
    """
    phase = status.get("status")
    if phase == "downloading":
        message = (
            f"Downloading {status.get('_percent_str', '?')} "
            f"at {status.get('_speed_str', '?')}"
        )
    elif phase == "finished":
        message = f"✓ Download finished: {status.get('filename')}"
    elif phase in ("postprocessing", "processing"):
        message = f"Post-processing: {status.get('postprocessor')}"
    else:
        return
    debug(message)
|
||||
|
||||
|
||||
def is_url_supported_by_ytdlp(url: str) -> bool:
    """Report whether a dedicated (non-generic) yt-dlp extractor accepts ``url``.

    The extractor list is built once and kept in the module-level
    ``_EXTRACTOR_CACHE``; if building it fails the cache becomes an empty list,
    so every later call answers ``False``.

    Args:
        url: URL to test.

    Returns:
        True when some extractor other than the "generic" one reports the URL
        as suitable; False otherwise, including when yt-dlp is not importable.
    """
    global _EXTRACTOR_CACHE

    if yt_dlp is None:
        return False

    if _EXTRACTOR_CACHE is None:
        try:
            _EXTRACTOR_CACHE = list(gen_extractors())  # type: ignore[arg-type]
        except Exception:
            _EXTRACTOR_CACHE = []

    for candidate in _EXTRACTOR_CACHE:
        try:
            matches = bool(candidate.suitable(url))
        except Exception:
            # An extractor that blows up on this URL is treated as not matching.
            continue
        if matches and getattr(candidate, "IE_NAME", "").lower() != "generic":
            return True
    return False
|
||||
|
||||
|
||||
def list_formats(url: str, no_playlist: bool = False, playlist_items: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
    """Get list of available formats for a URL using yt-dlp.

    Args:
        url: URL to get formats for
        no_playlist: If True, ignore playlists and list formats for single video
        playlist_items: If specified, only list formats for these playlist items (e.g., "1,3,5-8")

    Returns:
        List of format dictionaries with keys: format_id, format, ext,
        resolution, width, height, fps, vcodec, acodec, filesize, tbr.
        Returns None if yt-dlp is not available, if no formats are reported,
        or if format listing fails for any other reason (the failure is
        logged to stderr rather than raised).
    """
    try:
        # Checked inside the try so a missing yt-dlp yields the documented
        # ``None`` result instead of an exception escaping to the caller.
        _ensure_yt_dlp_ready()

        ydl_opts: Dict[str, Any] = {
            "quiet": False,
            "no_warnings": False,
            "socket_timeout": 30,
        }

        # Add no_playlist option if specified
        if no_playlist:
            ydl_opts["noplaylist"] = True

        # Add playlist_items filter if specified
        if playlist_items:
            ydl_opts["playlist_items"] = playlist_items

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            debug(f"Fetching format list for: {url}")
            info = ydl.extract_info(url, download=False)

        # extract_info may hand back something other than a dict; treat that
        # the same as "nothing to list" instead of tripping over ``.get``.
        formats = info.get("formats") if isinstance(info, dict) else None
        if not formats:
            log("No formats available", file=sys.stderr)
            return None

        # Keep only the fields callers need from each format entry.
        result_formats = [
            {
                "format_id": fmt.get("format_id", ""),
                "format": fmt.get("format", ""),
                "ext": fmt.get("ext", ""),
                "resolution": fmt.get("resolution", ""),
                "width": fmt.get("width"),
                "height": fmt.get("height"),
                "fps": fmt.get("fps"),
                "vcodec": fmt.get("vcodec", "none"),
                "acodec": fmt.get("acodec", "none"),
                "filesize": fmt.get("filesize"),
                "tbr": fmt.get("tbr"),  # Total bitrate
            }
            for fmt in formats
        ]

        debug(f"Found {len(result_formats)} available formats")
        return result_formats

    except Exception as e:
        log(f"✗ Error fetching formats: {e}", file=sys.stderr)
        return None
|
||||
def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]:
    """Translate ``DownloadOptions`` into the option dict given to ``YoutubeDL``.

    Creates the output directory as a side effect (via ``ensure_directory``).

    Args:
        opts: Download settings; reads output_dir, cookies_path, no_playlist,
            mode, ytdl_format, clip_sections and playlist_items.

    Returns:
        A fresh option dictionary.
    """
    target_dir = opts.output_dir
    ensure_directory(target_dir)

    ytdl_params: Dict[str, Any] = dict(
        outtmpl=str((target_dir / "%(title)s.%(ext)s").resolve()),
        quiet=False,
        no_warnings=False,
        noprogress=False,
        socket_timeout=30,
        retries=10,
        fragment_retries=10,
        http_chunk_size=10_485_760,
        restrictfilenames=True,
        progress_hooks=[_progress_callback],
    )

    # Cookies are only passed along when the file is really there.
    cookie_file = opts.cookies_path
    if cookie_file and cookie_file.is_file():
        ytdl_params["cookiefile"] = str(cookie_file)

    # Single video even when the URL points into a playlist.
    if opts.no_playlist:
        ytdl_params["noplaylist"] = True

    # Anything that is not "audio" is handled as video.
    audio_only = opts.mode == "audio"
    fallback_format = "251/140/bestaudio" if audio_only else "bestvideo+bestaudio/best"
    ytdl_params["format"] = opts.ytdl_format or fallback_format
    if audio_only:
        ytdl_params["postprocessors"] = [{"key": "FFmpegExtractAudio"}]
    else:
        ytdl_params["format_sort"] = [
            "res:4320", "res:2880", "res:2160", "res:1440", "res:1080", "res:720", "res"
        ]

    # NOTE(review): yt-dlp's embedding API appears to expect clip ranges under
    # "download_ranges" (a callable); confirm that "download_sections" is
    # honoured before relying on clipping.
    if opts.clip_sections:
        ytdl_params["download_sections"] = opts.clip_sections

    if opts.playlist_items:
        ytdl_params["playlist_items"] = opts.playlist_items

    debug(f"yt-dlp: mode={opts.mode}, format={ytdl_params.get('format')}")
    return ytdl_params
|
||||
|
||||
|
||||
def _iter_download_entries(info: Dict[str, Any]) -> Iterator[Dict[str, Any]]:
|
||||
"""Iterate through download entries, handling playlists."""
|
||||
queue: List[Dict[str, Any]] = [info]
|
||||
seen: set[int] = set()
|
||||
while queue:
|
||||
current = queue.pop(0)
|
||||
obj_id = id(current)
|
||||
if obj_id in seen:
|
||||
continue
|
||||
seen.add(obj_id)
|
||||
entries = current.get("entries")
|
||||
if isinstance(entries, list):
|
||||
for entry in entries:
|
||||
if isinstance(entry, dict):
|
||||
queue.append(entry)
|
||||
if current.get("requested_downloads") or not entries:
|
||||
yield current
|
||||
|
||||
|
||||
def _candidate_paths(entry: Dict[str, Any], output_dir: Path) -> Iterator[Path]:
|
||||
"""Get candidate file paths for downloaded media."""
|
||||
requested = entry.get("requested_downloads")
|
||||
if isinstance(requested, list):
|
||||
for item in requested:
|
||||
if isinstance(item, dict):
|
||||
for key in ("filepath", "_filename", "filename"):
|
||||
value = item.get(key)
|
||||
if value:
|
||||
yield Path(value)
|
||||
for key in ("filepath", "_filename", "filename"):
|
||||
value = entry.get(key)
|
||||
if value:
|
||||
yield Path(value)
|
||||
if entry.get("filename"):
|
||||
yield output_dir / entry["filename"]
|
||||
|
||||
|
||||
def _resolve_entry_and_path(info: Dict[str, Any], output_dir: Path) -> tuple[Dict[str, Any], Path]:
    """Locate the first existing file referenced by yt-dlp's result.

    Each candidate path is tried as given; a relative candidate is then also
    tried underneath ``output_dir``.

    Args:
        info: Top-level yt-dlp result.
        output_dir: Directory that relative candidates are resolved against.

    Returns:
        ``(entry, path)`` for the first candidate that is an existing file.

    Raises:
        FileNotFoundError: No candidate points at an existing file.
    """
    for item in _iter_download_entries(info):
        for guess in _candidate_paths(item, output_dir):
            if guess.is_absolute():
                attempts = (guess,)
            else:
                attempts = (guess, output_dir / guess)
            for location in attempts:
                if location.is_file():
                    return item, location
    raise FileNotFoundError("yt-dlp did not report a downloaded media file")
|
||||
|
||||
|
||||
def _extract_sha256(info: Dict[str, Any]) -> Optional[str]:
|
||||
"""Extract SHA256 hash from yt-dlp metadata."""
|
||||
for payload in [info] + info.get("entries", []):
|
||||
if not isinstance(payload, dict):
|
||||
continue
|
||||
hashes = payload.get("hashes")
|
||||
if isinstance(hashes, dict):
|
||||
for key in ("sha256", "sha-256", "sha_256"):
|
||||
value = hashes.get(key)
|
||||
if isinstance(value, str) and value.strip():
|
||||
return value.strip().lower()
|
||||
for key in ("sha256", "sha-256", "sha_256"):
|
||||
value = payload.get(key)
|
||||
if isinstance(value, str) and value.strip():
|
||||
return value.strip().lower()
|
||||
return None
|
||||
|
||||
|
||||
def _get_libgen_download_url(libgen_url: str) -> Optional[str]:
|
||||
"""Extract the actual download link from LibGen redirect URL.
|
||||
|
||||
LibGen URLs like https://libgen.gl/file.php?id=123456 redirect to
|
||||
actual mirror URLs. This follows the redirect chain to get the real file.
|
||||
|
||||
Args:
|
||||
libgen_url: LibGen file.php URL
|
||||
|
||||
Returns:
|
||||
Actual download URL or None if extraction fails
|
||||
"""
|
||||
try:
|
||||
import requests
|
||||
from urllib.parse import urlparse
|
||||
|
||||
# Check if this is a LibGen URL
|
||||
parsed = urlparse(libgen_url)
|
||||
if 'libgen' not in parsed.netloc.lower():
|
||||
return None
|
||||
|
||||
if '/file.php' not in parsed.path.lower():
|
||||
return None
|
||||
|
||||
# LibGen redirects to actual mirrors, follow redirects to get final URL
|
||||
session = requests.Session()
|
||||
session.headers.update({
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
|
||||
})
|
||||
|
||||
debug(f"Following LibGen redirect chain for: {libgen_url}")
|
||||
|
||||
# First, get the page and look for direct download link
|
||||
try:
|
||||
response = session.get(libgen_url, timeout=10, allow_redirects=True)
|
||||
final_url = response.url
|
||||
|
||||
# Try to find actual download link in the page
|
||||
try:
|
||||
from bs4 import BeautifulSoup
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
|
||||
# Look for download links - LibGen typically has forms with download buttons
|
||||
# Look for all links and forms that might lead to download
|
||||
for link in soup.find_all('a'):
|
||||
href = link.get('href')
|
||||
if href and isinstance(href, str):
|
||||
# Look for direct file links or get.php redirects
|
||||
if 'get.php' in href.lower() or href.endswith(('.pdf', '.epub', '.djvu', '.mobi')):
|
||||
download_url = href if href.startswith('http') else urljoin(final_url, href)
|
||||
debug(f"Found download link: {download_url}")
|
||||
return download_url
|
||||
except ImportError:
|
||||
pass # BeautifulSoup not available
|
||||
|
||||
# If we followed redirects successfully, return the final URL
|
||||
# This handles cases where libgen redirects to a direct download mirror
|
||||
if final_url != libgen_url:
|
||||
debug(f"LibGen resolved to mirror: {final_url}")
|
||||
return final_url
|
||||
|
||||
except requests.RequestException as e:
|
||||
log(f"Error following LibGen redirects: {e}", file=sys.stderr)
|
||||
# Try head request as fallback
|
||||
try:
|
||||
response = session.head(libgen_url, allow_redirects=True, timeout=10)
|
||||
if response.url != libgen_url:
|
||||
debug(f"LibGen HEAD resolved to: {response.url}")
|
||||
return response.url
|
||||
except:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
log(f"Error resolving LibGen URL: {e}", file=sys.stderr)
|
||||
return None
|
||||
|
||||
|
||||
def _safe_download_filename(name: Optional[str]) -> str:
    """Reduce a remotely supplied name to a bare file name (no directories)."""
    if not name:
        return ""
    # Treat both separator styles as directories and keep only the last part.
    leaf = name.replace("\\", "/").rsplit("/", 1)[-1]
    leaf = leaf.replace("\x00", "").strip()
    return "" if leaf in (".", "..") else leaf


def _filename_from_content_disposition(header: str) -> Optional[str]:
    """Pull a file name out of a Content-Disposition header value, if any.

    The extended ``filename*=charset'lang'value`` form is preferred over the
    plain ``filename=`` form; percent-escapes are decoded in both.
    """
    import re
    from urllib.parse import unquote

    extended = re.search(r"filename\*\s*=\s*([^;\s]+)", header, flags=re.IGNORECASE)
    if extended:
        raw = extended.group(1).strip('"')
        charset, _, remainder = raw.partition("'")
        if "'" in remainder:
            encoded = remainder.partition("'")[2]
            try:
                decoded = unquote(encoded, encoding=charset or "utf-8", errors="replace")
            except LookupError:
                # Unknown charset label: fall back to the default decoding.
                decoded = unquote(encoded)
        else:
            # No charset'lang' prefix; treat the whole value as the name.
            decoded = unquote(raw)
        if decoded:
            return decoded

    plain = re.search(r'filename\s*=\s*(?:"([^"]*)"|([^;\s]*))', header, flags=re.IGNORECASE)
    if plain:
        value = plain.group(1) or plain.group(2)
        if value:
            return unquote(value)
    return None


def _download_direct_file(
    url: str,
    output_dir: Path,
    debug_logger: Optional[DebugLogger] = None,
) -> DownloadMediaResult:
    """Download a direct file (PDF, image, document, etc.) without yt-dlp.

    The target file name is chosen from, in increasing priority: the last URL
    path segment, a ``filename``/``download``/``file``/``name`` query
    parameter, and the Content-Disposition header of a HEAD request. The
    result is reduced to a bare file name so it cannot point outside
    ``output_dir``; names without a "." fall back to ``downloaded_file.bin``.

    Args:
        url: URL to download.
        output_dir: Directory to write into (created if needed).
        debug_logger: Optional sink for structured debug records.

    Returns:
        DownloadMediaResult with the written path, a minimal info dict, the
        extracted tags, the source URL and the file's SHA-256 (None if
        hashing failed).

    Raises:
        DownloadError: The transfer or any later step failed.
    """
    ensure_directory(output_dir)

    from urllib.parse import unquote, urlparse, parse_qs

    # Extract filename from URL
    parsed_url = urlparse(url)
    url_path = parsed_url.path

    # Try to get filename from query parameters first (for LibGen and similar services)
    # e.g., ?filename=Book+Title.pdf or &download=filename.pdf
    filename = None
    if parsed_url.query:
        query_params = parse_qs(parsed_url.query)
        for param_name in ('filename', 'download', 'file', 'name'):
            if param_name in query_params and query_params[param_name]:
                filename = unquote(query_params[param_name][0])
                break

    # If not found in query params, extract from URL path
    if not filename or not filename.strip():
        filename = url_path.split("/")[-1] if url_path else ""
        filename = unquote(filename)

    # Remove query strings from filename if any
    if "?" in filename:
        filename = filename.split("?")[0]

    # Try to get real filename from Content-Disposition header (HEAD request)
    try:
        with HTTPClient(timeout=10.0) as client:
            response = client._request("HEAD", url, follow_redirects=True)
            content_disposition = response.headers.get("content-disposition", "")
            if content_disposition:
                header_name = _filename_from_content_disposition(content_disposition)
                if header_name:
                    filename = header_name
                    debug(f"Filename from Content-Disposition: {filename}")
    except Exception as e:
        log(f"Could not get filename from headers: {e}", file=sys.stderr)

    # The name comes from the URL or the remote server, so strip any directory
    # components before it is joined onto output_dir.
    filename = _safe_download_filename(filename)

    # Fallback if we still don't have a good filename
    if not filename or "." not in filename:
        filename = "downloaded_file.bin"

    file_path = output_dir / filename
    if file_path.parent != output_dir:
        # Defence in depth: anything that still resolves outside output_dir
        # (e.g. a drive-qualified name on Windows) gets the generic name.
        filename = "downloaded_file.bin"
        file_path = output_dir / filename

    progress_bar = ProgressBar()

    debug(f"Direct download: {filename}")

    try:
        start_time = time.time()
        downloaded_bytes = 0
        last_progress_time = start_time

        def progress_callback(bytes_downloaded: int, content_length: int) -> None:
            nonlocal downloaded_bytes, last_progress_time
            downloaded_bytes = bytes_downloaded

            # Report at most every 0.5s, and only when the total size is known.
            now = time.time()
            if now - last_progress_time < 0.5 or not content_length or content_length < 0:
                return

            elapsed = now - start_time
            percent = (bytes_downloaded / content_length) * 100
            speed = bytes_downloaded / elapsed if elapsed > 0 else 0
            eta_seconds = (content_length - bytes_downloaded) / speed if speed > 0 else 0

            speed_str = progress_bar.format_bytes(speed) + "/s"
            minutes, seconds = divmod(int(eta_seconds), 60)
            hours, minutes = divmod(minutes, 60)
            eta_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"

            progress_line = progress_bar.format_progress(
                percent_str=f"{percent:.1f}%",
                downloaded=bytes_downloaded,
                total=content_length,
                speed_str=speed_str,
                eta_str=eta_str,
            )
            debug(progress_line)
            last_progress_time = now

        with HTTPClient(timeout=30.0) as client:
            client.download(url, str(file_path), progress_callback=progress_callback)

        elapsed = time.time() - start_time
        avg_speed_str = progress_bar.format_bytes(downloaded_bytes / elapsed if elapsed > 0 else 0) + "/s"
        debug(f"✓ Downloaded in {elapsed:.1f}s at {avg_speed_str}")

        # For direct file downloads, create minimal info dict without filename as title.
        # This prevents creating duplicate title: tags when filename gets auto-generated.
        # The title is added back below only if no title tag could be extracted.
        info = {
            "id": filename.rsplit(".", 1)[0],
            "ext": filename.rsplit(".", 1)[1] if "." in filename else "bin",
            "webpage_url": url,
        }

        hash_value = None
        try:
            hash_value = sha256_file(file_path)
        except Exception as hash_exc:
            # Hashing is best-effort; record why it was skipped and carry on.
            log(f"Could not hash {file_path}: {hash_exc}", file=sys.stderr)

        tags = []
        if extract_ytdlp_tags:
            try:
                tags = extract_ytdlp_tags(info)
            except Exception as e:
                log(f"Error extracting tags: {e}", file=sys.stderr)

        # Only use filename as a title tag if no title tag was extracted.
        if not any(t.startswith('title:') for t in tags):
            # Re-extract tags with filename as title only if needed
            info['title'] = filename
            tags = []
            if extract_ytdlp_tags:
                try:
                    tags = extract_ytdlp_tags(info)
                except Exception as e:
                    log(f"Error extracting tags with filename: {e}", file=sys.stderr)

        if debug_logger is not None:
            debug_logger.write_record(
                "direct-file-downloaded",
                {"url": url, "path": str(file_path), "hash": hash_value},
            )

        return DownloadMediaResult(
            path=file_path,
            info=info,
            tags=tags,
            source_url=url,
            hash_value=hash_value,
        )

    except (httpx.HTTPError, httpx.RequestError) as exc:
        log(f"Download error: {exc}", file=sys.stderr)
        if debug_logger is not None:
            debug_logger.write_record(
                "exception",
                {"phase": "direct-file", "url": url, "error": str(exc)},
            )
        raise DownloadError(f"Failed to download {url}: {exc}") from exc
    except Exception as exc:
        log(f"Error downloading file: {exc}", file=sys.stderr)
        if debug_logger is not None:
            debug_logger.write_record(
                "exception",
                {
                    "phase": "direct-file",
                    "url": url,
                    "error": str(exc),
                    "traceback": traceback.format_exc(),
                },
            )
        raise DownloadError(f"Error downloading file: {exc}") from exc
|
||||
|
||||
|
||||
def probe_url(url: str, no_playlist: bool = False) -> Optional[Dict[str, Any]]:
    """Probe URL to extract metadata WITHOUT downloading.

    Args:
        url: URL to probe
        no_playlist: If True, ignore playlists and probe only the single video

    Returns:
        Dict with keys: extractor, title, entries, duration, uploader,
        description, url. ``entries`` defaults to an empty list when yt-dlp
        reports none.
        Returns None if the URL is not supported by yt-dlp, if yt-dlp returns
        a non-dict result, or if probing fails (the failure is logged).
    """
    if not is_url_supported_by_ytdlp(url):
        return None

    _ensure_yt_dlp_ready()

    assert yt_dlp is not None
    try:
        # Extract info without downloading.
        ydl_opts: Dict[str, Any] = {
            "quiet": True,  # Suppress all output
            "no_warnings": True,
            "socket_timeout": 10,
            "retries": 3,
            "skip_download": True,  # Don't actually download
            # Presumably keeps playlist entries flat (not resolved one by one)
            # so probing stays cheap — confirm against the yt-dlp docs.
            "extract_flat": "in_playlist",
            "noprogress": True,  # No progress bars
        }

        # Add no_playlist option if specified
        if no_playlist:
            ydl_opts["noplaylist"] = True

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:  # type: ignore[arg-type]
            info = ydl.extract_info(url, download=False)

        if not isinstance(info, dict):
            return None

        # Extract relevant fields
        return {
            "extractor": info.get("extractor", ""),
            "title": info.get("title", ""),
            "entries": info.get("entries", []),  # Will be populated if playlist
            "duration": info.get("duration"),
            "uploader": info.get("uploader"),
            "description": info.get("description"),
            "url": url,
        }
    except Exception as exc:
        log(f"Probe failed for {url}: {exc}")
        return None
|
||||
|
||||
|
||||
def download_media(
    opts: DownloadOptions,
    *,
    debug_logger: Optional[DebugLogger] = None,
) -> DownloadMediaResult:
    """Download media from URL using yt-dlp or direct HTTP download.

    Routing:
      * LibGen ``get.php`` URLs are downloaded directly.
      * LibGen ``file.php`` URLs are first resolved to a mirror link; on
        success ``opts.url`` is overwritten with the resolved link. Either
        way the download then goes through the direct path.
      * URLs without a dedicated yt-dlp extractor are downloaded directly.
      * Everything else goes through yt-dlp.

    Args:
        opts: DownloadOptions with url, mode, output_dir, etc.
        debug_logger: Optional debug logger for troubleshooting

    Returns:
        DownloadMediaResult with path, info, tags, hash

    Raises:
        DownloadError: If download fails
    """
    # Handle LibGen URLs specially
    # file.php redirects to mirrors, get.php is direct from modern API
    lowered_url = opts.url.lower()
    if 'libgen' in lowered_url:
        if '/get.php' in lowered_url:
            # Modern API get.php links are direct downloads from mirrors (not file redirects)
            log("Detected LibGen get.php URL, downloading directly...")
            if debug_logger is not None:
                debug_logger.write_record("libgen-direct", {"url": opts.url})
            return _download_direct_file(opts.url, opts.output_dir, debug_logger)

        if '/file.php' in lowered_url:
            # Old-style file.php redirects to mirrors, we need to resolve
            log("Detected LibGen file.php URL, resolving to actual mirror...")
            original_url = opts.url
            actual_url = _get_libgen_download_url(original_url)
            if actual_url and actual_url != original_url:
                log(f"Resolved LibGen URL to mirror: {actual_url}")
                opts.url = actual_url
                # The resolved link is downloaded directly, without yt-dlp.
                if debug_logger is not None:
                    # original_url was captured before opts.url was replaced,
                    # so the record shows both ends of the resolution.
                    debug_logger.write_record(
                        "libgen-resolved",
                        {"original": original_url, "resolved": actual_url},
                    )
            else:
                log("Could not resolve LibGen URL, trying direct download anyway", file=sys.stderr)
                if debug_logger is not None:
                    debug_logger.write_record("libgen-resolve-failed", {"url": opts.url})
            return _download_direct_file(opts.url, opts.output_dir, debug_logger)

    # Try yt-dlp first if URL is supported
    if not is_url_supported_by_ytdlp(opts.url):
        log(f"URL not supported by yt-dlp, trying direct download: {opts.url}")
        if debug_logger is not None:
            debug_logger.write_record("direct-file-attempt", {"url": opts.url})
        return _download_direct_file(opts.url, opts.output_dir, debug_logger)

    _ensure_yt_dlp_ready()

    ytdl_options = _build_ytdlp_options(opts)
    log(f"Starting yt-dlp download: {opts.url}")
    if debug_logger is not None:
        debug_logger.write_record("ytdlp-start", {"url": opts.url})

    assert yt_dlp is not None
    try:
        with yt_dlp.YoutubeDL(ytdl_options) as ydl:  # type: ignore[arg-type]
            info = ydl.extract_info(opts.url, download=True)
    except Exception as exc:
        log(f"yt-dlp failed: {exc}", file=sys.stderr)
        if debug_logger is not None:
            debug_logger.write_record(
                "exception",
                {
                    "phase": "yt-dlp",
                    "error": str(exc),
                    "traceback": traceback.format_exc(),
                },
            )
        raise DownloadError("yt-dlp download failed") from exc

    if not isinstance(info, dict):
        log(f"Unexpected yt-dlp response: {type(info)}", file=sys.stderr)
        raise DownloadError("Unexpected yt-dlp response type")

    info_dict: Dict[str, Any] = info
    if debug_logger is not None:
        debug_logger.write_record(
            "ytdlp-info",
            {
                "keys": sorted(info_dict.keys()),
                "is_playlist": bool(info_dict.get("entries")),
            },
        )

    try:
        entry, media_path = _resolve_entry_and_path(info_dict, opts.output_dir)
    except FileNotFoundError as exc:
        log(f"Error: {exc}", file=sys.stderr)
        if debug_logger is not None:
            debug_logger.write_record(
                "exception",
                {"phase": "resolve-path", "error": str(exc)},
            )
        raise DownloadError(str(exc)) from exc

    if debug_logger is not None:
        debug_logger.write_record(
            "resolved-media",
            {"path": str(media_path), "entry_keys": sorted(entry.keys())},
        )

    # Prefer a digest supplied in the metadata; otherwise hash the file.
    # A failed hash leaves hash_value as None rather than failing the download.
    hash_value = _extract_sha256(entry) or _extract_sha256(info_dict)
    if not hash_value:
        try:
            hash_value = sha256_file(media_path)
        except OSError as exc:
            if debug_logger is not None:
                debug_logger.write_record(
                    "hash-error",
                    {"path": str(media_path), "error": str(exc)},
                )

    # Extract tags using metadata.py (optional dependency; may be None)
    tags = []
    if extract_ytdlp_tags:
        try:
            tags = extract_ytdlp_tags(entry)
        except Exception as e:
            log(f"Error extracting tags: {e}", file=sys.stderr)

    source_url = (
        entry.get("webpage_url")
        or entry.get("original_url")
        or entry.get("url")
    )

    log(f"✓ Downloaded: {media_path.name} ({len(tags)} tags)")
    if debug_logger is not None:
        debug_logger.write_record(
            "downloaded",
            {
                "path": str(media_path),
                "tag_count": len(tags),
                "source_url": source_url,
                "sha256": hash_value,
            },
        )

    return DownloadMediaResult(
        path=media_path,
        info=entry,
        tags=tags,
        source_url=source_url,
        hash_value=hash_value,
    )
|
||||
|
||||
|
||||
# Names exported by ``from <this module> import *``: the public download
# helpers defined in this file plus the model types re-exported for callers.
__all__ = [
    "download_media",
    "is_url_supported_by_ytdlp",
    "list_formats",
    "probe_url",
    "DownloadError",
    "DownloadOptions",
    "DownloadMediaResult",
]
|
||||
Reference in New Issue
Block a user