"""Download data from URLs using yt-dlp with playlist, clipping, and format selection.
This is a merged implementation combining:
- cmdlets/download_data.py (pipeline wrapper)
- funact/download_data.py (feature-rich implementation)
- helper/download.py (low-level machinery)
Features:
- Direct file downloads and yt-dlp streaming sites
- Playlist detection with interactive track selection
- Clip extraction (time ranges like 34:03-35:08)
- Format selection and audio/video toggles
- Cookies file support
- Tag extraction and metadata integration
- Progress tracking and debug logging
- Pipeline integration with result emission
- Background torrent/magnet downloads via AllDebrid
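Example invocations (illustrative sketches; flag spellings follow the argument parser in
_run, and the @N pipe selection mirrors the hints logged elsewhere in this module):
download-data https://example.com/video -clip 34:03-35:08 -storage <location>
search-file -provider libgen 'isbn:"<ISBN>"' | @1 | download-data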
"""
from __future__ import annotations
import hashlib
import re
import sys
import threading
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple
import uuid
from helper.logger import log, debug
from helper.download import download_media, probe_url
from helper.utils import sha256_file
from models import DownloadOptions
from . import register
from ._shared import Cmdlet, CmdletArg, SharedArgs, create_pipe_object_result, normalize_result_input, parse_cmdlet_args
import models
import pipeline as pipeline_context
from config import resolve_output_dir
from metadata import (
fetch_openlibrary_metadata_tags,
format_playlist_entry,
extract_ytdlp_tags
)
# ============================================================================
# Try to import optional dependencies
# ============================================================================
try:
from yt_dlp.utils import sanitize_filename as ytdlp_sanitize_filename # type: ignore
except Exception: # pragma: no cover - optional dependency
ytdlp_sanitize_filename = None
# ============================================================================
# Background Worker for AllDebrid Downloads
# ============================================================================
def _download_torrent_worker(
worker_id: str,
magnet_url: str,
output_dir: Path,
config: Dict[str, Any],
api_key: str,
playlist_items: Optional[str] = None,
audio_mode: bool = False,
wait_timeout: int = 600,
worker_manager: Optional[Any] = None,
) -> None:
"""Background worker to download torrent/magnet via AllDebrid.
Runs in a separate thread and updates worker_manager with progress.
Args:
worker_id: Unique ID for this worker task
magnet_url: Magnet link or .torrent URL to download
output_dir: Directory to save downloaded files
config: Configuration dict
api_key: AllDebrid API key
playlist_items: Optional file selection (e.g., "1,3,5-8")
audio_mode: Whether to tag as audio or video
wait_timeout: Timeout in seconds for magnet processing
worker_manager: WorkerManager instance for progress updates
"""
worker = None
downloaded_files = []
try:
from helper.alldebrid import AllDebridClient
# Get worker reference if manager provided
if worker_manager:
try:
workers = worker_manager.get_active_workers()
worker = next((w for w in workers if w.get('id') == worker_id), None)
except:
worker = None
def log_progress(message: str) -> None:
"""Log progress to both console and worker manager."""
debug(message)
if worker_manager and worker_id:
try:
worker_manager.log_step(worker_id, message)
except:
pass
log_progress(f"[Worker {worker_id}] Submitting magnet to AllDebrid...")
client = AllDebridClient(api_key)
# Add magnet
magnet_info = client.magnet_add(magnet_url)
magnet_id = int(magnet_info.get('id', 0))
if magnet_id <= 0:
log_progress(f"[Worker {worker_id}] ✗ Failed to add magnet to AllDebrid")
if worker_manager:
try:
worker_manager.finish_worker(worker_id, "failed", f"Failed to add magnet")
except:
pass
return
log_progress(f"[Worker {worker_id}] ✓ Magnet added (ID: {magnet_id})")
# Poll for ready status
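# AllDebrid statusCode semantics as handled below: 4 means the files are ready,
# >= 5 maps to the error table further down, and anything lower is still processing.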
elapsed = 0
last_status_reported = 0
while elapsed < wait_timeout:
try:
status_info = client.magnet_status(magnet_id)
except Exception as e:
log_progress(f"[Worker {worker_id}] ⚠ Failed to get status: {e}")
time.sleep(2)
elapsed += 2
continue
status_code = status_info.get('statusCode', -1)
status_text = status_info.get('status', 'Unknown')
# Report progress every 5 seconds (avoid log spam)
if elapsed - last_status_reported >= 5 or elapsed < 2:
downloaded = status_info.get('downloaded', 0)
total_size = status_info.get('size', 0)
seeders = status_info.get('seeders', 0)
speed = status_info.get('downloadSpeed', 0)
if total_size > 0:
percent = (downloaded / total_size) * 100
speed_str = f" @ {speed / (1024**2):.1f} MB/s" if speed > 0 else ""
seeders_str = f" ({seeders} seeders)" if seeders > 0 else ""
progress_msg = f"[Worker {worker_id}] ⧗ {status_text}: {percent:.1f}% ({downloaded / (1024**3):.2f} / {total_size / (1024**3):.2f} GB){speed_str}{seeders_str}"
log_progress(progress_msg)
# Update worker with progress
if worker_manager:
try:
worker_manager.update_worker(
worker_id,
status="running",
progress=f"{percent:.1f}%",
details=progress_msg
)
except:
pass
else:
log_progress(f"[Worker {worker_id}] ⧗ {status_text}...")
last_status_reported = elapsed
if status_code == 4: # Ready
log_progress(f"[Worker {worker_id}] ✓ Files ready")
break
elif status_code >= 5: # Error
error_status = {
5: "Upload failed",
6: "Internal error during unpacking",
7: "Not downloaded in 20 minutes",
8: "File too big (>1TB)",
9: "Internal error",
10: "Download took >72 hours",
11: "Deleted on hoster website",
12: "Processing failed",
13: "Processing failed",
14: "Tracker error",
15: "No peers available"
}
error_msg = error_status.get(status_code, f"Unknown error {status_code}")
log_progress(f"[Worker {worker_id}] ✗ Magnet failed: {error_msg}")
if worker_manager:
try:
worker_manager.finish_worker(worker_id, "failed", error_msg)
except:
pass
return
time.sleep(2)
elapsed += 2
if elapsed >= wait_timeout:
log_progress(f"[Worker {worker_id}] ✗ Timeout waiting for magnet (>{wait_timeout}s)")
if worker_manager:
try:
worker_manager.finish_worker(worker_id, "failed", f"Timeout after {wait_timeout}s")
except:
pass
return
# Get files
files_result = client.magnet_links([magnet_id])
magnet_files = files_result.get(str(magnet_id), {})
if not magnet_files:
# Some responses key the mapping by the integer id rather than its string form.
magnet_files = files_result.get(magnet_id, {})
files_array = magnet_files.get('files', [])
if not files_array:
log_progress(f"[Worker {worker_id}] ✗ No files found in magnet")
if worker_manager:
try:
worker_manager.finish_worker(worker_id, "failed", "No files found in magnet")
except:
pass
return
log_progress(f"[Worker {worker_id}] ✓ Found {len(files_array)} file(s)")
# Extract download links
download_links = []
def extract_links(items, prefix=""):
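# AllDebrid file-tree nodes use short keys: 'n' = name, 'l' = download link,
# 's' = size, 'e' = nested child entries (sub-folders).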
if not isinstance(items, list):
return
for item in items:
if isinstance(item, dict):
name = item.get('n', '')
link = item.get('l', '')
size = item.get('s', 0)
entries = item.get('e', [])
if link:
download_links.append({
'link': link,
'name': name,
'size': size,
'path': f"{prefix}/{name}" if prefix else name
})
if entries:
extract_links(entries, f"{prefix}/{name}" if prefix else name)
extract_links(files_array)
if not download_links:
log_progress(f"[Worker {worker_id}] ✗ No downloadable files found")
if worker_manager:
try:
worker_manager.finish_worker(worker_id, "failed", "No downloadable files")
except:
pass
return
# Filter by playlist_items if specified
if playlist_items and playlist_items != '*':
# Parse selection like "1,3,5-8"
selected_indices = []
for part in playlist_items.split(','):
part = part.strip()
if '-' in part:
start, end = part.split('-')
selected_indices.extend(range(int(start)-1, int(end)))
else:
selected_indices.append(int(part)-1)
download_links = [download_links[i] for i in selected_indices if i < len(download_links)]
log_progress(f"[Worker {worker_id}] Downloading {len(download_links)} selected file(s)")
# Download each file
for idx, file_info in enumerate(download_links, 1):
link = file_info['link']
name = file_info['name']
log_progress(f"[Worker {worker_id}] ({idx}/{len(download_links)}) Downloading: {name}")
try:
# Unlock the link
try:
actual_link = client.unlock_link(link)
if actual_link and actual_link != link:
link = actual_link
except:
pass
# Download via HTTP
from helper.http_client import HTTPClient
output_dir.mkdir(parents=True, exist_ok=True)
file_path = output_dir / name
file_path.parent.mkdir(parents=True, exist_ok=True)
with HTTPClient() as http_client:
http_client.download(link, str(file_path))
log_progress(f"[Worker {worker_id}] ✓ Downloaded: {name}")
# Compute hash and emit result
file_hash = _compute_file_hash(file_path)
result_obj = {
'file_path': str(file_path),
'source_url': magnet_url,
'file_hash': file_hash,
'media_kind': 'audio' if audio_mode else 'video',
}
pipeline_context.emit(result_obj)
downloaded_files.append(file_path)
except Exception as e:
log_progress(f"[Worker {worker_id}] ⚠ Failed to download {name}: {e}")
if downloaded_files:
msg = f"✓ Torrent download complete ({len(downloaded_files)} file(s))"
log_progress(f"[Worker {worker_id}] {msg}")
if worker_manager:
try:
worker_manager.finish_worker(worker_id, "success", msg)
except:
pass
else:
if worker_manager:
try:
worker_manager.finish_worker(worker_id, "failed", "No files downloaded")
except:
pass
except ImportError:
log_progress(f"[Worker {worker_id}] ✗ AllDebrid client not available")
if worker_manager:
try:
worker_manager.finish_worker(worker_id, "failed", "AllDebrid client not available")
except:
pass
except Exception as e:
import traceback
log_progress(f"[Worker {worker_id}] ✗ Torrent download failed: {e}")
if worker_manager:
try:
worker_manager.finish_worker(worker_id, "failed", str(e))
except:
pass
traceback.print_exc(file=sys.stderr)
# ============================================================================
# Torrent File Parsing
# ============================================================================
def _parse_torrent_file(file_path: str) -> Optional[str]:
"""Parse a .torrent file and extract magnet link.
Args:
file_path: Path to .torrent file
Returns:
Magnet link string or None if parsing fails
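The link is assembled below in the standard form
magnet:?xt=urn:btih:<hex SHA1 of the bencoded info dict>&dn=<name>[&tr=<tracker>...]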
"""
try:
import bencode3
except ImportError:
log("⚠ bencode3 module not found. Install: pip install bencode3", file=sys.stderr)
return None
try:
with open(file_path, 'rb') as f:
torrent_data = bencode3.bdecode(f.read())
except Exception as e:
log(f"✗ Failed to parse torrent file: {e}", file=sys.stderr)
return None
try:
# Get info dict - bencode3 returns string keys, not bytes
info = torrent_data.get('info')
if not info:
log("✗ No info dict in torrent file", file=sys.stderr)
return None
# Calculate info hash (SHA1 of bencoded info dict)
import hashlib
info_hash = hashlib.sha1(bencode3.bencode(info)).hexdigest()
# Get name
name = info.get('name', 'Unknown')
if isinstance(name, bytes):
name = name.decode('utf-8', errors='ignore')
# Create magnet link
magnet = f"magnet:?xt=urn:btih:{info_hash}&dn={name}"
# Add trackers if available
announce = torrent_data.get('announce')
if announce:
try:
tracker = announce if isinstance(announce, str) else announce.decode('utf-8', errors='ignore')
magnet += f"&tr={tracker}"
except:
pass
announce_list = torrent_data.get('announce-list', [])
for tier in announce_list:
if isinstance(tier, list):
for tracker_item in tier:
try:
tracker = tracker_item if isinstance(tracker_item, str) else tracker_item.decode('utf-8', errors='ignore')
if tracker:
magnet += f"&tr={tracker}"
except:
pass
debug(f"✓ Parsed torrent: {name} (hash: {info_hash})")
return magnet
except Exception as e:
log(f"✗ Error parsing torrent metadata: {e}", file=sys.stderr)
return None
def _download_torrent_file(url: str, temp_dir: Optional[Path] = None) -> Optional[str]:
"""Download a .torrent file from URL and parse it.
Args:
url: URL to .torrent file
temp_dir: Optional temp directory for storing downloaded file
Returns:
Magnet link string or None if download/parsing fails
"""
try:
from helper.http_client import HTTPClient
except ImportError:
log("⚠ HTTPClient not available", file=sys.stderr)
return None
try:
# Download torrent file
debug(f"⇓ Downloading torrent file: {url}")
with HTTPClient(timeout=30.0) as client:
response = client.get(url)
response.raise_for_status()
torrent_data = response.content
# Create temp file
if temp_dir is None:
temp_dir = Path.home() / ".cache" / "downlow"
temp_dir.mkdir(parents=True, exist_ok=True)
# Save to temp file
import hashlib
url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
temp_file = temp_dir / f"torrent_{url_hash}.torrent"
temp_file.write_bytes(torrent_data)
debug(f"✓ Downloaded torrent file: {temp_file}")
# Parse it
magnet = _parse_torrent_file(str(temp_file))
# Clean up
try:
temp_file.unlink()
except:
pass
return magnet
except Exception as e:
log(f"✗ Failed to download/parse torrent: {e}", file=sys.stderr)
return None
def _is_torrent_file_or_url(arg: str) -> bool:
"""Check if argument is a .torrent file path or URL.
Args:
arg: Argument to check
Returns:
True if it's a .torrent file or URL
"""
arg_lower = arg.lower()
# Check if it's a .torrent file path
if arg_lower.endswith('.torrent'):
return Path(arg).exists() or arg_lower.startswith('http')
# Check if it's a URL to .torrent file
if arg_lower.startswith('http://') or arg_lower.startswith('https://'):
return '.torrent' in arg_lower
return False
def _process_torrent_input(arg: str) -> Optional[str]:
"""Process torrent file or URL and convert to magnet link.
Args:
arg: .torrent file path or URL
Returns:
Magnet link or original argument if not processable
"""
try:
if arg.lower().startswith('http://') or arg.lower().startswith('https://'):
# It's a URL
return _download_torrent_file(arg) or arg
else:
# It's a file path
if Path(arg).exists():
return _parse_torrent_file(arg) or arg
else:
return arg
except Exception as e:
log(f"⚠ Error processing torrent: {e}", file=sys.stderr)
return arg
# ============================================================================
# Helper Functions
# ============================================================================
def _show_playlist_table(url: str, probe_info: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Show playlist result table and get user selection.
Args:
url: Original URL
probe_info: Info dict from probe_url()
Returns:
Modified probe_info with selected_entries, or None if user cancelled
"""
entries = probe_info.get("entries", [])
if not entries:
return probe_info
extractor = probe_info.get("extractor", "")
title = probe_info.get("title", "Playlist")
debug(f"📋 Detected playlist: {title} ({len(entries)} items) - {extractor}")
# Skip full metadata enrichment for speed - extract_flat usually provides enough info
# debug("📋 Fetching metadata for each item...")
# entries = enrich_playlist_entries(entries, extractor)
# Emit each playlist item as a separate result row
for i, entry in enumerate(entries, 1):
formatted = format_playlist_entry(entry, i, extractor)
# Build tags from available metadata
tags = []
artist = formatted.get("artist") or formatted.get("uploader", "")
if artist:
tags.append(artist)
album = formatted.get("album", "")
if album and album != title: # Don't repeat playlist title
tags.append(album)
# Extract individual fields for separate columns
duration = formatted.get("duration", 0)
duration_str = ""
if duration:
minutes = int(duration // 60)
seconds = int(duration % 60)
duration_str = f"{minutes}m{seconds}s"
tags.append(duration_str)
# Normalize extractor for comparison (remove special chars and case)
ext_lower = extractor.lower().replace(":", "").replace(" ", "")
track_number = None
# Add site-specific tags and fields
if "youtube" in ext_lower and formatted.get("channel"):
tags.append(f"channel:{formatted.get('channel')}")
elif "bandcamp" in ext_lower:
track_number = formatted.get("track_number", i)
tags.append(f"track:{track_number}")
# Create result row with separate columns for important metadata
# Build columns dynamically based on available data
columns = [
("#", i),
("Title", formatted["title"]),
]
# Add Artist column if available
if artist:
columns.append(("Artist", artist))
# Add Duration column if available
if duration_str:
columns.append(("Duration", duration_str))
# Add Track number column for music platforms
if track_number is not None:
columns.append(("Track", str(track_number)))
# Add Tags column for remaining tags (if any)
remaining_tags = [t for t in tags if t not in [artist, duration_str]]
if remaining_tags:
columns.append(("Tags", ", ".join(remaining_tags)))
# Create result row with compact columns display
# Using "columns" field tells ResultTable which columns to show
result_row = {
"title": formatted["title"],
"tags": tags,
"index": i,
# Store all metadata but don't display in table (use columns field)
"__source": "playlist-probe",
"__id": f"{i}",
"__file_path": url,
"__action": f"playlist-item:{i}",
"__artist": formatted.get("artist", ""),
"__duration": formatted.get("duration", 0),
"__extractor": extractor,
# Define which columns should be shown in the result table
"columns": columns
}
# Add site-specific metadata for pipeline use
if "youtube" in ext_lower:
result_row["__video_id"] = formatted.get("video_id", "")
result_row["__channel"] = formatted.get("channel", "")
elif "bandcamp" in ext_lower:
result_row["__track_number"] = formatted.get("track_number", i)
result_row["__album"] = formatted.get("album") or title
elif "spotify" in ext_lower:
result_row["__artists"] = formatted.get("artists", "")
result_row["__album"] = formatted.get("album", "")
pipeline_context.emit(result_row)
debug(f" Playlist items displayed. Use result table references (@1, @2, etc.) to select tracks.")
# Return modified probe info
return probe_info
def _parse_time_range(clip_spec: str) -> Optional[Tuple[int, int]]:
"""Parse time range from MM:SS-MM:SS or seconds format.
Args:
clip_spec: Time range string like "34:03-35:08" or "2043-2108"
Returns:
Tuple of (start_seconds, end_seconds) or None if invalid
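Example:
"34:03-35:08" -> (2043, 2108); "2043-2108" -> (2043, 2108)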
"""
try:
if '-' not in clip_spec:
return None
parts = clip_spec.split('-')
if len(parts) != 2:
return None
start_str, end_str = parts
# Try MM:SS format first
if ':' in start_str:
start_parts = start_str.split(':')
if len(start_parts) == 2:
start_sec = int(start_parts[0]) * 60 + int(start_parts[1])
else:
return None
else:
start_sec = int(start_str)
if ':' in end_str:
end_parts = end_str.split(':')
if len(end_parts) == 2:
end_sec = int(end_parts[0]) * 60 + int(end_parts[1])
else:
return None
else:
end_sec = int(end_str)
if start_sec >= end_sec:
return None
return (start_sec, end_sec)
except (ValueError, AttributeError):
return None
MEDIA_EXTENSIONS = {'.mp3', '.m4a', '.mp4', '.mkv', '.webm', '.flac', '.wav', '.aac'}
def _parse_playlist_selection_indices(selection: Optional[str], total_items: int) -> list[int]:
"""Convert playlist selection string to 0-based indices."""
if total_items <= 0:
return []
if not selection or selection.strip() in {"*", ""}:
return list(range(total_items))
indices: list[int] = []
for part in selection.split(','):
part = part.strip()
if not part:
continue
if '-' in part:
bounds = part.split('-', 1)
try:
start = int(bounds[0])
end = int(bounds[1])
except ValueError:
continue
if start <= 0 or end <= 0:
continue
if start > end:
start, end = end, start
for idx in range(start - 1, end):
if 0 <= idx < total_items:
indices.append(idx)
else:
try:
idx = int(part) - 1
except ValueError:
continue
if 0 <= idx < total_items:
indices.append(idx)
seen: set[int] = set()
ordered: list[int] = []
for idx in indices:
if idx not in seen:
ordered.append(idx)
seen.add(idx)
return ordered
def _select_playlist_entries(entries: Any, selection: Optional[str]) -> list[Dict[str, Any]]:
"""Pick playlist entries according to a selection string."""
if not isinstance(entries, list):
return []
indices = _parse_playlist_selection_indices(selection, len(entries))
if not indices:
return []
selected: list[Dict[str, Any]] = []
for idx in indices:
entry = entries[idx]
if isinstance(entry, dict):
selected.append(entry)
return selected
def _sanitize_title_for_filename(title: Optional[str]) -> str:
"""Match yt-dlp's restricted filename sanitization for comparisons."""
if not title:
return ""
if ytdlp_sanitize_filename:
try:
return ytdlp_sanitize_filename(title, restricted=True)
except Exception:
pass
sanitized = re.sub(r"[^0-9A-Za-z._-]+", "_", title)
return sanitized.strip() or ""
def _find_playlist_files_from_entries(
entries: Sequence[Dict[str, Any]],
output_dir: Path,
) -> list[Path]:
"""Resolve expected playlist files based on entry titles/exts."""
matched: list[Path] = []
seen: set[str] = set()
for entry in entries:
title = entry.get('title') if isinstance(entry, dict) else None
sanitized = _sanitize_title_for_filename(title)
if not sanitized:
continue
preferred_exts: list[str] = []
for key in ('ext', 'audio_ext', 'video_ext'):
value = entry.get(key) if isinstance(entry, dict) else None
if isinstance(value, str) and value:
preferred_exts.append(value.lower())
if not preferred_exts:
preferred_exts = [ext.strip('.') for ext in MEDIA_EXTENSIONS]
candidate: Optional[Path] = None
for ext in preferred_exts:
ext = ext.lstrip('.').lower()
path = output_dir / f"{sanitized}.{ext}"
if path.exists():
candidate = path
break
if candidate is None:
try:
# Bandcamp/yt-dlp often prefixes uploader info, so fall back to a substring match.
for f in output_dir.glob(f"*{sanitized}*"):
if f.suffix.lower() in MEDIA_EXTENSIONS and f.is_file():
candidate = f
break
except OSError:
candidate = None
if candidate and str(candidate) not in seen:
matched.append(candidate)
seen.add(str(candidate))
return matched
def _snapshot_playlist_paths(
entries: Sequence[Dict[str, Any]],
output_dir: Path,
) -> tuple[list[Path], set[str]]:
"""Capture current playlist file paths for a given selection."""
matches = _find_playlist_files_from_entries(entries, output_dir)
resolved: set[str] = set()
for path in matches:
try:
resolved.add(str(path.resolve()))
except OSError:
resolved.add(str(path))
return matches, resolved
def _expand_playlist_selection(selection: str, num_items: int) -> str:
"""Expand playlist selection string, handling wildcards.
Args:
selection: Selection string like '1,3,5-8' or '*'
num_items: Total number of items in playlist
Returns:
Expanded selection string like '1,3,5,6,7,8' or '1-18' for '*'
"""
if selection.strip() == "*":
# Wildcard: select all items
return f"1-{num_items}"
# Return as-is if not wildcard (yt-dlp will handle ranges and lists)
return selection
def _parse_selection_string(selection: str) -> List[int]:
"""Parse selection string into list of integers.
Handles formats like:
- "2" -> [2]
- "1,3,5" -> [1, 3, 5]
- "1-3" -> [1, 2, 3]
- "1,3-5,7" -> [1, 3, 4, 5, 7]
Args:
selection: Selection string
Returns:
List of integer indices
"""
result = []
for part in selection.split(','):
part = part.strip()
if '-' in part:
# Range like "3-5"
try:
start, end = part.split('-')
start_num = int(start.strip())
end_num = int(end.strip())
result.extend(range(start_num, end_num + 1))
except (ValueError, AttributeError):
continue
else:
# Single number
try:
result.append(int(part))
except ValueError:
continue
return result
def _filter_and_sort_formats(formats: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Filter and sort formats for user selection.
Filters out:
- Storyboards (webp, svg formats)
- Low quality audio (total bitrate below ~64 kbps)
- Video-only streams below 360p
Sorts to prioritize:
- @1: Best combined audio+video (highest resolution, highest bitrate)
- @2: Best audio-only (highest bitrate audio)
- Then rest by quality
Args:
formats: List of format dicts from yt-dlp
Returns:
Filtered and sorted format list
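For example, storyboard webp/svg entries, video-only streams under 360p, and audio-only
streams under ~64 kbps are dropped, so a typical listing offers the best muxed
audio+video format at @1 and the best audio-only format at @2.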
"""
filtered = []
for fmt in formats:
format_id = fmt.get("format_id", "")
ext = fmt.get("ext", "")
vcodec = fmt.get("vcodec", "")
acodec = fmt.get("acodec", "")
height = fmt.get("height")
tbr = fmt.get("tbr") # Total bitrate
# Skip storyboards (webp images, svg, etc.)
if ext in {"webp", "svg", "mhtml"}:
continue
# Skip video-only formats below 360p
if vcodec != "none" and acodec == "none":
if height and height < 360:
continue
# Skip low-bitrate audio (typically 48kHz, very low quality)
# Keep audio with tbr >= 64 kbps (reasonable quality threshold)
if acodec != "none" and vcodec == "none":
if tbr and tbr < 64:
continue
filtered.append(fmt)
# Sort formats: best combined first, then best audio-only, then video-only
def format_sort_key(fmt: Dict[str, Any]) -> tuple:
vcodec = fmt.get("vcodec", "")
acodec = fmt.get("acodec", "")
height = fmt.get("height", 0) or 0
tbr = fmt.get("tbr", 0) or 0
# Category 0: has both audio and video (sort first)
# Category 1: audio only (sort second)
# Category 2: video only (sort last, by height desc)
if vcodec != "none" and acodec != "none":
category = 0
return (category, -height, -tbr)
elif acodec != "none" and vcodec == "none":
category = 1
return (category, -tbr) # Sort by bitrate descending
else: # Video only
category = 2
return (category, -height, -tbr) # Sort by height descending, then bitrate
return sorted(filtered, key=format_sort_key)
def _compute_file_hash(file_path: Path) -> Optional[str]:
"""Compute SHA256 hash of file."""
try:
return sha256_file(file_path)
except Exception:
return None
# ============================================================================
# Main Cmdlet Function
# ============================================================================
def _run(result: Any, args: Sequence[str], config: Dict[str, Any], emit_results: bool = True) -> int:
"""Download data from URLs with advanced options.
Accepts:
- Single URL as string
- Result object with 'url' or 'file_path' field
- List of results
- File containing URLs (one per line)
Returns:
Exit code (0 for success, 1 for failure)
"""
debug("Starting download-data")
collected_results: List[Dict[str, Any]] = []
def _emit(obj: Any) -> None:
"""Internal helper to collect and optionally emit results."""
collected_results.append(obj)
if emit_results:
pipeline_context.emit(obj)
# Track pipeline mode once so playlist handling can respect current run scope
stage_ctx = pipeline_context.get_stage_context()
in_pipeline = stage_ctx is not None and getattr(stage_ctx, 'total_stages', 1) > 1
# ========================================================================
# ARGUMENT PARSING
# ========================================================================
# Parse arguments using shared parser
parsed = parse_cmdlet_args(args, CMDLET)
audio_mode = parsed.get("audio", False)
format_selector = parsed.get("format")
list_formats_mode = parsed.get("list-formats", False)
clip_spec = parsed.get("clip")
clip_range = None
if clip_spec:
clip_range = _parse_time_range(clip_spec)
if clip_range:
debug(f"Clip range: {clip_spec} ({clip_range[0]}-{clip_range[1]} seconds)")
else:
log(f"Invalid clip format: {clip_spec}", file=sys.stderr)
return 1
cookies_path = parsed.get("cookies")
storage_location = parsed.get("storage")
torrent_mode = parsed.get("torrent", False)
wait_timeout = float(parsed.get("wait", 1800))
# Collect URLs from positional args and -url flag
# Both map to "url" in parsed result
urls_to_download = []
raw_urls = parsed.get("url", [])
if isinstance(raw_urls, str):
raw_urls = [raw_urls]
for arg in raw_urls:
if arg.lower().startswith(('http://', 'https://')):
# Check if it's a .torrent URL or file first
if '.torrent' in arg.lower():
debug(f"Processing torrent URL: {arg}")
magnet = _process_torrent_input(arg)
if magnet and magnet.lower().startswith('magnet:'):
urls_to_download.append(magnet)
debug(f"✓ Converted to magnet: {magnet[:70]}...")
elif magnet:
urls_to_download.append(magnet)
else:
log(f"✗ Failed to process torrent: {arg}", file=sys.stderr)
else:
urls_to_download.append(arg)
elif torrent_mode and (arg.lower().startswith('magnet:') or len(arg) == 40 or len(arg) == 64):
# In torrent mode, accept magnet links or torrent hashes (40-char SHA1 or 64-char SHA256)
urls_to_download.append(arg)
debug(f"Torrent/magnet added: {arg[:50]}...")
elif _is_torrent_file_or_url(arg):
# Handle .torrent files and URLs
log(f"Processing torrent file/URL: {arg}", flush=True)
magnet = _process_torrent_input(arg)
if magnet and magnet.lower().startswith('magnet:'):
urls_to_download.append(magnet)
log(f"✓ Converted to magnet: {magnet[:70]}...", flush=True)
elif magnet:
urls_to_download.append(magnet)
else:
log(f"✗ Failed to process torrent: {arg}", file=sys.stderr)
else:
# Treat as URL if it looks like one
if arg.lower().startswith(('magnet:', 'ftp://')):
urls_to_download.append(arg)
else:
# Check if it's a file containing URLs
path = Path(arg)
if path.exists() and path.is_file():
try:
with open(arg, 'r') as f:
for line in f:
line = line.strip()
if line and line.lower().startswith(('http://', 'https://')):
urls_to_download.append(line)
log(f"Loaded URLs from file: {arg}", flush=True)
except Exception as e:
log(f"Error reading file {arg}: {e}", file=sys.stderr)
else:
log(f"Ignored argument: {arg}", file=sys.stderr)
# Item selection (for playlists/formats)
# Note: -item flag is deprecated in favor of @N pipeline selection, but kept for compatibility
playlist_items = parsed.get("item")
if playlist_items:
log(f"Item selection: {playlist_items}", flush=True)
def _is_openlibrary_downloadable(ebook_access_val: Any, status_val: Any) -> bool:
"""Return True when OpenLibrary metadata marks the book as borrowable/downloadable."""
access = str(ebook_access_val or "").strip().lower()
status = str(status_val or "").strip().lower()
if status == "download":
return True
if access in {"borrowable", "public", "full", "open"} or access.startswith("full "):
return True
return False
# ========================================================================
# INPUT PROCESSING - Extract URLs from pipeline or arguments
# ========================================================================
# Initialize worker tracking for downloads
import uuid
from helper.local_library import LocalLibraryDB
from config import get_local_storage_path
worker_id = str(uuid.uuid4())
library_root = get_local_storage_path(config or {})
db = None
if library_root:
try:
db = LocalLibraryDB(library_root)
db.insert_worker(
worker_id,
"download",
title="Download Data",
description="Downloading files from search results",
pipe=pipeline_context.get_current_command_text()
)
except Exception as e:
log(f"⚠ Worker tracking unavailable: {e}", file=sys.stderr)
piped_results = normalize_result_input(result)
# Track files downloaded directly (e.g. Soulseek) to avoid "No URLs" error
files_downloaded_directly = 0
# Only process piped results if no URLs were provided in arguments
# This prevents picking up residue from previous commands when running standalone
if piped_results and not urls_to_download:
for item in piped_results:
url = None
origin = None
# ====== CHECK FOR PLAYLIST ITEM MARKER FROM add-file ======
# When add-file detects a playlist item and wants to download it
if isinstance(item, dict) and item.get('__playlist_url'):
playlist_url = item.get('__playlist_url')
item_num = item.get('__playlist_item', 1)
log(f"📍 Playlist item from add-file: #{item_num}", flush=True)
# Add to download list with marker
urls_to_download.append({
'__playlist_url': playlist_url,
'__playlist_item': int(item_num)
})
continue
# ====== CHECK FOR PLAYLIST ITEM SELECTION FIRST ======
# When user selects @12 from a playlist, item is emitted dict with __action: "playlist-item:12"
if isinstance(item, dict) and '__action' in item and item['__action'].startswith('playlist-item:'):
playlist_url = item.get('__file_path')
playlist_action = item['__action'] # e.g., "playlist-item:12"
item_num = playlist_action.split(':')[1] # Extract item number (1-based)
if playlist_url:
# Playlist item selected - need to download this specific track
log(f"📍 Playlist item selected: #{item_num} - {item.get('title', 'Unknown')}", flush=True)
# Add to download list - the playlist will be probed and item extracted
# Store with special marker so we know which item to select
urls_to_download.append({
'__playlist_url': playlist_url,
'__playlist_item': int(item_num)
})
continue
# ====== CHECK FOR FORMAT SELECTION RESULT ======
if isinstance(item, dict) and item.get('format_id') is not None and item.get('source_url'):
log(f"🎬 Format selected from pipe: {item.get('format_id')}", flush=True)
log(f" Source URL: {item.get('source_url')}", flush=True)
# Store as dict so we can extract format_id + source_url during download
urls_to_download.append(item)
continue
elif hasattr(item, 'format_id') and hasattr(item, 'source_url') and item.format_id is not None:
log(f"🎬 Format selected from pipe: {item.format_id}", flush=True)
log(f" Source URL: {item.source_url}", flush=True)
urls_to_download.append({
'format_id': item.format_id,
'source_url': item.source_url,
})
continue
if isinstance(item, dict):
# Check for search provider results first
origin = item.get('origin')
if origin in {'openlibrary', 'libgen', 'soulseek', 'debrid'}:
# Handle search provider results
title = item.get('title', 'Item')
if origin == 'openlibrary':
# OpenLibrary: First check if lendable/downloadable via Archive.org
# Only route to LibGen if NOT available on Archive.org
metadata = item.get('full_metadata', {}) if isinstance(item.get('full_metadata'), dict) else {}
isbn = metadata.get('isbn') or item.get('isbn')
olid = metadata.get('olid') or item.get('olid')
log(f"[search-result] OpenLibrary: '{title}'", flush=True)
if isbn:
log(f" ISBN: {isbn}", flush=True)
# Check if book is borrowable from ebook_access field or status
ebook_access = metadata.get('ebook_access') or item.get('ebook_access', '')
status_text = metadata.get('status') or item.get('status', '')
archive_id = metadata.get('archive_id') or item.get('archive_id')
# Determine if borrowable based on new status vocabulary
is_borrowable = _is_openlibrary_downloadable(ebook_access, status_text)
if is_borrowable:
log(f" ✓ Available for borrowing on Archive.org", flush=True)
log(f" → Queued for auto-borrowing...", flush=True)
# Queue borrow request as special dict object
# We need OCAID (Archive.org ID), not just numeric OLID
ocaid = archive_id
if not ocaid and isbn:
# If no OCAID in metadata, fetch it from OpenLibrary ISBN lookup
try:
import requests
ol_url = f'https://openlibrary.org/isbn/{isbn}.json'
r = requests.get(ol_url, timeout=5)
if r.status_code == 200:
ol_data = r.json()
ocaid = ol_data.get('ocaid')
except Exception as e:
log(f" ⚠ Could not fetch OCAID from OpenLibrary: {e}", file=sys.stderr)
if ocaid:
urls_to_download.append({
'__borrow_request__': True,
'book_id': ocaid,
'isbn': isbn,
'title': title,
'olid': olid
})
else:
# OCAID not found - book claims borrowable but not on Archive.org
# Fall back to LibGen search instead
log(f" ⚠ Book marked borrowable but not found on Archive.org", file=sys.stderr)
if isbn:
try:
from helper.search_provider import get_provider
libgen_provider = get_provider("libgen", config)
if libgen_provider:
libgen_results = libgen_provider.search(f"isbn:{isbn}", limit=1)
if libgen_results:
libgen_result = libgen_results[0]
url = libgen_result.get('target') if isinstance(libgen_result, dict) else getattr(libgen_result, 'target', None)
if url:
urls_to_download.append(url)
log(f" ✓ Found on LibGen instead", flush=True)
else:
log(f" ⚠ Not found on LibGen", file=sys.stderr)
else:
log(f" ⚠ Not found on LibGen", file=sys.stderr)
else:
log(f" ⚠ LibGen provider not available", file=sys.stderr)
except Exception as e:
log(f" ✗ Error searching LibGen: {e}", file=sys.stderr)
else:
# Book is NOT borrowable - route to LibGen
if isbn:
log(f" ⚠ Not available on Archive.org - attempting LibGen...", flush=True)
try:
from helper.search_provider import get_provider
libgen_provider = get_provider("libgen", config)
if libgen_provider:
libgen_results = libgen_provider.search(f"isbn:{isbn}", limit=1)
if libgen_results:
libgen_result = libgen_results[0]
url = libgen_result.get('target') if isinstance(libgen_result, dict) else getattr(libgen_result, 'target', None)
if url:
urls_to_download.append(url)
log(f" ✓ Found on LibGen", flush=True)
else:
log(f" ⚠ Not found on LibGen", file=sys.stderr)
else:
log(f" ⚠ Not found on LibGen", flush=True)
log(f" ▶ To search LibGen: search-file -provider libgen 'isbn:{isbn}' | @1 | download-data", flush=True)
else:
log(f" ▶ To search LibGen: search-file -provider libgen 'isbn:{isbn}' | @1 | download-data", flush=True)
except Exception as e:
log(f" ⚠ Could not search LibGen: {e}", file=sys.stderr)
log(f" ▶ To search LibGen: search-file -provider libgen 'isbn:{isbn}' | @1 | download-data", flush=True)
else:
log(f" ⚠ ISBN not available", flush=True)
log(f" ▶ Visit: {item.get('target', 'https://openlibrary.org')}", flush=True)
log(f" ▶ Or find ISBN and use: search-file -provider libgen 'isbn:\"<ISBN>\"'", flush=True)
elif origin == 'soulseek':
# Handle Soulseek downloads using the provider
metadata = item.get('full_metadata', {}) if isinstance(item.get('full_metadata'), dict) else {}
username = metadata.get('username')
filename = metadata.get('filename')
size = item.get('size_bytes') or 0
if username and filename:
try:
import asyncio
from helper.search_provider import SoulSeekProvider
provider = SoulSeekProvider(config)
log(f"[search-result] Soulseek: '{title}'", flush=True)
log(f" ▶ Downloading from {username}...", flush=True)
if db:
db.append_worker_stdout(worker_id, f"Downloading from Soulseek: {title} (from {username})")
# Get temp directory from config
temp_dir = config.get('temp')
if temp_dir:
temp_dir = str(Path(temp_dir).expanduser())
# Call async download_file with asyncio.run()
success = asyncio.run(provider.download_file(
username=username,
filename=filename,
file_size=size,
target_dir=temp_dir
))
if success:
downloaded_file = Path(provider.DOWNLOAD_DIR) / Path(filename).name
if downloaded_file.exists():
log(f" ✓ Downloaded: {downloaded_file.name}", flush=True)
files_downloaded_directly += 1
if db:
db.append_worker_stdout(worker_id, f"✓ Downloaded: {downloaded_file.name}")
if pipeline_context._PIPE_ACTIVE:
# Create proper PipeObject result
result_dict = create_pipe_object_result(
source='soulseek',
identifier=filename,
file_path=str(downloaded_file),
cmdlet_name='download-data',
title=title,
target=str(downloaded_file), # Explicit target for add-file
extra={
"metadata": metadata,
"origin": "soulseek"
}
)
pipeline_context.emit(result_dict)
else:
log(f" ✗ Download failed (peer may be offline)", file=sys.stderr)
if db:
db.append_worker_stdout(worker_id, f"✗ Download failed for {title}")
log(f" ▶ Try another result: search-file -provider soulseek \"...\" | @2 | download-data", flush=True)
except Exception as e:
log(f" ✗ Download error: {e}", file=sys.stderr)
if db:
db.append_worker_stdout(worker_id, f"✗ Error: {e}")
log(f" ▶ Alternative: search-soulseek -download \"{title}\" -storage <location>", flush=True)
else:
log(f"[search-result] Soulseek: '{title}'", flush=True)
log(f" ⚠ Missing download info (username/filename)", flush=True)
if db:
db.append_worker_stdout(worker_id, f"⚠ Missing download info for {title}")
elif origin == 'libgen':
# LibGen results can use the direct URL
# Also extract mirrors dict for fallback if primary fails
url = item.get('target')
# Extract mirrors and book_id from full_metadata
metadata = item.get('full_metadata', {}) if isinstance(item.get('full_metadata'), dict) else {}
mirrors = metadata.get('mirrors', {})
book_id = metadata.get('book_id', '')
if url:
url_entry = {
'url': str(url),
'mirrors': mirrors, # Alternative mirrors for fallback
'book_id': book_id,
'title': title, # Carried along so the download loop can name the file
}
urls_to_download.append(url_entry)
log(f"[search-result] LibGen: '{title}'", flush=True)
log(f" ✓ Queued for download", flush=True)
if mirrors:
log(f" Mirrors available: {len(mirrors)}", flush=True)
elif origin == 'debrid':
# Debrid results can use download-data
url = item.get('target')
if url:
urls_to_download.append(str(url))
log(f"[search-result] Debrid: '{title}'", flush=True)
log(f" ✓ Queued for download", flush=True)
else:
# Regular fields for non-search results
url = item.get('url') or item.get('link') or item.get('href') or item.get('target')
else:
# Object attributes
origin = getattr(item, 'origin', None)
title = getattr(item, 'title', 'Item')
if origin in {'openlibrary', 'libgen', 'soulseek', 'debrid'}:
# Handle search provider results
if origin == 'openlibrary':
# OpenLibrary: First check if lendable/downloadable via Archive.org
# Only route to LibGen if NOT available on Archive.org
metadata = getattr(item, 'full_metadata', {}) if isinstance(getattr(item, 'full_metadata', None), dict) else {}
isbn = metadata.get('isbn') or getattr(item, 'isbn', None)
olid = metadata.get('olid') or getattr(item, 'olid', None)
log(f"[search-result] OpenLibrary: '{title}'", flush=True)
if isbn:
log(f" ISBN: {isbn}", flush=True)
# Check if book is borrowable from ebook_access field or status
ebook_access = metadata.get('ebook_access') or getattr(item, 'ebook_access', '')
status_text = metadata.get('status') or getattr(item, 'status', '')
archive_id = metadata.get('archive_id') or getattr(item, 'archive_id', '')
# Determine if borrowable using unified helper
is_borrowable = _is_openlibrary_downloadable(ebook_access, status_text)
if is_borrowable:
# Book IS borrowable on Archive.org
log(f" ✓ Available for borrowing on Archive.org", flush=True)
log(f" → Queued for auto-borrowing...", flush=True)
# Queue borrow request as special dict object
ocaid = archive_id
if not ocaid and isbn:
try:
import requests
ol_url = f'https://openlibrary.org/isbn/{isbn}.json'
r = requests.get(ol_url, timeout=5)
if r.status_code == 200:
ol_data = r.json()
ocaid = ol_data.get('ocaid')
except Exception as e:
log(f" ⚠ Could not fetch OCAID from OpenLibrary: {e}", file=sys.stderr)
if ocaid:
urls_to_download.append({
'__borrow_request__': True,
'book_id': ocaid,
'isbn': isbn,
'title': title,
'olid': olid or getattr(item, 'openlibrary_id', '')
})
else:
# OCAID not found - book claims borrowable but not on Archive.org
# Fall back to LibGen search instead
log(f" ⚠ No Archive.org ID found - attempting LibGen instead...", file=sys.stderr)
if isbn:
try:
from helper.search_provider import get_provider
libgen_provider = get_provider("libgen", config)
if libgen_provider:
libgen_results = libgen_provider.search(f"isbn:{isbn}", limit=1)
if libgen_results:
libgen_result = libgen_results[0]
url = libgen_result.get('target') if isinstance(libgen_result, dict) else getattr(libgen_result, 'target', None)
if url:
urls_to_download.append(url)
log(f" ✓ Found on LibGen instead", flush=True)
else:
log(f" ⚠ Not found on LibGen", file=sys.stderr)
else:
log(f" ⚠ Not found on LibGen", file=sys.stderr)
else:
log(f" ⚠ LibGen provider not available", file=sys.stderr)
except Exception as e:
log(f" ✗ Error searching LibGen: {e}", file=sys.stderr)
else:
log(f" ⚠ ISBN not available for LibGen fallback", file=sys.stderr)
else:
# Book is NOT borrowable - route to LibGen
if isbn:
log(f" ⚠ Not available on Archive.org - attempting LibGen...", flush=True)
try:
from helper.search_provider import get_provider
libgen_provider = get_provider("libgen", config)
if libgen_provider:
libgen_results = libgen_provider.search(f"isbn:{isbn}", limit=1)
if libgen_results:
libgen_result = libgen_results[0]
url = libgen_result.get('target') if isinstance(libgen_result, dict) else getattr(libgen_result, 'target', None)
if url:
urls_to_download.append(url)
log(f" ✓ Found on LibGen", flush=True)
else:
log(f" ⚠ Not found on LibGen", file=sys.stderr)
else:
log(f" ⚠ Not found on LibGen", flush=True)
log(f" ▶ To search LibGen: search-file -provider libgen 'isbn:{isbn}' | @1 | download-data", flush=True)
else:
log(f" ▶ To search LibGen: search-file -provider libgen 'isbn:{isbn}' | @1 | download-data", flush=True)
except Exception as e:
log(f" ⚠ Could not search LibGen: {e}", file=sys.stderr)
log(f" ▶ To search LibGen: search-file -provider libgen 'isbn:{isbn}' | @1 | download-data", flush=True)
else:
log(f" ⚠ ISBN not available", flush=True)
log(f" ▶ Visit: {getattr(item, 'target', 'https://openlibrary.org')}", flush=True)
log(f" ▶ Or find ISBN and use: search-file -provider libgen 'isbn:\"<ISBN>\"'", flush=True)
elif origin == 'soulseek':
# Handle Soulseek downloads using the provider
metadata = getattr(item, 'full_metadata', {}) if isinstance(getattr(item, 'full_metadata', None), dict) else {}
username = metadata.get('username')
filename = metadata.get('filename')
size = getattr(item, 'size_bytes', 0) or 0
if username and filename:
try:
import asyncio
from helper.search_provider import SoulSeekProvider
provider = SoulSeekProvider(config)
log(f"[search-result] Soulseek: '{title}'", flush=True)
log(f" ▶ Downloading from {username}...", flush=True)
if db:
db.append_worker_stdout(worker_id, f"Downloading from Soulseek: {title} (from {username})")
# Get temp directory from config
temp_dir = config.get('temp')
if temp_dir:
temp_dir = str(Path(temp_dir).expanduser())
# Call async download_file with asyncio.run()
success = asyncio.run(provider.download_file(
username=username,
filename=filename,
file_size=size,
target_dir=temp_dir
))
if success:
downloaded_file = Path(provider.DOWNLOAD_DIR) / Path(filename).name
if downloaded_file.exists():
log(f" ✓ Downloaded: {downloaded_file.name}", flush=True)
files_downloaded_directly += 1
if db:
db.append_worker_stdout(worker_id, f"✓ Downloaded: {downloaded_file.name}")
if pipeline_context._PIPE_ACTIVE:
# Create proper PipeObject result
result_dict = create_pipe_object_result(
source='soulseek',
identifier=filename,
file_path=str(downloaded_file),
cmdlet_name='download-data',
title=title,
target=str(downloaded_file), # Explicit target for add-file
extra={
"metadata": metadata,
"origin": "soulseek"
}
)
pipeline_context.emit(result_dict)
else:
log(f" ✗ Download failed (peer may be offline)", file=sys.stderr)
if db:
db.append_worker_stdout(worker_id, f"✗ Download failed for {title}")
log(f" ▶ Try another result: search-file -provider soulseek \"...\" | @2 | download-data", flush=True)
except Exception as e:
log(f" ✗ Download error: {e}", file=sys.stderr)
if db:
db.append_worker_stdout(worker_id, f"✗ Error: {e}")
log(f" ▶ Alternative: search-soulseek -download \"{title}\" -storage <location>", flush=True)
else:
log(f"[search-result] Soulseek: '{title}'", flush=True)
log(f" ⚠ Missing download info (username/filename)", flush=True)
if db:
db.append_worker_stdout(worker_id, f"⚠ Missing download info for {title}")
elif origin == 'libgen':
# LibGen results with mirrors dict for fallback
url = getattr(item, 'target', None)
# Extract mirrors and book_id from full_metadata
metadata = getattr(item, 'full_metadata', {}) if isinstance(getattr(item, 'full_metadata', None), dict) else {}
mirrors = metadata.get('mirrors', {})
book_id = metadata.get('book_id', '')
if url:
url_entry = {
'url': str(url),
'mirrors': mirrors, # Alternative mirrors for fallback
'book_id': book_id,
'title': title, # Carried along so the download loop can name the file
}
urls_to_download.append(url_entry)
else:
log(" ⚠ LibGen result missing download URL", file=sys.stderr)
elif origin == 'debrid':
url = getattr(item, 'target', None)
else:
url = getattr(item, 'url', None) or getattr(item, 'link', None) or getattr(item, 'href', None) or getattr(item, 'target', None)
if url:
urls_to_download.append(str(url))
if not urls_to_download and files_downloaded_directly == 0:
log(f"No downloadable URLs found", file=sys.stderr)
return 1
log(f"Processing {len(urls_to_download)} URL(s)", flush=True)
for i, u in enumerate(urls_to_download, 1):
if isinstance(u, dict) and u.get('format_id'):
log(f" [{i}] Format: {u.get('format_id')} from {str(u.get('source_url', '?'))[:60]}...", flush=True)
elif isinstance(u, dict):
label = u.get('url') or u.get('__playlist_url') or u.get('title') or 'queued request'
log(f" [{i}] {str(label)[:60]}...", flush=True)
else:
log(f" [{i}] URL: {str(u)[:60]}...", flush=True)
# ========================================================================
# RESOLVE OUTPUT DIRECTORY
# ========================================================================
final_output_dir = None
# Priority 1: --storage flag
if storage_location:
try:
final_output_dir = SharedArgs.resolve_storage(storage_location)
log(f"Using storage location: {storage_location}{final_output_dir}", flush=True)
except ValueError as e:
log(str(e), file=sys.stderr)
return 1
# Priority 2: Config resolver
if final_output_dir is None and resolve_output_dir is not None:
try:
final_output_dir = resolve_output_dir(config)
log(f"Using config resolver: {final_output_dir}", flush=True)
except Exception:
pass
# Priority 3: Config outfile
if final_output_dir is None and config and config.get("outfile"):
try:
final_output_dir = Path(config["outfile"]).expanduser()
log(f"Using config outfile: {final_output_dir}", flush=True)
except Exception:
pass
# Priority 4: Default (home/Videos)
if final_output_dir is None:
final_output_dir = Path.home() / "Videos"
log(f"Using default directory: {final_output_dir}", flush=True)
# Ensure directory exists
try:
final_output_dir.mkdir(parents=True, exist_ok=True)
except Exception as e:
log(f"Cannot create output directory {final_output_dir}: {e}", file=sys.stderr)
return 1
# ========================================================================
# DOWNLOAD EACH URL
# ========================================================================
downloaded_files = []
playlists_displayed = 0
formats_displayed = False # Track whether a format listing was shown
exit_code = 0
for url in urls_to_download:
try:
selected_playlist_entries: list[Dict[str, Any]] = []
playlist_existing_paths: set[str] = set()
# ====== HANDLE FORMAT SELECTION FROM PIPED RESULT ======
# If url is a dict with format_id and source_url, extract them and override format_selector
current_format_selector = format_selector
actual_url = url
if isinstance(url, dict) and url.get('format_id') and url.get('source_url'):
log(f"🎬 Format selected: {url.get('format_id')}", flush=True)
format_id = url.get('format_id')
current_format_selector = format_id
# If it's a video-only format (has vcodec but no acodec), add bestaudio
vcodec = url.get('vcodec', '')
acodec = url.get('acodec', '')
if vcodec and vcodec != "none" and (not acodec or acodec == "none"):
# Video-only format, add bestaudio automatically
current_format_selector = f"{format_id}+bestaudio"
log(f" Video-only format detected, automatically adding bestaudio", flush=True)
actual_url = url.get('source_url')
url = actual_url # Use the actual URL for further processing
# ====== AUTO-BORROW MODE - INTERCEPT SPECIAL BORROW REQUEST DICTS ======
if isinstance(url, dict) and url.get('__borrow_request__'):
try:
from helper.archive_client import credential_openlibrary, loan, get_book_infos, download
import tempfile
import shutil
book_id = url.get('book_id')
if not book_id:
log(f" ✗ Missing book ID for borrowing", file=sys.stderr)
exit_code = 1
continue
title_val = url.get('title', 'Unknown Book')
book_id_str = str(book_id)
log(f"[auto-borrow] Starting borrow for: {title_val}", flush=True)
log(f" Book ID: {book_id_str}", flush=True)
# Get Archive.org credentials
email, password = credential_openlibrary(config)
if not email or not password:
log(f" ✗ Archive.org credentials not configured", file=sys.stderr)
log(f" ▶ Set ARCHIVE_EMAIL and ARCHIVE_PASSWORD environment variables", file=sys.stderr)
exit_code = 1
continue
# Attempt to borrow and download
try:
log(f" → Logging into Archive.org...", flush=True)
from helper.archive_client import login
import requests
try:
session = login(email, password)
except requests.exceptions.Timeout:
log(f" ✗ Timeout logging into Archive.org (server not responding)", file=sys.stderr)
exit_code = 1
continue
except requests.exceptions.RequestException as e:
log(f" ✗ Error connecting to Archive.org: {e}", file=sys.stderr)
exit_code = 1
continue
log(f" → Borrowing book...", flush=True)
try:
session = loan(session, book_id_str, verbose=True)
except requests.exceptions.Timeout:
log(f" ✗ Timeout while borrowing (server not responding)", file=sys.stderr)
exit_code = 1
continue
except requests.exceptions.RequestException as e:
log(f" ✗ Error while borrowing: {e}", file=sys.stderr)
exit_code = 1
continue
log(f" → Extracting page information...", flush=True)
# Try both URL formats
book_urls = [
f"https://archive.org/borrow/{book_id_str}",
f"https://archive.org/details/{book_id_str}"
]
title = None
links = None
metadata = None
last_error = None
for book_url in book_urls:
try:
title, links, metadata = get_book_infos(session, book_url)
if title and links:
log(f" → Found {len(links)} pages", flush=True)
break
except requests.exceptions.Timeout:
last_error = "Timeout while extracting pages"
log(f" ⚠ Timeout while extracting from {book_url}", flush=True)
continue
except Exception as e:
last_error = str(e)
log(f" ⚠ Failed to extract from {book_url}: {e}", flush=True)
continue
if not links:
log(f" ✗ Could not extract book pages (Last error: {last_error})", file=sys.stderr)
exit_code = 1
continue
# Download pages
log(f" → Downloading {len(links)} pages...", flush=True)
with tempfile.TemporaryDirectory() as temp_dir:
# download(session, n_threads, directory, links, scale, book_id)
images = download(
session,
n_threads=4,
directory=temp_dir,
links=links,
scale=2,
book_id=str(book_id)
)
if not images:
log(f" ✗ No pages downloaded", file=sys.stderr)
exit_code = 1
continue
log(f" ✓ Downloaded {len(images)} pages", flush=True)
# Try to merge into PDF
try:
import img2pdf
log(f" → Merging pages into PDF...", flush=True)
filename = title if title else f"book_{book_id_str}"
filename = "".join(c for c in filename if c.isalnum() or c in (' ', '.', '-'))[:100]
output_path = Path(final_output_dir) / f"{filename}.pdf"
# Make unique filename if needed
i = 1
while output_path.exists():
output_path = Path(final_output_dir) / f"{filename}({i}).pdf"
i += 1
pdf_content = img2pdf.convert(images)
if pdf_content:
with open(output_path, 'wb') as f:
f.write(pdf_content)
log(f" ✓ Successfully borrowed and saved to: {output_path}", flush=True)
downloaded_files.append(str(output_path))
# Emit result for downstream cmdlets
file_hash = _compute_file_hash(output_path)
# Build tags including ISBN if available
emit_tags = ['book', 'borrowed', 'pdf']
isbn_tag = url.get('isbn')
if isbn_tag:
emit_tags.append(f'isbn:{isbn_tag}')
olid_tag = url.get('olid')
if olid_tag:
emit_tags.append(f'olid:{olid_tag}')
# Fetch OpenLibrary metadata tags
ol_tags = fetch_openlibrary_metadata_tags(isbn=isbn_tag, olid=olid_tag)
emit_tags.extend(ol_tags)
pipe_obj = create_pipe_object_result(
source='archive.org',
identifier=book_id_str,
file_path=str(output_path),
cmdlet_name='download-data',
title=title_val,
file_hash=file_hash,
tags=emit_tags,
source_url=url.get('source_url', f'archive.org/borrow/{book_id_str}')
)
pipeline_context.emit(pipe_obj)
exit_code = 0
except ImportError:
log(f" ⚠ img2pdf not available - saving pages as collection", file=sys.stderr)
# Just copy images to output dir
filename = title if title else f"book_{book_id_str}"
filename = "".join(c for c in filename if c.isalnum() or c in (' ', '.', '-'))[:100]
output_dir = Path(final_output_dir) / filename
i = 1
while output_dir.exists():
output_dir = Path(final_output_dir) / f"{filename}({i})"
i += 1
shutil.copytree(temp_dir, str(output_dir))
log(f" ✓ Successfully borrowed and saved to: {output_dir}", flush=True)
downloaded_files.append(str(output_dir))
# Emit result for downstream cmdlets
# Build tags including ISBN if available
emit_tags = ['book', 'borrowed', 'pages']
isbn_tag = url.get('isbn')
if isbn_tag:
emit_tags.append(f'isbn:{isbn_tag}')
olid_tag = url.get('olid')
if olid_tag:
emit_tags.append(f'olid:{olid_tag}')
# Fetch OpenLibrary metadata tags
ol_tags = fetch_openlibrary_metadata_tags(isbn=isbn_tag, olid=olid_tag)
emit_tags.extend(ol_tags)
pipe_obj = create_pipe_object_result(
source='archive.org',
identifier=book_id_str,
file_path=str(output_dir),
cmdlet_name='download-data',
title=title_val,
tags=emit_tags,
source_url=url.get('source_url', f'archive.org/borrow/{book_id_str}')
)
pipeline_context.emit(pipe_obj)
exit_code = 0
except Exception as e:
log(f" ✗ Borrow/download failed: {e}", file=sys.stderr)
import traceback
traceback.print_exc()
exit_code = 1
continue # Skip normal URL handling
except ImportError as e:
log(f" ✗ Archive.org tools not available: {e}", file=sys.stderr)
exit_code = 1
continue
except Exception as e:
log(f" ✗ Auto-borrow error: {e}", file=sys.stderr)
import traceback
traceback.print_exc()
exit_code = 1
continue
# ====== LIBGEN MIRROR FALLBACK MODE ======
# Handle libgen results with mirrors dict for fallback on failure
if isinstance(url, dict) and 'mirrors' in url:
try:
primary_url = url.get('url')
mirrors_dict = url.get('mirrors', {})
book_id = url.get('book_id', '')
if not primary_url:
log(f"Skipping libgen entry: no primary URL", file=sys.stderr)
exit_code = 1
continue
# Build list of mirrors to try: primary first, then alternatives
mirrors_to_try = [primary_url]
mirrors_to_try.extend(mirrors_dict.values())
# Remove duplicates while preserving order
mirrors_to_try = list(dict.fromkeys(mirrors_to_try))
log(f"🔄 LibGen download with mirror fallback (book_id: {book_id})", flush=True)
log(f" Primary: {primary_url[:80]}...", flush=True)
if len(mirrors_to_try) > 1:
log(f" {len(mirrors_to_try) - 1} alternative mirror(s) available", flush=True)
# Resolve cookies path
final_cookies_path_libgen = None
if cookies_path:
if resolve_cookies_path:
try:
final_cookies_path_libgen = resolve_cookies_path(config, Path(cookies_path))
except Exception:
final_cookies_path_libgen = Path(cookies_path).expanduser() if cookies_path else None
else:
final_cookies_path_libgen = Path(cookies_path).expanduser()
download_succeeded = False
last_error = None
successful_mirror = None
# Try each mirror in sequence using libgen_service's native download
for mirror_idx, mirror_url in enumerate(mirrors_to_try, 1):
try:
if mirror_idx > 1:
log(f" → Trying mirror #{mirror_idx}: {mirror_url[:80]}...", flush=True)
# Use libgen_service's download_from_mirror for proper libgen handling
from helper.libgen_service import download_from_mirror
# Generate filename from book_id and title
safe_title = "".join(c for c in str(title or "book") if c.isalnum() or c in (' ', '.', '-'))[:100]
file_path = final_output_dir / f"{safe_title}_{book_id}.pdf"
# Attempt download using libgen's native function
success = download_from_mirror(
mirror_url=mirror_url,
output_path=file_path,
log_info=lambda msg: log(f" {msg}", flush=True),
log_error=lambda msg: log(f"{msg}", file=sys.stderr)
)
if success and file_path.exists():
log(f" ✓ Downloaded successfully from mirror #{mirror_idx}", flush=True)
successful_mirror = mirror_url
download_succeeded = True
# Emit result for downstream cmdlets
file_hash = _compute_file_hash(file_path)
emit_tags = ['libgen', 'book']
pipe_obj = create_pipe_object_result(
source='libgen',
identifier=book_id,
file_path=str(file_path),
cmdlet_name='download-data',
file_hash=file_hash,
tags=emit_tags,
source_url=successful_mirror
)
pipeline_context.emit(pipe_obj)
downloaded_files.append(str(file_path))
exit_code = 0
break # Success, stop trying mirrors
except Exception as e:
last_error = str(e)
if mirror_idx == 1:
log(f" ⚠ Primary mirror failed: {e}", flush=True)
else:
log(f" ⚠ Mirror #{mirror_idx} failed: {e}", flush=True)
if not download_succeeded:
log(f" ✗ All mirrors failed. Last error: {last_error}", file=sys.stderr)
if "getaddrinfo failed" in str(last_error) or "NameResolutionError" in str(last_error) or "Failed to resolve" in str(last_error):
log(f" ⚠ Network issue detected: Cannot resolve LibGen mirror hostnames", file=sys.stderr)
log(f" ▶ Check your network connection or try with a VPN/proxy", file=sys.stderr)
exit_code = 1
continue # Skip to next URL
except Exception as e:
log(f" ✗ LibGen mirror fallback error: {e}", file=sys.stderr)
import traceback
traceback.print_exc(file=sys.stderr)
exit_code = 1
continue
# Ensure URL is a string for normal handling
if not isinstance(url, str):
# Check if it's a playlist item marker
if isinstance(url, dict) and url.get('__playlist_url'):
playlist_url = url.get('__playlist_url')
item_num = url.get('__playlist_item', 1)
log(f"📍 Handling selected playlist item #{item_num}", flush=True)
# Convert to actual URL and set playlist_items to download only this item
url = playlist_url
playlist_items = str(item_num)
# Fall through to normal handling below
else:
log(f"Skipping invalid URL entry: {url}", file=sys.stderr)
continue
log(f"Probing URL: {url}", flush=True)
# ====== TORRENT MODE - INTERCEPT BEFORE NORMAL DOWNLOAD ======
if torrent_mode or url.lower().startswith('magnet:'):
log(f"🧲 Torrent/magnet mode - spawning background worker...", flush=True)
try:
# Get API key from config
from config import get_debrid_api_key
api_key = get_debrid_api_key(config)
if not api_key:
log(f"✗ AllDebrid API key not found in config", file=sys.stderr)
exit_code = 1
continue
# Create a unique worker ID
worker_id = f"torrent_{uuid.uuid4().hex[:8]}"
# Get worker manager if available from config
worker_manager = config.get('_worker_manager')
# Create worker in manager if available
if worker_manager:
try:
worker_manager.track_worker(
worker_id,
worker_type="download_torrent",
title=f"Download: {url[:60]}...",
description=f"Torrent/magnet download via AllDebrid",
pipe=pipeline_context.get_current_command_text()
)
log(f"✓ Worker created (ID: {worker_id})", flush=True)
except Exception as e:
log(f"⚠ Failed to create worker: {e}", file=sys.stderr)
worker_manager = None
# Spawn background thread to handle the download
worker_thread = threading.Thread(
target=_download_torrent_worker,
args=(
worker_id,
url,
final_output_dir,
config,
api_key,
playlist_items,
audio_mode,
wait_timeout,
worker_manager,
),
daemon=False,
name=f"TorrentWorker_{worker_id}"
)
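# daemon=False: the interpreter waits for this thread at exit, so the AllDebrid download can finish even after the cmdlet returns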
worker_thread.start()
log(f"✓ Background worker started (ID: {worker_id})", flush=True)
# Emit worker info so user can track it
worker_info = {
'worker_id': worker_id,
'worker_type': 'download_torrent',
'source_url': url,
'status': 'running',
'message': 'Downloading in background...'
}
pipeline_context.emit(worker_info)
continue
except ImportError:
log(f"✗ AllDebrid client not available", file=sys.stderr)
exit_code = 1
continue
except Exception as e:
# Catches AllDebridError and other exceptions
log(f"✗ Failed to spawn torrent worker: {e}", file=sys.stderr)
import traceback
traceback.print_exc(file=sys.stderr)
exit_code = 1
continue # Skip to next URL
# ====== NORMAL DOWNLOAD MODE (HTTP/HTTPS) ======
# First, probe the URL to detect playlists and get info
# For YouTube URLs, ignore playlists and only probe the single video
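# probe_url returns metadata only (title, extractor, and an "entries" list for playlists) without downloading; the entries drive the playlist handling below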
is_youtube_url = isinstance(url, str) and ('youtube.com' in url or 'youtu.be' in url)
probe_info = probe_url(url, no_playlist=is_youtube_url)
is_actual_playlist = False # Track if we have a real multi-item playlist
if probe_info:
log(f"✓ Probed: {probe_info.get('title', url)} ({probe_info.get('extractor', 'unknown')})")
# If it's a playlist, show the result table and skip download for now
entries = probe_info.get("entries", [])
if entries and not playlist_items:
is_actual_playlist = True # We have a real playlist with multiple items
# Playlist detected but NO selection provided
# Always show table for user to select items
log(f"📋 Found playlist with {len(entries)} items")
_show_playlist_table(url, probe_info)
log(f" Playlist displayed. To select items, use @* or @1,3,5-8 syntax after piping results")
playlists_displayed += 1
continue # Skip to next URL - don't download playlist without selection
elif entries and playlist_items:
is_actual_playlist = True # We have a real playlist with item selection
# Playlist detected WITH selection - will download below
# Expand wildcard if present
expanded_items = _expand_playlist_selection(playlist_items, len(entries))
playlist_items = expanded_items
selected_playlist_entries = _select_playlist_entries(entries, playlist_items)
log(f"📋 Found playlist with {len(entries)} items - downloading selected: {playlist_items}")
else:
log(f"Single item: {probe_info.get('title', 'Unknown')}")
# ====== FORMAT LISTING MODE ======
if list_formats_mode and isinstance(url, str) and url.startswith(('http://', 'https://')):
log(f"Fetching formats for: {url}", flush=True)
from helper.download import list_formats
from result_table import ResultTable
all_formats = list_formats(url, no_playlist=is_youtube_url, playlist_items=playlist_items)
if all_formats:
# Filter and sort formats for better user experience
formats = _filter_and_sort_formats(all_formats)
# Create result table for format display
table = ResultTable(title=f"Available Formats - {probe_info.get('title', 'Unknown')}")
for fmt in formats:
row = table.add_row()
row.add_column("Format ID", fmt.get("format_id", ""))
# Build resolution/bitrate string
vcodec = fmt.get("vcodec", "")
acodec = fmt.get("acodec", "")
height = fmt.get("height")
tbr = fmt.get("tbr")
if vcodec != "none" and acodec != "none":
# Video + audio
res_str = fmt.get("resolution", "")
elif acodec != "none" and vcodec == "none":
# Audio only - show bitrate
res_str = f"{tbr:.0f} kbps" if tbr else "audio"
else:
# Video only
res_str = fmt.get("resolution", "")
row.add_column("Resolution", res_str)
# Build codec string (merged vcodec/acodec)
codec_parts = []
if vcodec and vcodec != "none":
codec_parts.append(f"v:{vcodec}")
if acodec and acodec != "none":
codec_parts.append(f"a:{acodec}")
codec_str = " | ".join(codec_parts) if codec_parts else "unknown"
row.add_column("Codec", codec_str)
if fmt.get("filesize"):
size_mb = fmt["filesize"] / (1024 * 1024)
row.add_column("Size", f"{size_mb:.1f} MB")
# Set source command for @N expansion
table.set_source_command("download-data", [url])
# Note: Row selection args are not set - users select with @N syntax directly
# Display table and emit as pipeline result
log(str(table), flush=True)
formats_displayed = True
# Store table for @N expansion so CLI can reconstruct commands
# Uses separate current_stage_table instead of result history table
pipeline_context.set_current_stage_table(table)
# Always emit formats so they can be selected with @N
for i, fmt in enumerate(formats, 1):
pipeline_context.emit({
"format_id": fmt.get("format_id", ""),
"format_string": fmt.get("format", ""),
"resolution": fmt.get("resolution", ""),
"vcodec": fmt.get("vcodec", ""),
"acodec": fmt.get("acodec", ""),
"ext": fmt.get("ext", ""),
"filesize": fmt.get("filesize"),
"source_url": url,
"index": i,
})
log(f"Use @N syntax to select a format and download", flush=True)
else:
log(f"✗ No formats available for this URL", file=sys.stderr)
continue # Skip download, just show formats
# ====== AUTO-DETECT MULTIPLE FORMATS ======
# Check if multiple formats exist and handle based on -item flag
if (not current_format_selector and not list_formats_mode and
isinstance(url, str) and url.startswith(('http://', 'https://'))):
# Check if this is a yt-dlp supported URL (YouTube, Vimeo, etc.)
from helper.download import is_url_supported_by_ytdlp, list_formats
from result_table import ResultTable
if is_url_supported_by_ytdlp(url):
log(f"Checking available formats for: {url}", flush=True)
all_formats = list_formats(url, no_playlist=is_youtube_url, playlist_items=playlist_items)
if all_formats:
# Filter and sort formats for better user experience
formats = _filter_and_sort_formats(all_formats)
# Handle -item selection for formats (single video)
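# A bare numeric -item on a single video is treated as a 1-based index into the filtered format list,
# then cleared so it is not passed to yt-dlp as a playlist selection.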
if playlist_items and playlist_items.isdigit() and not is_actual_playlist:
idx = int(playlist_items)
if 0 < idx <= len(formats):
fmt = formats[idx-1]
current_format_selector = fmt.get("format_id")
log(f"Selected format #{idx}: {current_format_selector}")
playlist_items = None # Clear so it doesn't affect download options
else:
log(f"Invalid format index: {idx}", file=sys.stderr)
elif len(formats) > 1:
# Multiple formats available
log(f"📊 Found {len(formats)} available formats for: {probe_info.get('title', 'Unknown')}", flush=True)
# Always show table for format selection via @N syntax
# Show table and wait for @N selection
table = ResultTable(title=f"Available Formats - {probe_info.get('title', 'Unknown')}")
for fmt in formats:
row = table.add_row()
row.add_column("Format ID", fmt.get("format_id", ""))
# Build resolution/bitrate string
vcodec = fmt.get("vcodec", "")
acodec = fmt.get("acodec", "")
height = fmt.get("height")
tbr = fmt.get("tbr")
if vcodec != "none" and acodec != "none":
# Video + audio
res_str = fmt.get("resolution", "")
elif acodec != "none" and vcodec == "none":
# Audio only - show bitrate
res_str = f"{tbr:.0f} kbps" if tbr else "audio"
else:
# Video only
res_str = fmt.get("resolution", "")
row.add_column("Resolution", res_str)
# Build codec string (merged vcodec/acodec)
codec_parts = []
if vcodec and vcodec != "none":
codec_parts.append(f"v:{vcodec}")
if acodec and acodec != "none":
codec_parts.append(f"a:{acodec}")
codec_str = " | ".join(codec_parts) if codec_parts else "unknown"
row.add_column("Codec", codec_str)
if fmt.get("filesize"):
size_mb = fmt["filesize"] / (1024 * 1024)
row.add_column("Size", f"{size_mb:.1f} MB")
# Set source command for @N expansion
table.set_source_command("download-data", [url])
# Set row selection args so @N expands to "download-data URL -item N"
for i in range(len(formats)):
# i is 0-based index, but -item expects 1-based index
table.set_row_selection_args(i, ["-item", str(i + 1)])
# Display table and emit formats so they can be selected with @N
log(str(table), flush=True)
log(f"💡 Use @N syntax to select a format and download (e.g., @1)", flush=True)
# Store table for @N expansion so CLI can reconstruct commands
pipeline_context.set_current_stage_table(table)
# Emit formats as pipeline results for @N selection
for i, fmt in enumerate(formats, 1):
pipeline_context.emit({
"format_id": fmt.get("format_id", ""),
"format_string": fmt.get("format", ""),
"resolution": fmt.get("resolution", ""),
"vcodec": fmt.get("vcodec", ""),
"acodec": fmt.get("acodec", ""),
"filesize": fmt.get("filesize"),
"tbr": fmt.get("tbr"),
"source_url": url,
"index": i,
})
formats_displayed = True # Mark that we displayed formats
continue # Skip download, user must select format via @N
log(f"Downloading: {url}", flush=True)
# Resolve cookies path if specified
final_cookies_path = None
if cookies_path:
if resolve_cookies_path:
try:
final_cookies_path = resolve_cookies_path(config, Path(cookies_path))
except Exception:
final_cookies_path = Path(cookies_path).expanduser() if cookies_path else None
else:
final_cookies_path = Path(cookies_path).expanduser()
# Create download options - use correct parameter names
# Mode is "audio" or "video", required field
mode = "audio" if audio_mode else "video"
# Detect YouTube URLs and set no_playlist to download only the single video
is_youtube_url = isinstance(url, str) and ('youtube.com' in url or 'youtu.be' in url)
download_opts = DownloadOptions(
url=url,
mode=mode,
output_dir=final_output_dir,
cookies_path=final_cookies_path,
ytdl_format=current_format_selector, # Use per-URL format override if available
clip_sections=f"{clip_range[0]}-{clip_range[1]}" if clip_range else None,
playlist_items=playlist_items,
no_playlist=is_youtube_url, # For YouTube, ignore playlist URLs and download single video
)
# For playlist downloads, capture existing files BEFORE download
if playlist_items and selected_playlist_entries:
_, playlist_existing_paths = _snapshot_playlist_paths(selected_playlist_entries, final_output_dir)
# Call download_media from helper - no show_progress param
result_data = download_media(download_opts)
if result_data and result_data.path:
file_path = result_data.path
if file_path.exists():
# Check if this was a playlist download (is_actual_playlist tracks if we have a multi-item playlist)
if is_actual_playlist:
if not selected_playlist_entries:
log(
"⚠ Playlist metadata unavailable; cannot emit selected items for this stage.",
file=sys.stderr,
)
exit_code = 1
continue
matched_after, _ = _snapshot_playlist_paths(selected_playlist_entries, final_output_dir)
if not matched_after:
log(
"⚠ No playlist files found for the selected items after download.",
file=sys.stderr,
)
exit_code = 1
continue
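# Diff against the pre-download snapshot: only paths that did not exist before this run count as new;
# if nothing new appeared (everything was already cached), fall back to emitting the matched files as-is.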
new_playlist_files: list[Path] = []
for playlist_file in matched_after:
try:
path_key = str(playlist_file.resolve())
except OSError:
path_key = str(playlist_file)
if path_key not in playlist_existing_paths:
new_playlist_files.append(playlist_file)
emit_targets = new_playlist_files if new_playlist_files else matched_after
if new_playlist_files:
log(f"📋 Playlist download completed: {len(new_playlist_files)} new file(s)")
else:
log(f"📁 Reusing {len(emit_targets)} cached playlist file(s)", flush=True)
for playlist_file in emit_targets:
file_hash = _compute_file_hash(playlist_file)
tags = []
if extract_ytdlp_tags and result_data.tags:
tags = result_data.tags
pipe_obj = create_pipe_object_result(
source='download',
identifier=playlist_file.stem,
file_path=str(playlist_file),
cmdlet_name='download-data',
title=playlist_file.name,
file_hash=file_hash,
is_temp=False,
extra={
'url': url,
'tags': tags,
'audio_mode': audio_mode,
'format': format_selector,
'from_playlist': True,
},
)
downloaded_files.append(playlist_file)
pipeline_context.emit(pipe_obj)
else:
# Single file download
file_hash = result_data.hash_value or _compute_file_hash(file_path)
tags = result_data.tags if result_data.tags else []
pipe_obj = create_pipe_object_result(
source='download',
identifier=file_path.stem,
file_path=str(file_path),
cmdlet_name='download-data',
title=file_path.name,
file_hash=file_hash,
is_temp=False,
extra={
'url': url,
'tags': tags,
'audio_mode': audio_mode,
'format': format_selector,
'clipped': clip_range is not None,
}
)
downloaded_files.append(file_path)
pipeline_context.emit(pipe_obj)
log(f"✓ Downloaded: {file_path}", flush=True)
else:
log(f"Download returned no result for {url}", file=sys.stderr)
exit_code = 1
except Exception as e:
log(f"Error downloading {url}: {e}", file=sys.stderr)
import traceback
traceback.print_exc(file=sys.stderr)
exit_code = 1
# Success if we downloaded files or displayed playlists/formats
if downloaded_files or files_downloaded_directly > 0:
total_files = len(downloaded_files) + files_downloaded_directly
log(f"✓ Successfully downloaded {total_files} file(s)", flush=True)
# Create a result table for the downloaded files
# This ensures that subsequent @N commands select from these files
# instead of trying to expand the previous command (e.g. search-file)
if downloaded_files:
from result_table import ResultTable
table = ResultTable("Downloaded Files")
for i, file_path in enumerate(downloaded_files):
row = table.add_row()
row.add_column("#", str(i + 1))
row.add_column("File", file_path.name)
row.add_column("Path", str(file_path))
try:
size_mb = file_path.stat().st_size / (1024*1024)
row.add_column("Size", f"{size_mb:.1f} MB")
except OSError:
row.add_column("Size", "?")
# Set selection args to just the file path (or index if we want item selection)
# For item selection fallback, we don't strictly need row args if source command is None
# But setting them helps if we want to support command expansion later
table.set_row_selection_args(i, [str(file_path)])
# Register the table but DO NOT set a source command
# This forces CLI to use item-based selection (filtering the pipe)
# instead of command expansion
pipeline_context.set_last_result_table_overlay(table, downloaded_files)
pipeline_context.set_current_stage_table(table)
# Also print the table so user sees what they got
log(str(table), flush=True)
if db:
db.update_worker_status(worker_id, 'completed')
db.close()
return 0
if playlists_displayed:
log(f"✓ Displayed {playlists_displayed} playlist(s) for selection", flush=True)
if db:
db.update_worker_status(worker_id, 'completed')
db.close()
return 0 # Success - playlists shown
if formats_displayed:
log(f"✓ Format selection table displayed - use @N to select and download", flush=True)
if db:
db.update_worker_status(worker_id, 'completed')
db.close()
return 0 # Success - formats shown
log(f"No files were downloaded or playlists displayed", file=sys.stderr)
if db:
db.update_worker_status(worker_id, 'completed')
db.close()
return 1
CMDLET = Cmdlet(
name="download-data",
exec=_run,
summary="Download data from URLs with playlist/clip support using yt-dlp",
usage="download-data <url> [options] or search-file | download-data [options]",
aliases=["download", "dl"],
args=[
CmdletArg(
name="url",
type="string",
required=False,
description="URL to download (HTTP/HTTPS or file with URL list)",
variadic=True
),
CmdletArg(
name="-url",
type="string",
description="URL to download (alias for positional argument)",
variadic=True
),
CmdletArg(
name="list-formats",
type="flag",
description="List available formats without downloading"
),
CmdletArg(
name="audio",
type="flag",
alias="a",
description="Download audio only (extract from video)"
),
CmdletArg(
name="video",
type="flag",
alias="v",
description="Download video (default if not specified)"
),
CmdletArg(
name="format",
type="string",
alias="fmt",
description="Explicit yt-dlp format selector (e.g., 'bestvideo+bestaudio')"
),
CmdletArg(
name="clip",
type="string",
description="Extract time range: MM:SS-MM:SS (e.g., 34:03-35:08) or seconds"
),
CmdletArg(
name="cookies",
type="string",
description="Path to cookies.txt file for authentication"
),
CmdletArg(
name="torrent",
type="flag",
description="Download torrent/magnet via AllDebrid (requires API key in config)"
),
CmdletArg(
name="wait",
type="float",
description="Wait time (seconds) for magnet processing timeout"
),
CmdletArg(
name="item",
type="string",
alias="items",
description="Item selection for playlists/formats: use '-item N' to select format N, or '-item' to show table for @N selection in next command"
),
SharedArgs.STORAGE, # Storage location: local, hydrus, 0x0, debrid, ftp
],
details=[
"Download media from URLs with advanced features.",
"",
"BASIC USAGE:",
" download-data https://youtube.com/watch?v=xyz",
" download-data https://example.com/file.pdf -storage local",
"",
"AUDIO/VIDEO OPTIONS:",
" -audio, -a Extract audio from video (M4A, MP3)",
" -video, -v Download as video (default)",
"",
"FORMAT SELECTION:",
" -format SELECTOR Specify yt-dlp format",
" Examples: 'best', 'bestvideo+bestaudio', '22'",
"",
"FORMAT/RESULT ITEM SELECTION:",
" -item Show available formats in table (see @N below)",
" -item N Auto-select and download format #N (e.g., -item 1)",
" Example: download-data URL -item 2 | add-file -storage local",
"",
"FORMAT SELECTION WITH @N SYNTAX:",
" 1. Show formats: download-data URL",
" 2. Select with @N: @1 | download-data | add-file",
" OR use -item N to skip manual selection",
"",
"CLIPPING:",
" -clip START-END Extract time range from media",
" Format: MM:SS-MM:SS (e.g., 34:03-35:08)",
" Also accepts: 2043-2108 (seconds)",
"",
"PLAYLIST MODE:",
" Automatically detects playlists",
" Shows numbered list of tracks",
" Download specific items: -item '1,3,5-8'",
" Download all items: -item '*'",
"",
"TORRENT MODE:",
" Download torrents/magnets via AllDebrid (if configured)",
" Usage: download-data -torrent magnet:?xt=urn:btih:... -item '1,3,5-8'",
" -wait SECONDS Maximum wait time for magnet processing (default: 1800)",
"",
"STORAGE LOCATIONS:",
" -storage local ~/Videos (default)",
" -storage hydrus ~/.hydrus/client_files",
" -storage 0x0 ~/Screenshots",
" -storage debrid ~/Debrid",
" -storage ftp ~/FTP",
"",
"EXAMPLES:",
" # Download YouTube video as audio",
" download-data https://youtube.com/watch?v=xyz -audio -storage local",
"",
" # Extract specific clip from video",
" download-data https://vimeo.com/123456 -clip 1:30-2:45 -format best",
"",
" # Download specific tracks from playlist",
" download-data https://youtube.com/playlist?list=xyz -item '1,3,5-8'",
"",
" # Download all items from playlist",
" download-data https://youtube.com/playlist?list=xyz -item '*'",
"",
" # Download with authentication",
" download-data https://example.com/content -cookies ~/cookies.txt",
"",
"TORRENT EXAMPLES:",
" # Download specific tracks from magnet link",
" download-data -torrent magnet:?xt=urn:btih:... -item '1,3,5-8' -storage local",
"",
" # Download all items from torrent and merge",
" download-data -torrent magnet:?xt=urn:btih:... -item '*' | merge-file | add-file",
"",
" # Download with custom wait time (5 minutes)",
" download-data -torrent magnet:?xt=urn:btih:... -wait 300 -item '1-5'",
]
)