"""Download data from URLs using yt-dlp with playlist, clipping, and format selection.
This is a merged implementation combining:
- cmdlets/download_data.py (pipeline wrapper)
- funact/download_data.py (feature-rich implementation)
- helper/download.py (low-level machinery)
Features:
- Direct file downloads and yt-dlp streaming sites
- Playlist detection with interactive track selection
- Clip extraction (time ranges like 34:03-35:08)
- Format selection and audio/video toggles
- Cookies file support
- Tag extraction and metadata integration
- Progress tracking and debug logging
- Pipeline integration with result emission
- Background torrent/magnet downloads via AllDebrid
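
Example usage (illustrative; option names are taken from the argument parsing below):
    download-data <url> -audio
    download-data <url> -clip 34:03-35:08 -storage local
    search-file -provider libgen "isbn:<ISBN>" | @1 | download-data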
"""
from __future__ import annotations
import hashlib
import re
import sys
import threading
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple
import uuid
from helper.logger import log, debug
from helper.download import download_media, probe_url, is_url_supported_by_ytdlp
from helper.utils import sha256_file
from models import DownloadOptions
from . import register
from ._shared import Cmdlet, CmdletArg, SharedArgs, create_pipe_object_result, normalize_result_input, parse_cmdlet_args
import models
import pipeline as pipeline_context
from config import resolve_output_dir
from metadata import (
fetch_openlibrary_metadata_tags,
format_playlist_entry,
extract_ytdlp_tags,
build_book_tags,
)
# ============================================================================
# Try to import optional dependencies
# ============================================================================
try:
from yt_dlp.utils import sanitize_filename as ytdlp_sanitize_filename # type: ignore
except Exception: # pragma: no cover - optional dependency
ytdlp_sanitize_filename = None
# ============================================================================
# Background Worker for AllDebrid Downloads
# ============================================================================
def _download_torrent_worker(
worker_id: str,
magnet_url: str,
output_dir: Path,
config: Dict[str, Any],
api_key: str,
playlist_items: Optional[str] = None,
audio_mode: bool = False,
wait_timeout: int = 600,
worker_manager: Optional[Any] = None,
) -> None:
"""Background worker to download torrent/magnet via AllDebrid.
Runs in a separate thread and updates worker_manager with progress.
Args:
worker_id: Unique ID for this worker task
magnet_url: Magnet link or .torrent URL to download
output_dir: Directory to save downloaded files
config: Configuration dict
api_key: AllDebrid API key
playlist_items: Optional file selection (e.g., "1,3,5-8")
audio_mode: Whether to tag as audio or video
wait_timeout: Timeout in seconds for magnet processing
worker_manager: WorkerManager instance for progress updates
"""
worker = None
downloaded_files = []
try:
from helper.alldebrid import AllDebridClient
# Get worker reference if manager provided
if worker_manager:
try:
workers = worker_manager.get_active_workers()
worker = next((w for w in workers if w.get('id') == worker_id), None)
except:
worker = None
def log_progress(message: str) -> None:
"""Log progress to both console and worker manager."""
debug(message)
if worker_manager and worker_id:
try:
worker_manager.log_step(worker_id, message)
except:
pass
log_progress(f"[Worker {worker_id}] Submitting magnet to AllDebrid...")
client = AllDebridClient(api_key)
# Add magnet
magnet_info = client.magnet_add(magnet_url)
magnet_id = int(magnet_info.get('id', 0))
if magnet_id <= 0:
log_progress(f"[Worker {worker_id}] ✗ Failed to add magnet to AllDebrid")
if worker_manager:
try:
worker_manager.finish_worker(worker_id, "failed", f"Failed to add magnet")
except:
pass
return
log_progress(f"[Worker {worker_id}] ✓ Magnet added (ID: {magnet_id})")
# Poll for ready status
elapsed = 0
last_status_reported = 0
while elapsed < wait_timeout:
try:
status_info = client.magnet_status(magnet_id)
except Exception as e:
log_progress(f"[Worker {worker_id}] ⚠ Failed to get status: {e}")
time.sleep(2)
elapsed += 2
continue
status_code = status_info.get('statusCode', -1)
status_text = status_info.get('status', 'Unknown')
# Report progress every 5 seconds (avoid log spam)
if elapsed - last_status_reported >= 5 or elapsed < 2:
downloaded = status_info.get('downloaded', 0)
total_size = status_info.get('size', 0)
seeders = status_info.get('seeders', 0)
speed = status_info.get('downloadSpeed', 0)
if total_size > 0:
percent = (downloaded / total_size) * 100
speed_str = f" @ {speed / (1024**2):.1f} MB/s" if speed > 0 else ""
seeders_str = f" ({seeders} seeders)" if seeders > 0 else ""
progress_msg = f"[Worker {worker_id}] ⧗ {status_text}: {percent:.1f}% ({downloaded / (1024**3):.2f} / {total_size / (1024**3):.2f} GB){speed_str}{seeders_str}"
log_progress(progress_msg)
# Update worker with progress
if worker_manager:
try:
worker_manager.update_worker(
worker_id,
status="running",
progress=f"{percent:.1f}%",
details=progress_msg
)
except:
pass
else:
log_progress(f"[Worker {worker_id}] ⧗ {status_text}...")
last_status_reported = elapsed
if status_code == 4: # Ready
log_progress(f"[Worker {worker_id}] ✓ Files ready")
break
elif status_code >= 5: # Error
error_status = {
5: "Upload failed",
6: "Internal error during unpacking",
7: "Not downloaded in 20 minutes",
8: "File too big (>1TB)",
9: "Internal error",
10: "Download took >72 hours",
11: "Deleted on hoster website",
12: "Processing failed",
13: "Processing failed",
14: "Tracker error",
15: "No peers available"
}
error_msg = error_status.get(status_code, f"Unknown error {status_code}")
log_progress(f"[Worker {worker_id}] ✗ Magnet failed: {error_msg}")
if worker_manager:
try:
worker_manager.finish_worker(worker_id, "failed", error_msg)
except:
pass
return
time.sleep(2)
elapsed += 2
if elapsed >= wait_timeout:
log_progress(f"[Worker {worker_id}] ✗ Timeout waiting for magnet (>{wait_timeout}s)")
if worker_manager:
try:
worker_manager.finish_worker(worker_id, "failed", f"Timeout after {wait_timeout}s")
except:
pass
return
# Get files
files_result = client.magnet_links([magnet_id])
magnet_files = files_result.get(str(magnet_id), {})
if not magnet_files and isinstance(magnet_id, int):
# Try integer key as fallback
for key in files_result:
if str(key) == str(magnet_id):
magnet_files = files_result[key]
break
files_array = magnet_files.get('files', [])
if not files_array:
log_progress(f"[Worker {worker_id}] ✗ No files found in magnet")
if worker_manager:
try:
worker_manager.finish_worker(worker_id, "failed", "No files found in magnet")
except:
pass
return
log_progress(f"[Worker {worker_id}] ✓ Found {len(files_array)} file(s)")
# Extract download links
download_links = []
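        # AllDebrid file-tree entries use short keys, roughly:
        #   {"n": <name>, "s": <size in bytes>, "l": <download link>, "e": [<nested entries>]}
        # Leaf files carry "l"; folders typically carry only "e" (children).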
def extract_links(items, prefix=""):
if not isinstance(items, list):
return
for item in items:
if isinstance(item, dict):
name = item.get('n', '')
link = item.get('l', '')
size = item.get('s', 0)
entries = item.get('e', [])
if link:
download_links.append({
'link': link,
'name': name,
'size': size,
'path': f"{prefix}/{name}" if prefix else name
})
if entries:
extract_links(entries, f"{prefix}/{name}" if prefix else name)
extract_links(files_array)
if not download_links:
log_progress(f"[Worker {worker_id}] ✗ No downloadable files found")
if worker_manager:
try:
worker_manager.finish_worker(worker_id, "failed", "No downloadable files")
except:
pass
return
# Filter by playlist_items if specified
if playlist_items and playlist_items != '*':
# Parse selection like "1,3,5-8"
selected_indices = []
for part in playlist_items.split(','):
part = part.strip()
if '-' in part:
start, end = part.split('-')
selected_indices.extend(range(int(start)-1, int(end)))
else:
selected_indices.append(int(part)-1)
download_links = [download_links[i] for i in selected_indices if i < len(download_links)]
log_progress(f"[Worker {worker_id}] Downloading {len(download_links)} selected file(s)")
# Download each file
for idx, file_info in enumerate(download_links, 1):
link = file_info['link']
name = file_info['name']
log_progress(f"[Worker {worker_id}] ({idx}/{len(download_links)}) Downloading: {name}")
try:
# Unlock the link
try:
actual_link = client.unlock_link(link)
if actual_link and actual_link != link:
link = actual_link
except:
pass
# Download via HTTP
from helper.http_client import HTTPClient
output_dir.mkdir(parents=True, exist_ok=True)
file_path = output_dir / name
file_path.parent.mkdir(parents=True, exist_ok=True)
with HTTPClient() as http_client:
http_client.download(link, str(file_path))
log_progress(f"[Worker {worker_id}] ✓ Downloaded: {name}")
# Compute hash and emit result
file_hash = _compute_file_hash(file_path)
result_obj = {
'file_path': str(file_path),
'source_url': magnet_url,
'file_hash': file_hash,
'media_kind': 'audio' if audio_mode else 'video',
}
pipeline_context.emit(result_obj)
downloaded_files.append(file_path)
except Exception as e:
log_progress(f"[Worker {worker_id}] ⚠ Failed to download {name}: {e}")
if downloaded_files:
msg = f"✓ Torrent download complete ({len(downloaded_files)} file(s))"
log_progress(f"[Worker {worker_id}] {msg}")
if worker_manager:
try:
worker_manager.finish_worker(worker_id, "success", msg)
except:
pass
else:
if worker_manager:
try:
worker_manager.finish_worker(worker_id, "failed", "No files downloaded")
except:
pass
    except ImportError:
        # The AllDebrid import itself may fail before log_progress is defined,
        # so log directly here instead of via log_progress.
        log(f"[Worker {worker_id}] ✗ AllDebrid client not available", file=sys.stderr)
        if worker_manager:
            try:
                worker_manager.finish_worker(worker_id, "failed", "AllDebrid client not available")
            except Exception:
                pass
except Exception as e:
import traceback
log_progress(f"[Worker {worker_id}] ✗ Torrent download failed: {e}")
if worker_manager:
try:
worker_manager.finish_worker(worker_id, "failed", str(e))
except:
pass
traceback.print_exc(file=sys.stderr)
# ============================================================================
# CMDLET Metadata Declaration
# ============================================================================
# ============================================================================
# Torrent File Parsing
# ============================================================================
def _parse_torrent_file(file_path: str) -> Optional[str]:
"""Parse a .torrent file and extract magnet link.
Args:
file_path: Path to .torrent file
Returns:
Magnet link string or None if parsing fails
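    Example:
        A torrent named "example" yields roughly
        "magnet:?xt=urn:btih:<40-char sha1 hex>&dn=example&tr=<tracker>&tr=..."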
"""
try:
import bencode3
except ImportError:
log("⚠ bencode3 module not found. Install: pip install bencode3", file=sys.stderr)
return None
try:
with open(file_path, 'rb') as f:
torrent_data = bencode3.bdecode(f.read())
except Exception as e:
log(f"✗ Failed to parse torrent file: {e}", file=sys.stderr)
return None
try:
# Get info dict - bencode3 returns string keys, not bytes
info = torrent_data.get('info')
if not info:
log("✗ No info dict in torrent file", file=sys.stderr)
return None
# Calculate info hash (SHA1 of bencoded info dict)
import hashlib
info_hash = hashlib.sha1(bencode3.bencode(info)).hexdigest()
# Get name
name = info.get('name', 'Unknown')
if isinstance(name, bytes):
name = name.decode('utf-8', errors='ignore')
# Create magnet link
magnet = f"magnet:?xt=urn:btih:{info_hash}&dn={name}"
# Add trackers if available
announce = torrent_data.get('announce')
if announce:
try:
tracker = announce if isinstance(announce, str) else announce.decode('utf-8', errors='ignore')
magnet += f"&tr={tracker}"
except:
pass
announce_list = torrent_data.get('announce-list', [])
for tier in announce_list:
if isinstance(tier, list):
for tracker_item in tier:
try:
tracker = tracker_item if isinstance(tracker_item, str) else tracker_item.decode('utf-8', errors='ignore')
if tracker:
magnet += f"&tr={tracker}"
except:
pass
debug(f"✓ Parsed torrent: {name} (hash: {info_hash})")
return magnet
except Exception as e:
log(f"✗ Error parsing torrent metadata: {e}", file=sys.stderr)
return None
def _download_torrent_file(url: str, temp_dir: Optional[Path] = None) -> Optional[str]:
"""Download a .torrent file from URL and parse it.
Args:
url: URL to .torrent file
temp_dir: Optional temp directory for storing downloaded file
Returns:
Magnet link string or None if download/parsing fails
"""
try:
from helper.http_client import HTTPClient
except ImportError:
log("⚠ HTTPClient not available", file=sys.stderr)
return None
try:
# Download torrent file
debug(f"⇓ Downloading torrent file: {url}")
with HTTPClient(timeout=30.0) as client:
response = client.get(url)
response.raise_for_status()
torrent_data = response.content
# Create temp file
if temp_dir is None:
temp_dir = Path.home() / ".cache" / "downlow"
temp_dir.mkdir(parents=True, exist_ok=True)
# Save to temp file
import hashlib
url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
temp_file = temp_dir / f"torrent_{url_hash}.torrent"
temp_file.write_bytes(torrent_data)
debug(f"✓ Downloaded torrent file: {temp_file}")
# Parse it
magnet = _parse_torrent_file(str(temp_file))
# Clean up
try:
temp_file.unlink()
except:
pass
return magnet
except Exception as e:
log(f"✗ Failed to download/parse torrent: {e}", file=sys.stderr)
return None
def _is_torrent_file_or_url(arg: str) -> bool:
"""Check if argument is a .torrent file path or URL.
Args:
arg: Argument to check
Returns:
True if it's a .torrent file or URL
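    Example:
        "downloads/linux.torrent" (existing file)  -> True
        "https://example.org/file.torrent"         -> True
        "https://example.org/page.html"            -> False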
"""
arg_lower = arg.lower()
# Check if it's a .torrent file path
if arg_lower.endswith('.torrent'):
return Path(arg).exists() or arg_lower.startswith('http')
# Check if it's a URL to .torrent file
if arg_lower.startswith('http://') or arg_lower.startswith('https://'):
return '.torrent' in arg_lower
return False
def _process_torrent_input(arg: str) -> Optional[str]:
"""Process torrent file or URL and convert to magnet link.
Args:
arg: .torrent file path or URL
Returns:
Magnet link or original argument if not processable
"""
try:
if arg.lower().startswith('http://') or arg.lower().startswith('https://'):
# It's a URL
return _download_torrent_file(arg) or arg
else:
# It's a file path
if Path(arg).exists():
return _parse_torrent_file(arg) or arg
else:
return arg
except Exception as e:
log(f"⚠ Error processing torrent: {e}", file=sys.stderr)
return arg
# ============================================================================
# Helper Functions
# ============================================================================
def _show_playlist_table(url: str, probe_info: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Show playlist result table and get user selection.
Args:
url: Original URL
probe_info: Info dict from probe_url()
Returns:
Modified probe_info with selected_entries, or None if user cancelled
"""
entries = probe_info.get("entries", [])
if not entries:
return probe_info
extractor = probe_info.get("extractor", "")
title = probe_info.get("title", "Playlist")
debug(f"📋 Detected playlist: {title} ({len(entries)} items) - {extractor}")
# Skip full metadata enrichment for speed - extract_flat usually provides enough info
# debug("📋 Fetching metadata for each item...")
# entries = enrich_playlist_entries(entries, extractor)
# Emit each playlist item as a separate result row
for i, entry in enumerate(entries, 1):
formatted = format_playlist_entry(entry, i, extractor)
# Build tags from available metadata
tags = []
artist = formatted.get("artist") or formatted.get("uploader", "")
if artist:
tags.append(artist)
album = formatted.get("album", "")
if album and album != title: # Don't repeat playlist title
tags.append(album)
# Extract individual fields for separate columns
duration = formatted.get("duration", 0)
duration_str = ""
if duration:
minutes = int(duration // 60)
seconds = int(duration % 60)
duration_str = f"{minutes}m{seconds}s"
tags.append(duration_str)
# Normalize extractor for comparison (remove special chars and case)
ext_lower = extractor.lower().replace(":", "").replace(" ", "")
track_number = None
# Add site-specific tags and fields
if "youtube" in ext_lower and formatted.get("channel"):
tags.append(f"channel:{formatted.get('channel')}")
elif "bandcamp" in ext_lower:
track_number = formatted.get("track_number", i)
tags.append(f"track:{track_number}")
# Create result row with separate columns for important metadata
# Build columns dynamically based on available data
columns = [
("#", i),
("Title", formatted["title"]),
]
# Add Artist column if available
if artist:
columns.append(("Artist", artist))
# Add Duration column if available
if duration_str:
columns.append(("Duration", duration_str))
# Add Track number column for music platforms
if track_number is not None:
columns.append(("Track", str(track_number)))
# Add Tags column for remaining tags (if any)
remaining_tags = [t for t in tags if t not in [artist, duration_str]]
if remaining_tags:
columns.append(("Tags", ", ".join(remaining_tags)))
# Create result row with compact columns display
# Using "columns" field tells ResultTable which columns to show
result_row = {
"title": formatted["title"],
"tags": tags,
"index": i,
# Store all metadata but don't display in table (use columns field)
"__source": "playlist-probe",
"__id": f"{i}",
"__file_path": url,
"__action": f"playlist-item:{i}",
"__artist": formatted.get("artist", ""),
"__duration": formatted.get("duration", 0),
"__extractor": extractor,
# Define which columns should be shown in the result table
"columns": columns
}
# Add site-specific metadata for pipeline use
if "youtube" in ext_lower:
result_row["__video_id"] = formatted.get("video_id", "")
result_row["__channel"] = formatted.get("channel", "")
elif "bandcamp" in ext_lower:
result_row["__track_number"] = formatted.get("track_number", i)
result_row["__album"] = formatted.get("album") or title
elif "spotify" in ext_lower:
result_row["__artists"] = formatted.get("artists", "")
result_row["__album"] = formatted.get("album", "")
pipeline_context.emit(result_row)
debug(f" Playlist items displayed. Use result table references (@1, @2, etc.) to select tracks.")
# Return modified probe info
return probe_info
def _parse_time_range(clip_spec: str) -> Optional[Tuple[int, int]]:
"""Parse time range from MM:SS-MM:SS or seconds format.
Args:
clip_spec: Time range string like "34:03-35:08" or "2043-2108"
Returns:
Tuple of (start_seconds, end_seconds) or None if invalid
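    Example:
        _parse_time_range("34:03-35:08") -> (2043, 2108)
        _parse_time_range("90-95")       -> (90, 95)
        _parse_time_range("95-90")       -> None  (start must precede end)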
"""
try:
if '-' not in clip_spec:
return None
parts = clip_spec.split('-')
if len(parts) != 2:
return None
start_str, end_str = parts
# Try MM:SS format first
if ':' in start_str:
start_parts = start_str.split(':')
if len(start_parts) == 2:
start_sec = int(start_parts[0]) * 60 + int(start_parts[1])
else:
return None
else:
start_sec = int(start_str)
if ':' in end_str:
end_parts = end_str.split(':')
if len(end_parts) == 2:
end_sec = int(end_parts[0]) * 60 + int(end_parts[1])
else:
return None
else:
end_sec = int(end_str)
if start_sec >= end_sec:
return None
return (start_sec, end_sec)
except (ValueError, AttributeError):
return None
def _parse_section_ranges(section_spec: str) -> Optional[List[Tuple[int, int]]]:
"""Parse section ranges from comma-separated time ranges.
Args:
section_spec: Section ranges like "1:30-1:35,0:05-0:15" or "90-95,5-15"
May include quotes from CLI which will be stripped
Returns:
List of (start_seconds, end_seconds) tuples or None if invalid
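    Example:
        _parse_section_ranges("1:30-1:35,0:05-0:15") -> [(90, 95), (5, 15)]
        _parse_section_ranges("90-95")               -> [(90, 95)]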
"""
try:
# Strip quotes if present (from CLI parsing)
section_spec = section_spec.strip('"\'')
        if not section_spec or (',' not in section_spec and '-' not in section_spec):
return None
ranges = []
# Handle both comma-separated ranges and single range
if ',' in section_spec:
section_parts = section_spec.split(',')
else:
section_parts = [section_spec]
for part in section_parts:
part = part.strip()
if not part:
continue
# Parse each range using the same logic as _parse_time_range
# Handle format like "1:30-1:35" or "90-95"
if '-' not in part:
return None
            # Find the dash separating start and end; it must not be the first or
            # last character (this handles "1:30-1:35" as well as "90-95").
            dash_pos = part.find('-', 1)
            if dash_pos == -1 or dash_pos >= len(part) - 1:
                return None
start_str = part[:dash_pos]
end_str = part[dash_pos+1:]
# Parse start time
if ':' in start_str:
start_parts = start_str.split(':')
if len(start_parts) == 2:
start_sec = int(start_parts[0]) * 60 + int(start_parts[1])
elif len(start_parts) == 3:
start_sec = int(start_parts[0]) * 3600 + int(start_parts[1]) * 60 + int(start_parts[2])
else:
return None
else:
start_sec = int(start_str)
# Parse end time
if ':' in end_str:
end_parts = end_str.split(':')
if len(end_parts) == 2:
end_sec = int(end_parts[0]) * 60 + int(end_parts[1])
elif len(end_parts) == 3:
end_sec = int(end_parts[0]) * 3600 + int(end_parts[1]) * 60 + int(end_parts[2])
else:
return None
else:
end_sec = int(end_str)
if start_sec >= end_sec:
return None
ranges.append((start_sec, end_sec))
return ranges if ranges else None
except (ValueError, AttributeError, IndexError):
return None
MEDIA_EXTENSIONS = {'.mp3', '.m4a', '.mp4', '.mkv', '.webm', '.flac', '.wav', '.aac'}
def _parse_playlist_selection_indices(selection: Optional[str], total_items: int) -> list[int]:
"""Convert playlist selection string to 0-based indices."""
if total_items <= 0:
return []
if not selection or selection.strip() in {"*", ""}:
return list(range(total_items))
indices: list[int] = []
for part in selection.split(','):
part = part.strip()
if not part:
continue
if '-' in part:
bounds = part.split('-', 1)
try:
start = int(bounds[0])
end = int(bounds[1])
except ValueError:
continue
if start <= 0 or end <= 0:
continue
if start > end:
start, end = end, start
for idx in range(start - 1, end):
if 0 <= idx < total_items:
indices.append(idx)
else:
try:
idx = int(part) - 1
except ValueError:
continue
if 0 <= idx < total_items:
indices.append(idx)
seen: set[int] = set()
ordered: list[int] = []
for idx in indices:
if idx not in seen:
ordered.append(idx)
seen.add(idx)
return ordered
def _select_playlist_entries(entries: Any, selection: Optional[str]) -> list[Dict[str, Any]]:
"""Pick playlist entries according to a selection string."""
if not isinstance(entries, list):
return []
indices = _parse_playlist_selection_indices(selection, len(entries))
if not indices:
return []
selected: list[Dict[str, Any]] = []
for idx in indices:
entry = entries[idx]
if isinstance(entry, dict):
selected.append(entry)
return selected
def _sanitize_title_for_filename(title: Optional[str]) -> str:
"""Match yt-dlp's restricted filename sanitization for comparisons."""
if not title:
return ""
if ytdlp_sanitize_filename:
try:
return ytdlp_sanitize_filename(title, restricted=True)
except Exception:
pass
sanitized = re.sub(r"[^0-9A-Za-z._-]+", "_", title)
return sanitized.strip() or ""
def _find_playlist_files_from_entries(
entries: Sequence[Dict[str, Any]],
output_dir: Path,
) -> list[Path]:
"""Resolve expected playlist files based on entry titles/exts."""
matched: list[Path] = []
seen: set[str] = set()
for entry in entries:
title = entry.get('title') if isinstance(entry, dict) else None
sanitized = _sanitize_title_for_filename(title)
if not sanitized:
continue
preferred_exts: list[str] = []
for key in ('ext', 'audio_ext', 'video_ext'):
value = entry.get(key) if isinstance(entry, dict) else None
if isinstance(value, str) and value:
preferred_exts.append(value.lower())
if not preferred_exts:
preferred_exts = [ext.strip('.') for ext in MEDIA_EXTENSIONS]
candidate: Optional[Path] = None
for ext in preferred_exts:
ext = ext.lstrip('.').lower()
path = output_dir / f"{sanitized}.{ext}"
if path.exists():
candidate = path
break
if candidate is None:
try:
# Bandcamp/yt-dlp often prefixes uploader info, so fall back to a substring match.
for f in output_dir.glob(f"*{sanitized}*"):
if f.suffix.lower() in MEDIA_EXTENSIONS and f.is_file():
candidate = f
break
except OSError:
candidate = None
if candidate and str(candidate) not in seen:
matched.append(candidate)
seen.add(str(candidate))
return matched
def _snapshot_playlist_paths(
entries: Sequence[Dict[str, Any]],
output_dir: Path,
) -> tuple[list[Path], set[str]]:
"""Capture current playlist file paths for a given selection."""
matches = _find_playlist_files_from_entries(entries, output_dir)
resolved: set[str] = set()
for path in matches:
try:
resolved.add(str(path.resolve()))
except OSError:
resolved.add(str(path))
return matches, resolved
def _expand_playlist_selection(selection: str, num_items: int) -> str:
"""Expand playlist selection string, handling wildcards.
Args:
selection: Selection string like '1,3,5-8' or '*'
num_items: Total number of items in playlist
Returns:
Expanded selection string like '1,3,5,6,7,8' or '1-18' for '*'
"""
if selection.strip() == "*":
# Wildcard: select all items
return f"1-{num_items}"
# Return as-is if not wildcard (yt-dlp will handle ranges and lists)
return selection
def _parse_selection_string(selection: str) -> List[int]:
"""Parse selection string into list of integers.
Handles formats like:
- "2" -> [2]
- "1,3,5" -> [1, 3, 5]
- "1-3" -> [1, 2, 3]
- "1,3-5,7" -> [1, 3, 4, 5, 7]
Args:
selection: Selection string
Returns:
List of integer indices
"""
result = []
for part in selection.split(','):
part = part.strip()
if '-' in part:
# Range like "3-5"
try:
start, end = part.split('-')
start_num = int(start.strip())
end_num = int(end.strip())
result.extend(range(start_num, end_num + 1))
except (ValueError, AttributeError):
continue
else:
# Single number
try:
result.append(int(part))
except ValueError:
continue
return result
def _filter_and_sort_formats(formats: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Filter and sort formats for user selection.
Filters out:
- Storyboards (webp, svg formats)
- Low quality audio (below ~128 kbps, typically 48kHz audio)
- Video below 360p
Sorts to prioritize:
- @1: Best combined audio+video (highest resolution, highest bitrate)
- @2: Best audio-only (highest bitrate audio)
- Then rest by quality
Args:
formats: List of format dicts from yt-dlp
Returns:
Filtered and sorted format list
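    Example ordering (best first):
        1080p muxed -> 720p muxed -> 256 kbps audio-only -> 128 kbps audio-only -> 1080p video-only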
"""
filtered = []
for fmt in formats:
format_id = fmt.get("format_id", "")
ext = fmt.get("ext", "")
vcodec = fmt.get("vcodec", "")
acodec = fmt.get("acodec", "")
height = fmt.get("height")
tbr = fmt.get("tbr") # Total bitrate
# Skip storyboards (webp images, svg, etc.)
if ext in {"webp", "svg", "mhtml"}:
continue
# Skip video-only formats below 360p
if vcodec != "none" and acodec == "none":
if height and height < 360:
continue
# Skip low-bitrate audio (typically 48kHz, very low quality)
# Keep audio with tbr >= 64 kbps (reasonable quality threshold)
if acodec != "none" and vcodec == "none":
if tbr and tbr < 64:
continue
filtered.append(fmt)
# Sort formats: best combined first, then best audio-only, then video-only
def format_sort_key(fmt: Dict[str, Any]) -> tuple:
vcodec = fmt.get("vcodec", "")
acodec = fmt.get("acodec", "")
height = fmt.get("height", 0) or 0
tbr = fmt.get("tbr", 0) or 0
# Category 0: has both audio and video (sort first)
# Category 1: audio only (sort second)
# Category 2: video only (sort last, by height desc)
if vcodec != "none" and acodec != "none":
category = 0
return (category, -height, -tbr)
elif acodec != "none" and vcodec == "none":
category = 1
return (category, -tbr) # Sort by bitrate descending
else: # Video only
category = 2
return (category, -height, -tbr) # Sort by height descending, then bitrate
return sorted(filtered, key=format_sort_key)
def _compute_file_hash(file_path: Path) -> Optional[str]:
"""Compute SHA256 hash of file."""
try:
return sha256_file(file_path)
except Exception:
return None
# ============================================================================
# Main Cmdlet Function
# ============================================================================
def _run(result: Any, args: Sequence[str], config: Dict[str, Any], emit_results: bool = True) -> int:
"""Download data from URLs with advanced options.
Accepts:
- Single URL as string
- Result object with 'url' or 'file_path' field
- List of results
- File containing URLs (one per line)
Returns:
Exit code (0 for success, 1 for failure)
"""
debug("Starting download-data")
collected_results: List[Dict[str, Any]] = []
def _emit(obj: Any) -> None:
"""Internal helper to collect and optionally emit results."""
collected_results.append(obj)
if emit_results:
pipeline_context.emit(obj)
# Track pipeline mode once so playlist handling can respect current run scope
stage_ctx = pipeline_context.get_stage_context()
in_pipeline = stage_ctx is not None and getattr(stage_ctx, 'total_stages', 1) > 1
# ========================================================================
# ARGUMENT PARSING
# ========================================================================
# Parse arguments using shared parser
parsed = parse_cmdlet_args(args, CMDLET)
audio_mode = parsed.get("audio", False)
format_selector = parsed.get("format")
list_formats_mode = parsed.get("list-formats", False)
clip_spec = parsed.get("clip")
clip_range = None
if clip_spec:
clip_range = _parse_time_range(clip_spec)
if clip_range:
debug(f"Clip range: {clip_spec} ({clip_range[0]}-{clip_range[1]} seconds)")
else:
log(f"Invalid clip format: {clip_spec}", file=sys.stderr)
return 1
# Section download (yt-dlp only)
section_spec = parsed.get("section")
section_ranges = None
if section_spec:
# Parse section spec like "1:30-1:35,0:05-0:15" into list of (start, end) tuples
section_ranges = _parse_section_ranges(section_spec)
if section_ranges:
debug(f"Section ranges: {section_spec} ({len(section_ranges)} sections)")
# When downloading sections, auto-select best format if not specified
# Since we're only getting portions, quality matters less than completeness
if not format_selector:
format_selector = "bestvideo+bestaudio/best"
debug(f"Auto-selecting format for sections: {format_selector}")
else:
log(f"Invalid section format: {section_spec}", file=sys.stderr)
return 1
cookies_path = parsed.get("cookies")
storage_location = parsed.get("storage")
torrent_mode = parsed.get("torrent", False)
wait_timeout = float(parsed.get("wait", 1800))
# Collect URLs from positional args and -url flag
# Both map to "url" in parsed result
urls_to_download = []
raw_urls = parsed.get("url", [])
if isinstance(raw_urls, str):
raw_urls = [raw_urls]
for arg in raw_urls:
if arg.lower().startswith(('http://', 'https://')):
# Check if it's a .torrent URL or file first
if '.torrent' in arg.lower():
debug(f"Processing torrent URL: {arg}")
magnet = _process_torrent_input(arg)
if magnet and magnet.lower().startswith('magnet:'):
urls_to_download.append(magnet)
debug(f"✓ Converted to magnet: {magnet[:70]}...")
elif magnet:
urls_to_download.append(magnet)
else:
log(f"✗ Failed to process torrent: {arg}", file=sys.stderr)
else:
urls_to_download.append(arg)
elif torrent_mode and (arg.lower().startswith('magnet:') or len(arg) == 40 or len(arg) == 64):
# In torrent mode, accept magnet links or torrent hashes (40-char SHA1 or 64-char SHA256)
urls_to_download.append(arg)
debug(f"Torrent/magnet added: {arg[:50]}...")
elif _is_torrent_file_or_url(arg):
# Handle .torrent files and URLs
debug(f"Processing torrent file/URL: {arg}")
magnet = _process_torrent_input(arg)
if magnet and magnet.lower().startswith('magnet:'):
urls_to_download.append(magnet)
debug(f"✓ Converted to magnet: {magnet[:70]}...")
elif magnet:
urls_to_download.append(magnet)
else:
log(f"✗ Failed to process torrent: {arg}", file=sys.stderr)
else:
# Treat as URL if it looks like one
if arg.lower().startswith(('magnet:', 'ftp://')):
urls_to_download.append(arg)
else:
# Check if it's a file containing URLs
path = Path(arg)
if path.exists() and path.is_file():
try:
with open(arg, 'r') as f:
for line in f:
line = line.strip()
if line and line.lower().startswith(('http://', 'https://')):
urls_to_download.append(line)
debug(f"Loaded URLs from file: {arg}")
except Exception as e:
log(f"Error reading file {arg}: {e}", file=sys.stderr)
else:
debug(f"Ignored argument: {arg}")
# Item selection (for playlists/formats)
# Note: -item flag is deprecated in favor of @N pipeline selection, but kept for compatibility
playlist_items = parsed.get("item")
if playlist_items:
debug(f"Item selection: {playlist_items}")
def _is_openlibrary_downloadable(ebook_access_val: Any, status_val: Any) -> bool:
access = str(ebook_access_val or "").strip().lower()
status = str(status_val or "").strip().lower()
if status == "download":
return True
if access in {"borrowable", "public", "full", "open"} or access.startswith("full "):
return True
if "" in str(status_val or ""):
return True
return False
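    # e.g. _is_openlibrary_downloadable("borrowable", "") -> True;
    #      _is_openlibrary_downloadable("no-view", "") -> False.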
# ========================================================================
# INPUT PROCESSING - Extract URLs from pipeline or arguments
# ========================================================================
# Initialize worker tracking for downloads
import uuid
from helper.local_library import LocalLibraryDB
from config import get_local_storage_path
# Define LazyDB proxy to avoid keeping DB connection open for long duration
class LazyDB:
def __init__(self, root):
self.root = root
def _op(self, func_name, *args, **kwargs):
try:
with LocalLibraryDB(self.root) as db:
func = getattr(db, func_name)
return func(*args, **kwargs)
except Exception as e:
# Log error but don't crash
pass
def insert_worker(self, *args, **kwargs): self._op('insert_worker', *args, **kwargs)
def update_worker_status(self, *args, **kwargs): self._op('update_worker_status', *args, **kwargs)
def append_worker_stdout(self, *args, **kwargs): self._op('append_worker_stdout', *args, **kwargs)
def close(self): pass
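    # LazyDB opens a short-lived LocalLibraryDB connection per call, so long-running
    # downloads never hold the library database open between progress updates.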
worker_id = str(uuid.uuid4())
library_root = get_local_storage_path(config or {})
db = None
if library_root:
try:
db = LazyDB(library_root)
db.insert_worker(
worker_id,
"download",
title="Download Data",
description="Downloading files from search results",
pipe=pipeline_context.get_current_command_text()
)
except Exception as e:
log(f"⚠ Worker tracking unavailable: {e}", file=sys.stderr)
piped_results = normalize_result_input(result)
# Track files downloaded directly (e.g. Soulseek) to avoid "No URLs" error
files_downloaded_directly = 0
# Only process piped results if no URLs were provided in arguments
# This prevents picking up residue from previous commands when running standalone
if piped_results and not urls_to_download:
for item in piped_results:
url = None
origin = None
# ====== CHECK FOR PLAYLIST ITEM MARKER FROM add-file ======
# When add-file detects a playlist item and wants to download it
if isinstance(item, dict) and item.get('__playlist_url'):
playlist_url = item.get('__playlist_url')
item_num = item.get('__playlist_item', 1)
debug(f"📍 Playlist item from add-file: #{item_num}")
# Add to download list with marker
urls_to_download.append({
'__playlist_url': playlist_url,
'__playlist_item': int(item_num)
})
continue
# ====== CHECK FOR PLAYLIST ITEM SELECTION FIRST ======
# When user selects @12 from a playlist, item is emitted dict with __action: "playlist-item:12"
if isinstance(item, dict) and '__action' in item and item['__action'].startswith('playlist-item:'):
playlist_url = item.get('__file_path')
playlist_action = item['__action'] # e.g., "playlist-item:12"
item_num = playlist_action.split(':')[1] # Extract item number (1-based)
if playlist_url:
# Playlist item selected - need to download this specific track
debug(f"📍 Playlist item selected: #{item_num} - {item.get('title', 'Unknown')}")
# Add to download list - the playlist will be probed and item extracted
# Store with special marker so we know which item to select
urls_to_download.append({
'__playlist_url': playlist_url,
'__playlist_item': int(item_num)
})
continue
# ====== CHECK FOR FORMAT SELECTION RESULT ======
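            # A format-selection row is expected to look roughly like
            #   {"format_id": "137", "source_url": "https://...", "vcodec": "avc1...", "acodec": "none"}
            # (illustrative values; only format_id and source_url are required here,
            #  vcodec/acodec are consulted later when building the yt-dlp format string).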
if isinstance(item, dict) and item.get('format_id') is not None and item.get('source_url'):
debug(f"🎬 Format selected from pipe: {item.get('format_id')}")
debug(f" Source URL: {item.get('source_url')}")
# Store as dict so we can extract format_id + source_url during download
urls_to_download.append(item)
continue
elif hasattr(item, 'format_id') and hasattr(item, 'source_url') and item.format_id is not None:
debug(f"🎬 Format selected from pipe: {item.format_id}")
debug(f" Source URL: {item.source_url}")
urls_to_download.append({
'format_id': item.format_id,
'source_url': item.source_url,
})
continue
if isinstance(item, dict):
# Check for search provider results first
origin = item.get('origin')
if origin in {'openlibrary', 'libgen', 'soulseek', 'debrid'}:
# Handle search provider results
title = item.get('title', 'Item')
if origin == 'openlibrary':
# OpenLibrary: First check if lendable/downloadable via Archive.org
# Only route to LibGen if NOT available on Archive.org
metadata = item.get('full_metadata', {}) if isinstance(item.get('full_metadata'), dict) else {}
isbn = metadata.get('isbn') or item.get('isbn')
olid = metadata.get('olid') or item.get('olid')
debug(f"[search-result] OpenLibrary: '{title}'")
if isbn:
debug(f" ISBN: {isbn}")
# Check if book is borrowable from ebook_access field or status
ebook_access = metadata.get('ebook_access') or item.get('ebook_access', '')
status_text = metadata.get('status') or item.get('status', '')
archive_id = metadata.get('archive_id') or item.get('archive_id')
# Determine if borrowable based on new status vocabulary
is_borrowable = _is_openlibrary_downloadable(ebook_access, status_text)
if is_borrowable:
debug(f" ✓ Available for borrowing on Archive.org")
debug(f" → Queued for auto-borrowing...")
# Queue borrow request as special dict object
# We need OCAID (Archive.org ID), not just numeric OLID
ocaid = archive_id
if not ocaid and isbn:
# If no OCAID in metadata, fetch it from OpenLibrary ISBN lookup
try:
import requests
ol_url = f'https://openlibrary.org/isbn/{isbn}.json'
r = requests.get(ol_url, timeout=5)
if r.status_code == 200:
ol_data = r.json()
ocaid = ol_data.get('ocaid')
except Exception as e:
debug(f" ⚠ Could not fetch OCAID from OpenLibrary: {e}")
if ocaid:
urls_to_download.append({
'__borrow_request__': True,
'book_id': ocaid,
'isbn': isbn,
'title': title,
'olid': olid
})
else:
# OCAID not found - book claims borrowable but not on Archive.org
# Fall back to LibGen search instead
debug(f" ⚠ Book marked borrowable but not found on Archive.org")
if isbn:
try:
from helper.search_provider import get_provider
libgen_provider = get_provider("libgen", config)
if libgen_provider:
libgen_results = libgen_provider.search(f"isbn:{isbn}", limit=1)
if libgen_results:
libgen_result = libgen_results[0]
url = libgen_result.get('target') if isinstance(libgen_result, dict) else getattr(libgen_result, 'target', None)
if url:
urls_to_download.append(url)
debug(f" ✓ Found on LibGen instead")
else:
debug(f" ⚠ Not found on LibGen")
else:
debug(f" ⚠ Not found on LibGen")
else:
debug(f" ⚠ LibGen provider not available")
except Exception as e:
debug(f" ✗ Error searching LibGen: {e}")
else:
# Book is NOT borrowable - route to LibGen
if isbn:
debug(f" ⚠ Not available on Archive.org - attempting LibGen...")
try:
from helper.search_provider import get_provider
libgen_provider = get_provider("libgen", config)
if libgen_provider:
libgen_results = libgen_provider.search(f"isbn:{isbn}", limit=1)
if libgen_results:
libgen_result = libgen_results[0]
url = libgen_result.get('target') if isinstance(libgen_result, dict) else getattr(libgen_result, 'target', None)
if url:
urls_to_download.append(url)
debug(f" ✓ Found on LibGen")
else:
debug(f" ⚠ Not found on LibGen")
else:
debug(f" ⚠ Not found on LibGen")
debug(f" ▶ To search LibGen: search-file -provider libgen 'isbn:{isbn}' | @1 | download-data")
else:
debug(f" ▶ To search LibGen: search-file -provider libgen 'isbn:{isbn}' | @1 | download-data")
except Exception as e:
debug(f" ⚠ Could not search LibGen: {e}")
debug(f" ▶ To search LibGen: search-file -provider libgen 'isbn:{isbn}' | @1 | download-data")
else:
debug(f" ⚠ ISBN not available")
debug(f" ▶ Visit: {item.get('target', 'https://openlibrary.org')}")
debug(f" ▶ Or find ISBN and use: search-file -provider libgen 'isbn:\"<ISBN>\"'")
elif origin == 'soulseek':
# Handle Soulseek downloads using the provider
metadata = item.get('full_metadata', {}) if isinstance(item.get('full_metadata'), dict) else {}
username = metadata.get('username')
filename = metadata.get('filename')
size = item.get('size_bytes') or 0
if username and filename:
try:
import asyncio
from helper.search_provider import SoulSeekProvider
provider = SoulSeekProvider(config)
log(f"[search-result] Soulseek: '{title}'", flush=True)
log(f" ▶ Downloading from {username}...", flush=True)
if db:
db.append_worker_stdout(worker_id, f"Downloading from Soulseek: {title} (from {username})")
# Get temp directory from config
temp_dir = config.get('temp')
if temp_dir:
temp_dir = str(Path(temp_dir).expanduser())
# Call async download_file with asyncio.run()
success = asyncio.run(provider.download_file(
username=username,
filename=filename,
file_size=size,
target_dir=temp_dir
))
if success:
downloaded_file = Path(provider.DOWNLOAD_DIR) / Path(filename).name
if downloaded_file.exists():
log(f" ✓ Downloaded: {downloaded_file.name}", flush=True)
files_downloaded_directly += 1
if db:
db.append_worker_stdout(worker_id, f"✓ Downloaded: {downloaded_file.name}")
if pipeline_context._PIPE_ACTIVE:
# Create proper PipeObject result
result_dict = create_pipe_object_result(
source='soulseek',
identifier=filename,
file_path=str(downloaded_file),
cmdlet_name='download-data',
title=title,
target=str(downloaded_file), # Explicit target for add-file
extra={
"metadata": metadata,
"origin": "soulseek"
}
)
pipeline_context.emit(result_dict)
else:
debug(f" ✗ Download failed (peer may be offline)")
if db:
db.append_worker_stdout(worker_id, f"✗ Download failed for {title}")
debug(f" ▶ Try another result: search-file -provider soulseek \"...\" | @2 | download-data")
except Exception as e:
debug(f" ✗ Download error: {e}")
if db:
db.append_worker_stdout(worker_id, f"✗ Error: {e}")
debug(f" ▶ Alternative: search-soulseek -download \"{title}\" -storage <location>")
else:
debug(f"[search-result] Soulseek: '{title}'")
debug(f" ⚠ Missing download info (username/filename)")
if db:
db.append_worker_stdout(worker_id, f"⚠ Missing download info for {title}")
elif origin == 'libgen':
# LibGen results can use the direct URL
# Also extract mirrors dict for fallback if primary fails
url = item.get('target')
# Extract mirrors and book_id from full_metadata
metadata = item.get('full_metadata', {}) if isinstance(item.get('full_metadata'), dict) else {}
mirrors = metadata.get('mirrors', {})
book_id = metadata.get('book_id', '')
author = metadata.get('author')
isbn_val = metadata.get('isbn')
year_val = metadata.get('year')
if url:
url_entry = {
'url': str(url),
'mirrors': mirrors, # Alternative mirrors for fallback
'book_id': book_id,
'title': title,
'author': author,
'isbn': isbn_val,
'year': year_val,
}
urls_to_download.append(url_entry)
debug(f"[search-result] LibGen: '{title}'")
debug(f" ✓ Queued for download")
if mirrors:
debug(f" Mirrors available: {len(mirrors)}")
elif origin == 'debrid':
# Debrid results can use download-data
url = item.get('target')
if url:
urls_to_download.append(str(url))
debug(f"[search-result] Debrid: '{title}'")
debug(f" ✓ Queued for download")
else:
# Regular fields for non-search results
url = item.get('url') or item.get('link') or item.get('href') or item.get('target')
else:
# Object attributes
origin = getattr(item, 'origin', None)
title = getattr(item, 'title', 'Item')
if origin in {'openlibrary', 'libgen', 'soulseek', 'debrid'}:
# Handle search provider results
if origin == 'openlibrary':
# OpenLibrary: First check if lendable/downloadable via Archive.org
# Only route to LibGen if NOT available on Archive.org
metadata = getattr(item, 'full_metadata', {}) if isinstance(getattr(item, 'full_metadata', None), dict) else {}
isbn = metadata.get('isbn') or getattr(item, 'isbn', None)
olid = metadata.get('olid') or getattr(item, 'olid', None)
debug(f"[search-result] OpenLibrary: '{title}'")
if isbn:
debug(f" ISBN: {isbn}")
# Check if book is borrowable from ebook_access field or status
ebook_access = metadata.get('ebook_access') or getattr(item, 'ebook_access', '')
status_text = metadata.get('status') or getattr(item, 'status', '')
archive_id = metadata.get('archive_id') or getattr(item, 'archive_id', '')
# Determine if borrowable using unified helper
is_borrowable = _is_openlibrary_downloadable(ebook_access, status_text)
if is_borrowable:
# Book IS borrowable on Archive.org
debug(f" ✓ Available for borrowing on Archive.org")
debug(f" → Queued for auto-borrowing...")
# Queue borrow request as special dict object
ocaid = archive_id
if not ocaid and isbn:
try:
import requests
ol_url = f'https://openlibrary.org/isbn/{isbn}.json'
r = requests.get(ol_url, timeout=5)
if r.status_code == 200:
ol_data = r.json()
ocaid = ol_data.get('ocaid')
except Exception as e:
debug(f" ⚠ Could not fetch OCAID from OpenLibrary: {e}")
if ocaid:
urls_to_download.append({
'__borrow_request__': True,
'book_id': ocaid,
'isbn': isbn,
'title': title,
'olid': olid or getattr(item, 'openlibrary_id', '')
})
else:
# OCAID not found - book claims borrowable but not on Archive.org
# Fall back to LibGen search instead
debug(f" ⚠ No Archive.org ID found - attempting LibGen instead...")
if isbn:
try:
from helper.search_provider import get_provider
libgen_provider = get_provider("libgen", config)
if libgen_provider:
libgen_results = libgen_provider.search(f"isbn:{isbn}", limit=1)
if libgen_results:
libgen_result = libgen_results[0]
url = libgen_result.get('target') if isinstance(libgen_result, dict) else getattr(libgen_result, 'target', None)
if url:
urls_to_download.append(url)
debug(f" ✓ Found on LibGen instead")
else:
debug(f" ⚠ Not found on LibGen")
else:
debug(f" ⚠ Not found on LibGen")
else:
debug(f" ⚠ LibGen provider not available")
except Exception as e:
debug(f" ✗ Error searching LibGen: {e}")
else:
debug(f" ⚠ ISBN not available for LibGen fallback")
else:
# Book is NOT borrowable - route to LibGen
if isbn:
debug(f" ⚠ Not available on Archive.org - attempting LibGen...")
try:
from helper.search_provider import get_provider
libgen_provider = get_provider("libgen", config)
if libgen_provider:
libgen_results = libgen_provider.search(f"isbn:{isbn}", limit=1)
if libgen_results:
libgen_result = libgen_results[0]
url = libgen_result.get('target') if isinstance(libgen_result, dict) else getattr(libgen_result, 'target', None)
if url:
urls_to_download.append(url)
debug(f" ✓ Found on LibGen")
else:
debug(f" ⚠ Not found on LibGen")
else:
debug(f" ⚠ Not found on LibGen")
debug(f" ▶ To search LibGen: search-file -provider libgen 'isbn:{isbn}' | @1 | download-data")
else:
debug(f" ▶ To search LibGen: search-file -provider libgen 'isbn:{isbn}' | @1 | download-data")
except Exception as e:
debug(f" ⚠ Could not search LibGen: {e}")
debug(f" ▶ To search LibGen: search-file -provider libgen 'isbn:{isbn}' | @1 | download-data")
else:
debug(f" ⚠ ISBN not available")
debug(f" ▶ Visit: {getattr(item, 'target', 'https://openlibrary.org')}")
debug(f" ▶ Or find ISBN and use: search-file -provider libgen 'isbn:\"<ISBN>\"'")
elif origin == 'soulseek':
# Handle Soulseek downloads using the provider
metadata = getattr(item, 'full_metadata', {}) if isinstance(getattr(item, 'full_metadata', None), dict) else {}
username = metadata.get('username')
filename = metadata.get('filename')
size = getattr(item, 'size_bytes', 0) or 0
if username and filename:
try:
import asyncio
from helper.search_provider import SoulSeekProvider
provider = SoulSeekProvider(config)
debug(f"[search-result] Soulseek: '{title}'")
debug(f" ▶ Downloading from {username}...")
if db:
db.append_worker_stdout(worker_id, f"Downloading from Soulseek: {title} (from {username})")
# Get temp directory from config
temp_dir = config.get('temp')
if temp_dir:
temp_dir = str(Path(temp_dir).expanduser())
# Call async download_file with asyncio.run()
success = asyncio.run(provider.download_file(
username=username,
filename=filename,
file_size=size,
target_dir=temp_dir
))
if success:
downloaded_file = Path(provider.DOWNLOAD_DIR) / Path(filename).name
if downloaded_file.exists():
debug(f" ✓ Downloaded: {downloaded_file.name}")
files_downloaded_directly += 1
if db:
db.append_worker_stdout(worker_id, f"✓ Downloaded: {downloaded_file.name}")
if pipeline_context._PIPE_ACTIVE:
# Create proper PipeObject result
result_dict = create_pipe_object_result(
source='soulseek',
identifier=filename,
file_path=str(downloaded_file),
cmdlet_name='download-data',
title=title,
target=str(downloaded_file), # Explicit target for add-file
extra={
"metadata": metadata,
"origin": "soulseek"
}
)
pipeline_context.emit(result_dict)
else:
debug(f" ✗ Download failed (peer may be offline)")
if db:
db.append_worker_stdout(worker_id, f"✗ Download failed for {title}")
debug(f" ▶ Try another result: search-file -provider soulseek \"...\" | @2 | download-data")
except Exception as e:
debug(f" ✗ Download error: {e}")
if db:
db.append_worker_stdout(worker_id, f"✗ Error: {e}")
debug(f" ▶ Alternative: search-soulseek -download \"{title}\" -storage <location>")
else:
debug(f"[search-result] Soulseek: '{title}'")
debug(f" ⚠ Missing download info (username/filename)")
if db:
db.append_worker_stdout(worker_id, f"⚠ Missing download info for {title}")
elif origin == 'libgen':
# LibGen results with mirrors dict for fallback
url = getattr(item, 'target', None)
# Extract mirrors and book_id from full_metadata
metadata = getattr(item, 'full_metadata', {}) if isinstance(getattr(item, 'full_metadata', None), dict) else {}
mirrors = metadata.get('mirrors', {})
book_id = metadata.get('book_id', '')
author = metadata.get('author')
isbn_val = metadata.get('isbn')
year_val = metadata.get('year')
if url:
url_entry = {
'url': str(url),
'mirrors': mirrors, # Alternative mirrors for fallback
'book_id': book_id,
'title': title,
'author': author,
'isbn': isbn_val,
'year': year_val,
}
urls_to_download.append(url_entry)
elif origin == 'debrid':
url = getattr(item, 'target', None)
else:
url = getattr(item, 'url', None) or getattr(item, 'link', None) or getattr(item, 'href', None) or getattr(item, 'target', None)
if url:
urls_to_download.append(str(url))
if not urls_to_download and files_downloaded_directly == 0:
debug(f"No downloadable URLs found")
return 1
# Deduplicate URLs while preserving order
unique_urls = []
seen_keys = set()
for u in urls_to_download:
key = None
if isinstance(u, dict):
key = u.get('url') or u.get('link') or u.get('target') or u.get('source_url')
if not key:
key = str(u)
else:
key = str(u)
if key and key not in seen_keys:
seen_keys.add(key)
unique_urls.append(u)
urls_to_download = unique_urls
debug(f"Processing {len(urls_to_download)} URL(s)")
for i, u in enumerate(urls_to_download, 1):
if isinstance(u, dict):
debug(f" [{i}] Format: {u.get('format_id', '?')} from {u.get('source_url', '?')[:60]}...")
else:
debug(f" [{i}] URL: {str(u)[:60]}...")
# ========================================================================
# RESOLVE OUTPUT DIRECTORY
# ========================================================================
final_output_dir = None
# Priority 1: --storage flag
if storage_location:
try:
# For 'local' storage, check config first before using default
if storage_location.lower() == 'local':
from config import get_local_storage_path
try:
configured_path = get_local_storage_path(config)
if configured_path:
final_output_dir = configured_path
debug(f"Using configured local storage path: {final_output_dir}")
else:
final_output_dir = SharedArgs.resolve_storage(storage_location)
debug(f"Using default storage location: {storage_location}{final_output_dir}")
except Exception as exc:
log(f"⚠️ Error reading local storage config: {exc}", file=sys.stderr)
final_output_dir = SharedArgs.resolve_storage(storage_location)
debug(f"Falling back to default storage location: {storage_location}{final_output_dir}")
else:
final_output_dir = SharedArgs.resolve_storage(storage_location)
debug(f"Using storage location: {storage_location}{final_output_dir}")
except ValueError as e:
log(str(e), file=sys.stderr)
return 1
# Priority 2: Config resolver
if final_output_dir is None and resolve_output_dir is not None:
try:
final_output_dir = resolve_output_dir(config)
debug(f"Using config resolver: {final_output_dir}")
except Exception:
pass
    # Priority 3: Config outfile
if final_output_dir is None and config and config.get("outfile"):
try:
final_output_dir = Path(config["outfile"]).expanduser()
debug(f"Using config outfile: {final_output_dir}")
except Exception:
pass
    # Priority 4: Default (home/Videos)
if final_output_dir is None:
final_output_dir = Path.home() / "Videos"
debug(f"Using default directory: {final_output_dir}")
# Ensure directory exists
try:
final_output_dir.mkdir(parents=True, exist_ok=True)
except Exception as e:
log(f"Cannot create output directory {final_output_dir}: {e}", file=sys.stderr)
return 1
# ========================================================================
# DOWNLOAD EACH URL
# ========================================================================
downloaded_files = []
playlists_displayed = 0
formats_displayed = False # NEW: Track if we showed formats
exit_code = 0
for url in urls_to_download:
try:
selected_playlist_entries: list[Dict[str, Any]] = []
playlist_existing_paths: set[str] = set()
# ====== HANDLE FORMAT SELECTION FROM PIPED RESULT ======
# If url is a dict with format_id and source_url, extract them and override format_selector
current_format_selector = format_selector
actual_url = url
if isinstance(url, dict) and url.get('format_id') and url.get('source_url'):
debug(f"🎬 Format selected: {url.get('format_id')}")
format_id = url.get('format_id')
current_format_selector = format_id
# If it's a video-only format (has vcodec but no acodec), add bestaudio
# BUT: Skip this for -section downloads because combining formats causes re-encoding
# For -section, use formats that already have audio (muxed) to avoid FFmpeg re-encoding
vcodec = url.get('vcodec', '')
acodec = url.get('acodec', '')
if vcodec and vcodec != "none" and (not acodec or acodec == "none"):
if not clip_range and not section_ranges:
# Only add bestaudio if NOT doing -section or -clip
# For section downloads, we need muxed formats to avoid re-encoding
current_format_selector = f"{format_id}+bestaudio"
debug(f" Video-only format detected, automatically adding bestaudio")
else:
debug(f" Section/clip download: using video-only format as-is (no bestaudio to avoid re-encoding)")
actual_url = url.get('source_url')
url = actual_url # Use the actual URL for further processing
# ====== AUTO-BORROW MODE - INTERCEPT SPECIAL BORROW REQUEST DICTS ======
if isinstance(url, dict) and url.get('__borrow_request__'):
try:
from helper.archive_client import credential_openlibrary, loan, get_book_infos, download
import tempfile
import shutil
book_id = url.get('book_id')
if not book_id:
debug(f" ✗ Missing book ID for borrowing")
exit_code = 1
continue
title_val = url.get('title', 'Unknown Book')
book_id_str = str(book_id)
debug(f"[auto-borrow] Starting borrow for: {title_val}")
debug(f" Book ID: {book_id_str}")
# Get Archive.org credentials
email, password = credential_openlibrary(config)
if not email or not password:
log(f" ✗ Archive.org credentials not configured", file=sys.stderr)
log(f" ▶ Set ARCHIVE_EMAIL and ARCHIVE_PASSWORD environment variables", file=sys.stderr)
exit_code = 1
continue
# Attempt to borrow and download
try:
debug(f" → Logging into Archive.org...")
from helper.archive_client import login
import requests
try:
session = login(email, password)
except requests.exceptions.Timeout:
debug(f" ✗ Timeout logging into Archive.org (server not responding)")
exit_code = 1
continue
except requests.exceptions.RequestException as e:
debug(f" ✗ Error connecting to Archive.org: {e}")
exit_code = 1
continue
debug(f" → Borrowing book...")
try:
session = loan(session, book_id_str, verbose=True)
except requests.exceptions.Timeout:
debug(f" ✗ Timeout while borrowing (server not responding)")
exit_code = 1
continue
except requests.exceptions.RequestException as e:
debug(f" ✗ Error while borrowing: {e}")
exit_code = 1
continue
except Exception as e:
# Check for BookNotAvailableError (imported dynamically or by name)
if type(e).__name__ == 'BookNotAvailableError':
debug(f" ⚠ Book is waitlisted/unavailable on Archive.org")
# Fallback to LibGen if ISBN is available
isbn = url.get('isbn')
if isbn:
debug(f" ▶ Falling back to LibGen search for ISBN: {isbn}")
from helper.search_provider import LibGenProvider
provider = LibGenProvider(config)
# Search specifically by ISBN
results = provider.search(f"isbn:{isbn}", limit=1)
if results:
debug(f" ✓ Found {len(results)} result(s) on LibGen")
# Use the first result
libgen_result = results[0]
# We can't inject a new entry into the URL loop mid-iteration, so handle the
# LibGen result inline. The provider result's full_metadata is the raw book
# dict (title, author, year, extension, mirrors, ...); libgen_result.target is
# a mirror URL or libgen:ID, but the metadata dict is what we use here.
debug(f" → Downloading from LibGen: {libgen_result.title}")
# Download straight into final_output_dir with UnifiedBookDownloader instead
# of delegating to the 'libgen' origin handler.
from helper.unified_book_downloader import UnifiedBookDownloader
downloader = UnifiedBookDownloader(config)
book_data = libgen_result.full_metadata
# Pick a working mirror: prefer the explicit mirror_url, otherwise fall back
# to the first entry in the mirrors dict.
mirrors = book_data.get('mirrors', {})
download_url = book_data.get('mirror_url')
if not download_url and mirrors:
download_url = next(iter(mirrors.values()))
if download_url:
debug(f" → Mirror: {download_url}")
# UnifiedBookDownloader.download_book(book, output_dir) consumes the book dict directly.
filepath = downloader.download_book(book_data, final_output_dir)
if filepath:
debug(f" ✓ Successfully downloaded from LibGen: {filepath}")
downloaded_files.append(str(filepath))
# Emit result
file_hash = _compute_file_hash(filepath)
emit_tags = ['book', 'libgen']
if isbn:
emit_tags.append(f'isbn:{isbn}')
pipe_obj = create_pipe_object_result(
source='libgen',
identifier=book_data.get('md5', 'unknown'),
file_path=str(filepath),
cmdlet_name='download-data',
title=libgen_result.title,
file_hash=file_hash,
tags=emit_tags,
source_url=download_url
)
pipeline_context.emit(pipe_obj)
exit_code = 0
continue # Success!
else:
debug(f" ✗ Failed to download from LibGen")
else:
debug(f" ✗ No download URL found in LibGen result")
else:
debug(f" ✗ No results found on LibGen for ISBN: {isbn}")
else:
debug(f" ⚠ No ISBN available for LibGen fallback")
# If fallback failed or wasn't possible, abort
debug(f" ✗ Unable to borrow from Archive.org and LibGen fallback failed.")
exit_code = 1
continue
else:
# Re-raise other exceptions
raise e
debug(f" → Extracting page information...")
# Try both URL formats
book_urls = [
f"https://archive.org/borrow/{book_id_str}",
f"https://archive.org/details/{book_id_str}"
]
title = None
links = None
metadata = None
last_error = None
for book_url in book_urls:
try:
title, links, metadata = get_book_infos(session, book_url)
if title and links:
debug(f" → Found {len(links)} pages")
break
except requests.exceptions.Timeout:
last_error = "Timeout while extracting pages"
debug(f" ⚠ Timeout while extracting from {book_url}")
continue
except Exception as e:
last_error = str(e)
debug(f" ⚠ Failed to extract from {book_url}: {e}")
continue
if not links:
debug(f" ✗ Could not extract book pages (Last error: {last_error})")
exit_code = 1
continue
# Download pages
debug(f" → Downloading {len(links)} pages...")
with tempfile.TemporaryDirectory() as temp_dir:
# download(session, n_threads, directory, links, scale, book_id)
images = download(
session,
n_threads=4,
directory=temp_dir,
links=links,
scale=2,
book_id=str(book_id)
)
if not images:
debug(f" ✗ No pages downloaded")
exit_code = 1
continue
debug(f" ✓ Downloaded {len(images)} pages")
# Try to merge into PDF
try:
import img2pdf
debug(f" → Merging pages into PDF...")
# Use title from result item if available, otherwise fallback to extracted title
filename_title = title_val if title_val and title_val != 'Unknown Book' else (title if title else f"book_{book_id_str}")
# Allow underscores and spaces
filename = "".join(c for c in filename_title if c.isalnum() or c in (' ', '.', '-', '_'))[:100]
output_path = Path(final_output_dir) / f"{filename}.pdf"
# Make unique filename if needed
i = 1
while output_path.exists():
output_path = Path(final_output_dir) / f"{filename}({i}).pdf"
i += 1
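# img2pdf embeds the downloaded page images into a single PDF without re-encoding them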
pdf_content = img2pdf.convert(images)
if pdf_content:
with open(output_path, 'wb') as f:
f.write(pdf_content)
debug(f" ✓ Successfully borrowed and saved to: {output_path}")
downloaded_files.append(str(output_path))
# Emit result for downstream cmdlets
file_hash = _compute_file_hash(output_path)
# Build tags including ISBN if available
emit_tags = ['book', 'borrowed', 'pdf']
if title_val and title_val != 'Unknown Book':
emit_tags.append(f'title:{title_val}')
isbn_tag = url.get('isbn')
if isbn_tag:
emit_tags.append(f'isbn:{isbn_tag}')
olid_tag = url.get('olid')
if olid_tag:
emit_tags.append(f'olid:{olid_tag}')
# Fetch OpenLibrary metadata tags
ol_tags = fetch_openlibrary_metadata_tags(isbn=isbn_tag, olid=olid_tag)
emit_tags.extend(ol_tags)
pipe_obj = create_pipe_object_result(
source='archive.org',
identifier=book_id_str,
file_path=str(output_path),
cmdlet_name='download-data',
title=title_val,
file_hash=file_hash,
tags=emit_tags,
source_url=url.get('source_url', f'archive.org/borrow/{book_id_str}')
)
pipeline_context.emit(pipe_obj)
exit_code = 0
except ImportError:
debug(f" ⚠ img2pdf not available - saving pages as collection")
# Just copy images to output dir
filename = title if title else f"book_{book_id_str}"
filename = "".join(c for c in filename if c.isalnum() or c in (' ', '.', '-'))[:100]
output_dir = Path(final_output_dir) / filename
i = 1
while output_dir.exists():
output_dir = Path(final_output_dir) / f"{filename}({i})"
i += 1
shutil.copytree(temp_dir, str(output_dir))
debug(f" ✓ Successfully borrowed and saved to: {output_dir}")
downloaded_files.append(str(output_dir))
# Emit result for downstream cmdlets
# Build tags including ISBN if available
emit_tags = ['book', 'borrowed', 'pages']
isbn_tag = url.get('isbn')
if isbn_tag:
emit_tags.append(f'isbn:{isbn_tag}')
olid_tag = url.get('olid')
if olid_tag:
emit_tags.append(f'olid:{olid_tag}')
# Fetch OpenLibrary metadata tags
ol_tags = fetch_openlibrary_metadata_tags(isbn=isbn_tag, olid=olid_tag)
emit_tags.extend(ol_tags)
pipe_obj = create_pipe_object_result(
source='archive.org',
identifier=book_id_str,
file_path=str(output_dir),
cmdlet_name='download-data',
title=title_val,
tags=emit_tags,
source_url=url.get('source_url', f'archive.org/borrow/{book_id_str}')
)
pipeline_context.emit(pipe_obj)
exit_code = 0
except Exception as e:
debug(f" ✗ Borrow/download failed: {e}")
import traceback
traceback.print_exc()
exit_code = 1
continue # Skip normal URL handling
except ImportError as e:
debug(f" ✗ Archive.org tools not available: {e}")
exit_code = 1
continue
except Exception as e:
debug(f" ✗ Auto-borrow error: {e}")
import traceback
traceback.print_exc()
exit_code = 1
continue
# ====== LIBGEN MIRROR FALLBACK MODE ======
# Handle libgen results with mirrors dict for fallback on failure
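# These entries carry: url (primary mirror), mirrors (label -> URL mapping), book_id,
# and optional title/author/isbn/year metadata, as read below.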
if isinstance(url, dict) and 'mirrors' in url:
try:
primary_url = url.get('url')
mirrors_dict = url.get('mirrors', {})
book_id = url.get('book_id', '')
title_val = url.get('title')
author_val = url.get('author')
isbn_val = url.get('isbn')
year_val = url.get('year')
if not primary_url:
debug(f"Skipping libgen entry: no primary URL")
exit_code = 1
continue
# Build list of mirrors to try: primary first, then alternatives
mirrors_to_try = [primary_url]
mirrors_to_try.extend(mirrors_dict.values())
# Remove duplicates while preserving order
mirrors_to_try = list(dict.fromkeys(mirrors_to_try))
debug(f"🔄 LibGen download with mirror fallback (book_id: {book_id})")
debug(f" Primary: {primary_url[:80]}...")
if len(mirrors_to_try) > 1:
debug(f" {len(mirrors_to_try) - 1} alternative mirror(s) available")
# Resolve cookies path
final_cookies_path_libgen = None
if cookies_path:
if resolve_cookies_path:
try:
final_cookies_path_libgen = resolve_cookies_path(config, Path(cookies_path))
except Exception:
final_cookies_path_libgen = Path(cookies_path).expanduser() if cookies_path else None
else:
final_cookies_path_libgen = Path(cookies_path).expanduser()
download_succeeded = False
last_error = None
successful_mirror = None
# Try each mirror in sequence using libgen_service's native download
for mirror_idx, mirror_url in enumerate(mirrors_to_try, 1):
try:
if mirror_idx > 1:
debug(f" → Trying mirror #{mirror_idx}: {mirror_url[:80]}...")
# Use libgen_service's download_from_mirror for proper libgen handling
from helper.libgen_service import download_from_mirror
# Generate a filesystem-safe filename from the result title and book_id
safe_title = "".join(c for c in str(title_val or "book") if c.isalnum() or c in (' ', '.', '-'))[:100]
file_path = final_output_dir / f"{safe_title}_{book_id}.pdf"
progress_bar = models.ProgressBar()
progress_start = time.time()
last_update = [progress_start]
progress_bytes = [0]
progress_total = [0]
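# Progress callback for download_from_mirror: single-element lists give the closure
# mutable byte counters, and output is throttled to one line every 0.5 seconds.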
def _libgen_progress(downloaded: int, total: int) -> None:
progress_bytes[0] = downloaded
progress_total[0] = total
now = time.time()
if total > 0 and now - last_update[0] >= 0.5:
percent = (downloaded / total) * 100
elapsed = max(now - progress_start, 1e-6)
speed = downloaded / elapsed if elapsed > 0 else 0
remaining = max(total - downloaded, 0)
eta = remaining / speed if speed > 0 else 0
minutes, seconds = divmod(int(eta), 60)
hours, minutes = divmod(minutes, 60)
eta_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
speed_str = f"{progress_bar.format_bytes(speed)}/s"
progress_line = progress_bar.format_progress(
percent_str=f"{percent:.1f}%",
downloaded=downloaded,
total=total,
speed_str=speed_str,
eta_str=eta_str,
)
debug(f" {progress_line}")
last_update[0] = now
# Attempt download using libgen's native function
success, downloaded_path = download_from_mirror(
mirror_url=mirror_url,
output_path=file_path,
log_info=lambda msg: debug(f" {msg}"),
log_error=lambda msg: debug(f"{msg}"),
progress_callback=_libgen_progress,
)
final_path = Path(downloaded_path) if downloaded_path else file_path
if success and final_path.exists():
downloaded = progress_bytes[0] or final_path.stat().st_size
elapsed = time.time() - progress_start
avg_speed = downloaded / elapsed if elapsed > 0 else 0
debug(f" ✓ Downloaded in {elapsed:.1f}s at {progress_bar.format_bytes(avg_speed)}/s")
debug(f" ✓ Downloaded successfully from mirror #{mirror_idx}")
successful_mirror = mirror_url
download_succeeded = True
# Emit result for downstream cmdlets
file_hash = _compute_file_hash(final_path)
emit_tags = build_book_tags(
title=title_val,
author=author_val,
isbn=isbn_val,
year=year_val,
source='libgen',
extra=[f"libgen_id:{book_id}"] if book_id else None,
)
pipe_obj = create_pipe_object_result(
source='libgen',
identifier=book_id,
file_path=str(final_path),
cmdlet_name='download-data',
file_hash=file_hash,
tags=emit_tags,
source_url=successful_mirror
)
pipeline_context.emit(pipe_obj)
downloaded_files.append(str(final_path))
exit_code = 0
break # Success, stop trying mirrors
except Exception as e:
last_error = str(e)
if mirror_idx == 1:
debug(f" ⚠ Primary mirror failed: {e}")
else:
debug(f" ⚠ Mirror #{mirror_idx} failed: {e}")
if not download_succeeded:
log(f" ✗ All mirrors failed. Last error: {last_error}", file=sys.stderr)
if "getaddrinfo failed" in str(last_error) or "NameResolutionError" in str(last_error) or "Failed to resolve" in str(last_error):
log(f" ⚠ Network issue detected: Cannot resolve LibGen mirror hostnames", file=sys.stderr)
log(f" ▶ Check your network connection or try with a VPN/proxy", file=sys.stderr)
exit_code = 1
continue # Skip to next URL
except Exception as e:
debug(f" ✗ LibGen mirror fallback error: {e}")
import traceback
traceback.print_exc(file=sys.stderr)
exit_code = 1
continue
# Ensure URL is a string for normal handling
if not isinstance(url, str):
# Check if it's a playlist item marker
if isinstance(url, dict) and url.get('__playlist_url'):
playlist_url = url.get('__playlist_url')
item_num = url.get('__playlist_item', 1)
debug(f"📍 Handling selected playlist item #{item_num}")
# Convert to actual URL and set playlist_items to download only this item
url = playlist_url
playlist_items = str(item_num)
# Fall through to normal handling below
else:
debug(f"Skipping invalid URL entry: {url}")
continue
debug(f"Probing URL: {url}")
# ====== TORRENT MODE - INTERCEPT BEFORE NORMAL DOWNLOAD ======
if torrent_mode or url.lower().startswith('magnet:'):
debug(f"🧲 Torrent/magnet mode - spawning background worker...")
try:
# Get API key from config
from config import get_debrid_api_key
api_key = get_debrid_api_key(config)
if not api_key:
log(f"✗ AllDebrid API key not found in config", file=sys.stderr)
exit_code = 1
continue
# Create a unique worker ID
worker_id = f"torrent_{uuid.uuid4().hex[:8]}"
# Get worker manager if available from config
worker_manager = config.get('_worker_manager')
# Create worker in manager if available
if worker_manager:
try:
worker_manager.track_worker(
worker_id,
worker_type="download_torrent",
title=f"Download: {url[:60]}...",
description=f"Torrent/magnet download via AllDebrid",
pipe=pipeline_context.get_current_command_text()
)
debug(f"✓ Worker created (ID: {worker_id})")
except Exception as e:
debug(f"⚠ Failed to create worker: {e}")
worker_manager = None
# Spawn background thread to handle the download
worker_thread = threading.Thread(
target=_download_torrent_worker,
args=(
worker_id,
url,
final_output_dir,
config,
api_key,
playlist_items,
audio_mode,
wait_timeout,
worker_manager,
),
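# Non-daemon: the interpreter waits for this thread, so the AllDebrid download
# keeps running after this command returns.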
daemon=False,
name=f"TorrentWorker_{worker_id}"
)
worker_thread.start()
debug(f"✓ Background worker started (ID: {worker_id})")
# Emit worker info so user can track it
worker_info = {
'worker_id': worker_id,
'worker_type': 'download_torrent',
'source_url': url,
'status': 'running',
'message': 'Downloading in background...'
}
pipeline_context.emit(worker_info)
continue
except ImportError:
log(f"✗ AllDebrid client not available", file=sys.stderr)
exit_code = 1
except Exception as e:
# Catches AllDebridError and other exceptions
log(f"✗ Failed to spawn torrent worker: {e}", file=sys.stderr)
import traceback
traceback.print_exc(file=sys.stderr)
exit_code = 1
continue # Skip to next URL
# ====== NORMAL DOWNLOAD MODE (HTTP/HTTPS) ======
# First, probe the URL to detect playlists and get info
# For YouTube URLs, ignore playlists and only probe the single video
is_youtube_url = isinstance(url, str) and ('youtube.com' in url or 'youtu.be' in url)
probe_info = probe_url(url, no_playlist=is_youtube_url)
is_actual_playlist = False # Track if we have a real multi-item playlist
if probe_info:
debug(f"✓ Probed: {probe_info.get('title', url)} ({probe_info.get('extractor', 'unknown')})")
# If it's a playlist, show the result table and skip download for now
entries = probe_info.get("entries", [])
if entries and not playlist_items:
is_actual_playlist = True # We have a real playlist with multiple items
# Playlist detected but NO selection provided
# Always show table for user to select items
debug(f"📋 Found playlist with {len(entries)} items")
_show_playlist_table(url, probe_info)
debug(f" Playlist displayed. To select items, use @* or @1,3,5-8 syntax after piping results")
playlists_displayed += 1
continue # Skip to next URL - don't download playlist without selection
elif entries and playlist_items:
is_actual_playlist = True # We have a real playlist with item selection
# Playlist detected WITH selection - will download below
# Expand wildcard if present
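# (presumably "*" expands to the full "1-<N>" range, while explicit selections
# like "1,3,5-8" pass through unchanged)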
expanded_items = _expand_playlist_selection(playlist_items, len(entries))
playlist_items = expanded_items
selected_playlist_entries = _select_playlist_entries(entries, playlist_items)
debug(f"📋 Found playlist with {len(entries)} items - downloading selected: {playlist_items}")
else:
debug(f"Single item: {probe_info.get('title', 'Unknown')}")
# ====== FORMAT LISTING MODE ======
if list_formats_mode and isinstance(url, str) and url.startswith(('http://', 'https://')):
debug(f"Fetching formats for: {url}")
from helper.download import list_formats
from result_table import ResultTable
all_formats = list_formats(url, no_playlist=is_youtube_url, playlist_items=playlist_items)
if all_formats:
# Filter and sort formats for better user experience
formats = _filter_and_sort_formats(all_formats)
# Create result table for format display
table = ResultTable(title=f"Available Formats - {probe_info.get('title', 'Unknown')}")
for idx, fmt in enumerate(formats, start=1):
row = table.add_row()
row.add_column("Format ID", fmt.get("format_id", ""))
# Build resolution/bitrate string
vcodec = fmt.get("vcodec", "")
acodec = fmt.get("acodec", "")
height = fmt.get("height")
tbr = fmt.get("tbr")
if vcodec != "none" and acodec != "none":
# Video + audio
res_str = fmt.get("resolution", "")
elif acodec != "none" and vcodec == "none":
# Audio only - show bitrate
res_str = f"{tbr:.0f} kbps" if tbr else "audio"
else:
# Video only
res_str = fmt.get("resolution", "")
row.add_column("Resolution", res_str)
# Build codec string (merged vcodec/acodec)
codec_parts = []
if vcodec and vcodec != "none":
codec_parts.append(f"v:{vcodec}")
if acodec and acodec != "none":
codec_parts.append(f"a:{acodec}")
codec_str = " | ".join(codec_parts) if codec_parts else "unknown"
row.add_column("Codec", codec_str)
if fmt.get("filesize"):
size_mb = fmt["filesize"] / (1024 * 1024)
row.add_column("Size", f"{size_mb:.1f} MB")
# Enable @N expansion to rerun download-data with -item idx
row.set_selection_args(["-item", str(idx)])
# Set source command for @N expansion
table.set_source_command("download-data", [url])
# Display table
log(str(table), flush=True)
formats_displayed = True
# Store table for @N expansion so CLI can reconstruct commands
pipeline_context.set_current_stage_table(table)
pipeline_context.set_last_result_table_overlay(table, formats)
debug("Use @N to pick a format; pipeline paused until selection")
else:
log(f"✗ No formats available for this URL", file=sys.stderr)
# Stop pipeline here; selection via @N will re-run download-data with -item
return 0
# ====== AUTO-DETECT MULTIPLE FORMATS ======
# Check if multiple formats exist and handle based on -item flag
if (not current_format_selector and not list_formats_mode and
isinstance(url, str) and url.startswith(('http://', 'https://'))):
# Check if this is a yt-dlp supported URL (YouTube, Vimeo, etc.)
from helper.download import list_formats
from result_table import ResultTable
if is_url_supported_by_ytdlp(url):
debug(f"Checking available formats for: {url}")
all_formats = list_formats(url, no_playlist=is_youtube_url, playlist_items=playlist_items)
if all_formats:
# Filter and sort formats for better user experience
formats = _filter_and_sort_formats(all_formats)
# Handle -item selection for formats (single video)
if playlist_items and playlist_items.isdigit() and not is_actual_playlist:
idx = int(playlist_items)
if 0 < idx <= len(formats):
fmt = formats[idx-1]
current_format_selector = fmt.get("format_id")
# If video-only format is selected, append +bestaudio to merge with best audio
# BUT: Skip this for -section downloads because combining formats causes re-encoding
vcodec = fmt.get("vcodec")
acodec = fmt.get("acodec")
if vcodec and vcodec != "none" and (not acodec or acodec == "none"):
if not clip_range and not section_ranges:
# Only add bestaudio if NOT doing -section or -clip
current_format_selector = f"{current_format_selector}+bestaudio"
debug(f"Video-only format selected, appending bestaudio: {current_format_selector}")
else:
debug(f"Section/clip download: using video-only format as-is (no bestaudio to avoid re-encoding)")
debug(f"Selected format #{idx}: {current_format_selector}")
playlist_items = None # Clear so it doesn't affect download options
else:
log(f"Invalid format index: {idx}", file=sys.stderr)
elif len(formats) > 1:
# Multiple formats available
debug(f"📊 Found {len(formats)} available formats for: {probe_info.get('title', 'Unknown')}")
# Always show table for format selection via @N syntax
# Show table and wait for @N selection
table = ResultTable(title=f"Available Formats - {probe_info.get('title', 'Unknown')}")
for fmt in formats:
row = table.add_row()
row.add_column("Format ID", fmt.get("format_id", ""))
# Build resolution/bitrate string
vcodec = fmt.get("vcodec", "")
acodec = fmt.get("acodec", "")
height = fmt.get("height")
tbr = fmt.get("tbr")
if vcodec != "none" and acodec != "none":
# Video + audio
res_str = fmt.get("resolution", "")
elif acodec != "none" and vcodec == "none":
# Audio only - show bitrate
res_str = f"{tbr:.0f} kbps" if tbr else "audio"
else:
# Video only
res_str = fmt.get("resolution", "")
row.add_column("Resolution", res_str)
# Build codec string (merged vcodec/acodec)
codec_parts = []
if vcodec and vcodec != "none":
codec_parts.append(f"v:{vcodec}")
if acodec and acodec != "none":
codec_parts.append(f"a:{acodec}")
codec_str = " | ".join(codec_parts) if codec_parts else "unknown"
row.add_column("Codec", codec_str)
if fmt.get("filesize"):
size_mb = fmt["filesize"] / (1024 * 1024)
row.add_column("Size", f"{size_mb:.1f} MB")
# Set source command for @N expansion
table.set_source_command("download-data", [url])
# Set row selection args so @N expands to "download-data URL -item N"
for i in range(len(formats)):
table.set_row_selection_args(i, ["-item", str(i + 1)])
# Display table
log(str(table), flush=True)
debug(f"💡 Use @N syntax to select a format and download (e.g., @1)")
# Store table for @N expansion so CLI can reconstruct commands
pipeline_context.set_current_stage_table(table)
pipeline_context.set_last_result_table_overlay(table, formats)
formats_displayed = True # Mark that we displayed formats
return 0 # Pause pipeline; user must select format via @N
debug(f"Downloading: {url}")
# Special handling for LibGen URLs
if "libgen" in url or "library.lol" in url:
debug(f"🔄 Detected LibGen URL, using specialized downloader: {url}")
try:
from helper.libgen_service import download_from_mirror, search_libgen
# Search/details pages (e.g. https://libgen.li/series.php?id=577851) need their ID
# extracted and re-resolved to a fresh mirror link. Series/edition pages are skipped
# here because download_from_mirror handles them directly.
libgen_id = ""
results = []
if "series.php" not in url and "edition.php" not in url:
match = re.search(r"id=(\d+)", url)
if match:
libgen_id = match.group(1)
debug(f" Extracted LibGen ID: {libgen_id}")
# Search by ID to get fresh mirror links
results = search_libgen(libgen_id, limit=1)
if results:
# Use the mirror URL from the result
mirror_url = results[0].get("mirror_url")
if mirror_url:
debug(f" Resolved to mirror URL: {mirror_url}")
url = mirror_url
# Attempt download with specialized function
# download_from_mirror writes to a full output path, but LibGen URLs rarely expose a
# usable filename, so build one from search metadata when available and fall back to
# a generic name otherwise.
filename = "libgen_download.bin"
title_from_results = None
author_from_results = None
year_from_results = None
if libgen_id and results:
title_from_results = results[0].get("title")
author_from_results = results[0].get("author")
year_from_results = results[0].get("year")
ext = results[0].get("extension", "pdf")
# Sanitize filename
safe_title = "".join(c for c in (title_from_results or "book") if c.isalnum() or c in (' ', '-', '_')).strip()
filename = f"{safe_title}.{ext}"
elif "series.php" in url:
# Resolve the series ID outside the f-string (backslashes in f-string expressions
# are a SyntaxError before Python 3.12) and avoid calling re.search twice.
series_match = re.search(r"id=(\d+)", url)
filename = f"series_{series_match.group(1) if series_match else 'unknown'}.pdf"
output_path = final_output_dir / filename
success, downloaded_path = download_from_mirror(
url,
output_path,
log_info=debug,
log_error=log,
)
final_file = Path(downloaded_path) if downloaded_path else output_path
if success and final_file.exists():
debug(f"✓ LibGen download successful: {final_file}")
# Create a result object
info = {
"id": libgen_id or "libgen",
"title": filename,
"webpage_url": url,
"ext": final_file.suffix.lstrip("."),
}
emit_tags = build_book_tags(
title=title_from_results or filename,
author=author_from_results,
year=year_from_results,
source="libgen",
extra=[f"libgen_id:{libgen_id}"] if libgen_id else None,
)
file_hash = _compute_file_hash(final_file)
# Emit result
pipeline_context.emit(create_pipe_object_result(
source="libgen",
identifier=libgen_id or "libgen",
file_path=str(final_file),
cmdlet_name="download-data",
title=filename,
file_hash=file_hash,
tags=emit_tags,
extra=info
))
downloaded_files.append(str(final_file))
continue
else:
debug("⚠ LibGen specialized download failed, falling back to generic downloader...")
except Exception as e:
debug(f"⚠ LibGen specialized download error: {e}")
# Fall through to generic downloader
# Resolve cookies path if specified
final_cookies_path = None
if cookies_path:
if resolve_cookies_path:
try:
final_cookies_path = resolve_cookies_path(config, Path(cookies_path))
except Exception:
final_cookies_path = Path(cookies_path).expanduser() if cookies_path else None
else:
final_cookies_path = Path(cookies_path).expanduser()
# Create download options - use correct parameter names
# Mode is "audio" or "video", required field
mode = "audio" if audio_mode else "video"
# Detect YouTube URLs and set no_playlist to download only the single video
is_youtube_url = isinstance(url, str) and ('youtube.com' in url or 'youtu.be' in url)
# Determine clip_sections to pass to yt-dlp
# Sections take precedence over clip if both are specified
# Sections are for yt-dlp download-sections (merge multiple clips at source)
# Clip is for post-download extraction
clip_sections_str = None
if section_ranges:
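# section_ranges holds (start, end) pairs parsed from -section,
# e.g. [("1:30", "1:35"), ("0:05", "0:15")]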
# -section only applies to yt-dlp supported URLs
if is_url_supported_by_ytdlp(url):
# Serialize the ranges as "start1-end1,start2-end2"; helper/download.py parses
# this string into yt-dlp download_sections.
clip_sections_str = ",".join(f"{start}-{end}" for start, end in section_ranges)
debug(f"Using yt-dlp sections: {clip_sections_str}")
else:
log(f"Warning: -section only works with yt-dlp supported URLs. Use -clip for {url}", file=sys.stderr)
elif clip_range:
# -clip reuses the same clip_sections field; helper/download.py treats it as a
# yt-dlp download section when the URL supports it, otherwise the range is cut
# out after download. Only a single range is supported here.
clip_sections_str = f"{clip_range[0]}-{clip_range[1]}"
download_opts = DownloadOptions(
url=url,
mode=mode,
output_dir=final_output_dir,
cookies_path=final_cookies_path,
ytdl_format=current_format_selector, # Use per-URL format override if available
clip_sections=clip_sections_str,
playlist_items=playlist_items,
no_playlist=is_youtube_url, # For YouTube, ignore playlist URLs and download single video
)
# For playlist downloads, capture existing files BEFORE download
if playlist_items and selected_playlist_entries:
_, playlist_existing_paths = _snapshot_playlist_paths(selected_playlist_entries, final_output_dir)
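# playlist_existing_paths records files already on disk for the selected entries,
# so post-download diffing can tell fresh downloads from cached files.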
# Call download_media from helper - no show_progress param
result_data = download_media(download_opts)
if result_data and result_data.path:
file_path = result_data.path
if file_path.exists():
# Check if we have multiple section files to emit
if result_data.paths:
# Section download - emit each section file separately for merge-file
debug(f"📋 Section download: emitting {len(result_data.paths)} file(s) to merge-file")
for section_file in result_data.paths:
if section_file.exists():
file_hash = _compute_file_hash(section_file)
tags = result_data.tags if result_data.tags else []
pipe_obj = create_pipe_object_result(
source='download',
identifier=section_file.stem,
file_path=str(section_file),
cmdlet_name='download-data',
title=section_file.name,
file_hash=file_hash,
is_temp=False,
extra={
'url': url,
'tags': tags,
'audio_mode': audio_mode,
'format': format_selector,
'from_sections': True,
}
)
downloaded_files.append(section_file)
pipeline_context.emit(pipe_obj)
# Check if this was a playlist download (is_actual_playlist tracks if we have a multi-item playlist)
elif is_actual_playlist:
if not selected_playlist_entries:
debug("⚠ Playlist metadata unavailable; cannot emit selected items for this stage.")
exit_code = 1
continue
matched_after, _ = _snapshot_playlist_paths(selected_playlist_entries, final_output_dir)
if not matched_after:
debug("⚠ No playlist files found for the selected items after download.")
exit_code = 1
continue
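# Diff against the pre-download snapshot so only files created by this run count
# as new; if nothing new appeared, re-emit the cached files instead.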
new_playlist_files: list[Path] = []
for playlist_file in matched_after:
try:
path_key = str(playlist_file.resolve())
except OSError:
path_key = str(playlist_file)
if path_key not in playlist_existing_paths:
new_playlist_files.append(playlist_file)
emit_targets = new_playlist_files if new_playlist_files else matched_after
if new_playlist_files:
debug(f"📋 Playlist download completed: {len(new_playlist_files)} new file(s)")
else:
debug(f"📁 Reusing {len(emit_targets)} cached playlist file(s)")
for playlist_file in emit_targets:
file_hash = _compute_file_hash(playlist_file)
tags = []
if extract_ytdlp_tags and result_data.tags:
tags = result_data.tags
pipe_obj = create_pipe_object_result(
source='download',
identifier=playlist_file.stem,
file_path=str(playlist_file),
cmdlet_name='download-data',
title=playlist_file.name,
file_hash=file_hash,
is_temp=False,
extra={
'url': url,
'tags': tags,
'audio_mode': audio_mode,
'format': format_selector,
'from_playlist': True,
},
)
downloaded_files.append(playlist_file)
pipeline_context.emit(pipe_obj)
else:
# Single file download
file_hash = result_data.hash_value or _compute_file_hash(file_path)
tags = result_data.tags if result_data.tags else []
pipe_obj = create_pipe_object_result(
source='download',
identifier=file_path.stem,
file_path=str(file_path),
cmdlet_name='download-data',
title=file_path.name,
file_hash=file_hash,
is_temp=False,
extra={
'url': url,
'tags': tags,
'audio_mode': audio_mode,
'format': format_selector,
'clipped': clip_range is not None,
}
)
downloaded_files.append(file_path)
pipeline_context.emit(pipe_obj)
debug(f"✓ Downloaded: {file_path}")
else:
log(f"Download returned no result for {url}", file=sys.stderr)
exit_code = 1
except Exception as e:
log(f"Error downloading {url}: {e}", file=sys.stderr)
import traceback
traceback.print_exc(file=sys.stderr)
exit_code = 1
# Success if we downloaded files or displayed playlists/formats
if downloaded_files or files_downloaded_directly > 0:
total_files = len(downloaded_files) + files_downloaded_directly
debug(f"✓ Successfully downloaded {total_files} file(s)")
stage_ctx = pipeline_context.get_stage_context()
should_display_results = stage_ctx is None or stage_ctx.is_last_stage
if downloaded_files and should_display_results:
try:
from cmdlets import search_file as search_cmdlet
except Exception:
search_cmdlet = None
if search_cmdlet:
seen_hashes: set[str] = set()
for file_entry in downloaded_files:
path_obj = Path(file_entry) if not isinstance(file_entry, Path) else file_entry
if not path_obj.is_file():
continue
file_hash = _compute_file_hash(path_obj)
if file_hash and file_hash not in seen_hashes:
seen_hashes.add(file_hash)
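# Re-run search-file with a hash: query so each downloaded file is rendered in the
# standard result table.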
search_cmdlet._run(None, [f"hash:{file_hash}"], config)
else:
debug("search-file not available; skipping post-download display")
elif downloaded_files:
debug("Skipping search-file display because downstream pipeline is present")
if db:
db.update_worker_status(worker_id, 'completed')
db.close()
return 0
if playlists_displayed:
debug(f"✓ Displayed {playlists_displayed} playlist(s) for selection")
if db:
db.update_worker_status(worker_id, 'completed')
db.close()
return 0 # Success - playlists shown
if formats_displayed:
debug(f"✓ Format selection table displayed - use @N to select and download")
if db:
db.update_worker_status(worker_id, 'completed')
db.close()
return 0 # Success - formats shown
log("No files were downloaded or playlists displayed", file=sys.stderr)
if db:
db.update_worker_status(worker_id, 'completed')
db.close()
return 1
CMDLET = Cmdlet(
name="download-data",
exec=_run,
summary="Download data from URLs with playlist/clip support using yt-dlp",
usage="download-data <url> [options] or search-file | download-data [options]",
aliases=["download", "dl"],
args=[
CmdletArg(
name="url",
type="string",
required=False,
description="URL to download (HTTP/HTTPS or file with URL list)",
variadic=True
),
CmdletArg(
name="-url",
type="string",
description="URL to download (alias for positional argument)",
variadic=True
),
CmdletArg(
name="list-formats",
type="flag",
description="List available formats without downloading"
),
CmdletArg(
name="audio",
type="flag",
alias="a",
description="Download audio only (extract from video)"
),
CmdletArg(
name="video",
type="flag",
alias="v",
description="Download video (default if not specified)"
),
CmdletArg(
name="format",
type="string",
alias="fmt",
description="Explicit yt-dlp format selector (e.g., 'bestvideo+bestaudio')"
),
CmdletArg(
name="clip",
type="string",
description="Extract time range: MM:SS-MM:SS (e.g., 34:03-35:08) or seconds"
),
CmdletArg(
name="section",
type="string",
description="Download sections (yt-dlp only): TIME_RANGE[,TIME_RANGE...] (e.g., '1:30-1:35,0:05-0:15')"
),
CmdletArg(
name="cookies",
type="string",
description="Path to cookies.txt file for authentication"
),
CmdletArg(
name="torrent",
type="flag",
description="Download torrent/magnet via AllDebrid (requires API key in config)"
),
CmdletArg(
name="wait",
type="float",
description="Wait time (seconds) for magnet processing timeout"
),
CmdletArg(
name="item",
type="string",
alias="items",
description="Item selection for playlists/formats: use '-item N' to select format N, or '-item' to show table for @N selection in next command"
),
SharedArgs.STORAGE, # Storage location: local, hydrus, 0x0, debrid, ftp
],
details=[
"Download media from URLs with advanced features.",
"",
"BASIC USAGE:",
" download-data https://youtube.com/watch?v=xyz",
" download-data https://example.com/file.pdf -storage local",
"",
"AUDIO/VIDEO OPTIONS:",
" -audio, -a Extract audio from video (M4A, MP3)",
" -video, -v Download as video (default)",
"",
"FORMAT SELECTION:",
" -format SELECTOR Specify yt-dlp format",
" Examples: 'best', 'bestvideo+bestaudio', '22'",
"",
"FORMAT/RESULT ITEM SELECTION:",
" -item Show available formats in table (see @N below)",
" -item N Auto-select and download format #N (e.g., -item 1)",
" Example: download-data URL -item 2 | add-file -storage local",
"",
"FORMAT SELECTION WITH @N SYNTAX:",
" 1. Show formats: download-data URL",
" 2. Select with @N: @1 | download-data | add-file",
" OR use -item N to skip manual selection",
"",
"CLIPPING:",
" -clip START-END Extract time range from media",
" Format: MM:SS-MM:SS (e.g., 34:03-35:08)",
" Also accepts: 2043-2108 (seconds)",
"",
"SECTION DOWNLOAD (yt-dlp only):",
" -section RANGES Download specific time sections and merge them",
" Format: HH:MM:SS-HH:MM:SS[,HH:MM:SS-HH:MM:SS...]",
" Example: -section '1:30-1:35,0:05-0:15'",
" Each section is downloaded separately then merged in order",
"",
"PLAYLIST MODE:",
" Automatically detects playlists",
" Shows numbered list of tracks",
" Download specific items: -item '1,3,5-8'",
" Download all items: -item '*'",
"",
"TORRENT MODE:",
" Download torrents/magnets via AllDebrid (if configured)",
" Usage: download-data -torrent magnet:?xt=urn:btih:... -item '1,3,5-8'",
" -wait SECONDS Maximum wait time for magnet processing (default: 1800)",
"",
"STORAGE LOCATIONS:",
" -storage local ~/Videos (default)",
" -storage hydrus ~/.hydrus/client_files",
" -storage 0x0 ~/Screenshots",
" -storage debrid ~/Debrid",
" -storage ftp ~/FTP",
"",
"EXAMPLES:",
" # Download YouTube video as audio",
" download-data https://youtube.com/watch?v=xyz -audio -storage local",
"",
" # Extract specific clip from video",
" download-data https://vimeo.com/123456 -clip 1:30-2:45 -format best",
"",
" # Download multiple sections and merge them",
" download-data https://youtube.com/watch?v=xyz -section '1:30-1:35,0:05-0:15' | merge-file | add-file -storage local",
"",
" # Download specific tracks from playlist",
" download-data https://youtube.com/playlist?list=xyz -item '1,3,5-8'",
"",
" # Download all items from playlist",
" download-data https://youtube.com/playlist?list=xyz -item '*'",
"",
" # Download with authentication",
" download-data https://example.com/content -cookies ~/cookies.txt",
"",
"TORRENT EXAMPLES:",
" # Download specific tracks from magnet link",
" download-data -torrent magnet:?xt=urn:btih:... -item '1,3,5-8' -storage local",
"",
" # Download all items from torrent and merge",
" download-data -torrent magnet:?xt=urn:btih:... -item '*' | merge-file | add-file",
"",
" # Download with custom wait time (5 minutes)",
" download-data -torrent magnet:?xt=urn:btih:... -wait 300 -item '1-5'",
]
)