commit bd69119996 (parent d75c644a82)
Author: nose
Date: 2025-11-25 20:09:33 -08:00

80 changed files with 39615 additions and 0 deletions

helper/__init__.py (new file, 92 lines)

@@ -0,0 +1,92 @@
"""Helper modules for the downlow mpv integration."""
from . import hydrus as _hydrus
from . import download as _download
from . import tasks as _tasks
from . import utils as _utils
try: # Optional dependency on Playwright
from . import webshot as _webshot
except Exception as exc: # pragma: no cover - surfaced when Playwright is missing
_webshot = None # type: ignore
ScreenshotError = None # type: ignore[assignment]
ScreenshotOptions = None # type: ignore[assignment]
ScreenshotResult = None # type: ignore[assignment]
capture_screenshot = None # type: ignore[assignment]
ScreenshotImportError = exc # type: ignore[assignment]
else:
ScreenshotError = _webshot.ScreenshotError
ScreenshotOptions = _webshot.ScreenshotOptions
ScreenshotResult = _webshot.ScreenshotResult
capture_screenshot = _webshot.capture_screenshot
ScreenshotImportError = None
# CBOR utilities
decode_cbor = _utils.decode_cbor
jsonify = _utils.jsonify
# General utilities
CHUNK_SIZE = _utils.CHUNK_SIZE
ensure_directory = _utils.ensure_directory
unique_path = _utils.unique_path
download_hydrus_file = _hydrus.download_hydrus_file
sanitize_metadata_value = _utils.sanitize_metadata_value
unique_preserve_order = _utils.unique_preserve_order
sha256_file = _utils.sha256_file
create_metadata_sidecar = _utils.create_metadata_sidecar
create_tags_sidecar = _utils.create_tags_sidecar
# Format utilities
format_bytes = _utils.format_bytes
format_duration = _utils.format_duration
format_timestamp = _utils.format_timestamp
format_metadata_value = _utils.format_metadata_value
# Link utilities
extract_link = _utils.extract_link
extract_link_from_args = _utils.extract_link_from_args
extract_link_from_result = _utils.extract_link_from_result
get_api_key = _utils.get_api_key
add_direct_link_to_result = _utils.add_direct_link_to_result
# URL policy utilities
resolve_url_policy = _utils.resolve_url_policy
UrlPolicy = _utils.UrlPolicy
# Download utilities
DownloadOptions = _download.DownloadOptions
DownloadError = _download.DownloadError
DownloadMediaResult = _download.DownloadMediaResult
download_media = _download.download_media
is_url_supported_by_ytdlp = _download.is_url_supported_by_ytdlp
probe_url = _download.probe_url
# Hydrus utilities
hydrus_request = _hydrus.hydrus_request
hydrus_export = _hydrus.hydrus_export
HydrusClient = _hydrus.HydrusClient
HydrusRequestError = _hydrus.HydrusRequestError
connect_ipc = _tasks.connect_ipc
ipc_sender = _tasks.ipc_sender
__all__ = [
'decode_cbor',
'jsonify',
'CHUNK_SIZE',
'ensure_directory',
'unique_path',
'download_hydrus_file',
'sanitize_metadata_value',
'unique_preserve_order',
'sha256_file',
'resolve_url_policy',
'UrlPolicy',
'ScreenshotError',
'ScreenshotOptions',
'ScreenshotResult',
'capture_screenshot',
'ScreenshotImportError',
'DownloadOptions',
'DownloadError',
'DownloadMediaResult',
'download_media',
'is_url_supported_by_ytdlp',
'probe_url',
'HydrusClient',
'HydrusRequestError',
'hydrus_request',
'hydrus_export',
'connect_ipc',
'ipc_sender',
]
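
Because the webshot import is optional, callers are expected to check ScreenshotImportError before touching the screenshot re-exports. A minimal sketch of that pattern (the print statements are illustrative only):

import helper

if helper.ScreenshotImportError is not None:
    # Playwright (or another webshot dependency) is missing; the screenshot
    # symbols are all None in this case.
    print(f"Screenshots unavailable: {helper.ScreenshotImportError}")
else:
    # capture_screenshot / ScreenshotOptions are usable here.
    print("Screenshot support loaded:", callable(helper.capture_screenshot))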

helper/adjective.json (new file, 130 lines)

@@ -0,0 +1,130 @@
{
"Occult": [
"esoterica",
"ritual",
"alchemy",
"magic",
"hermetic",
"divination",
"grimoires",
"symbolism",
"ceremony"
],
"Philosophy": [
"ethics",
"metaphysics",
"epistemology",
"logic",
"existentialism",
"stoicism",
"phenomenology",
"dialectic",
"aesthetics"
],
"Mystery": [
"investigation",
"crime",
"detective",
"noir",
"thriller",
"suspense",
"conspiracy",
"whodunit",
"clues"
],
"Religion": [
"scripture",
"theology",
"worship",
"ritual",
"doctrine",
"faith",
"tradition",
"liturgy",
"sacred"
],
"Mythology": [
"gods",
"creation",
"heroes",
"legends",
"folklore",
"pantheon",
"epic",
"mythic",
"archetype"
],
"Science": [
"research",
"experiment",
"theory",
"biology",
"physics",
"chemistry",
"data",
"method",
"innovation"
],
"Art": [
"visual",
"painting",
"sculpture",
"modernism",
"technique",
"studio",
"curation",
"expression",
"composition"
],
"Literature": [
"fiction",
"poetry",
"novel",
"criticism",
"narrative",
"prose",
"drama",
"canonical",
"translation"
],
"History": [
"archaeology",
"chronicle",
"period",
"empire",
"revolution",
"archive",
"heritage",
"historiography",
"timeline"
],
"Psychology": [
"cognition",
"behavior",
"therapy",
"development",
"neuroscience",
"personality",
"perception",
"emotion",
"motivation"
],
"gnostic": [
"religion",
"scripture",
"gnostic",
"gospel",
"wisdom",
"spirituality",
"ancient",
"philosophy",
"esoteric",
"mysticism",
"mythology",
"theology",
"sacred",
"divine",
"apocrapha",
"gnosticism"
]
}

helper/alldebrid.py (new file, 829 lines)

@@ -0,0 +1,829 @@
"""AllDebrid API integration for converting free links to direct downloads.
AllDebrid is a debrid service that unlocks free file hosters and provides direct download links.
API docs: https://docs.alldebrid.com/#general-informations
"""
from __future__ import annotations
import json
import sys
from helper.logger import log, debug
import time
import logging
from pathlib import Path
from typing import Any, Dict, Optional, Set, List, Sequence
from urllib.parse import urlencode, urlparse
from .http_client import HTTPClient
logger = logging.getLogger(__name__)
class AllDebridError(Exception):
"""Raised when AllDebrid API request fails."""
pass
# Cache for supported hosters (domain -> host info)
_SUPPORTED_HOSTERS_CACHE: Optional[Dict[str, Dict[str, Any]]] = None
_CACHE_TIMESTAMP: float = 0
_CACHE_DURATION: float = 3600 # 1 hour
class AllDebridClient:
"""Client for AllDebrid API."""
# Try both v4 and v3 APIs
BASE_URLS = [
"https://api.alldebrid.com/v4",
"https://api.alldebrid.com/v3",
]
def __init__(self, api_key: str):
"""Initialize AllDebrid client with API key.
Args:
api_key: AllDebrid API key from config
"""
self.api_key = api_key.strip()
if not self.api_key:
raise AllDebridError("AllDebrid API key is empty")
self.base_url = self.BASE_URLS[0] # Start with v4
def _request(self, endpoint: str, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
"""Make a request to AllDebrid API.
Args:
endpoint: API endpoint (e.g., "user/profile", "link/unlock")
params: Query parameters
Returns:
Parsed JSON response
Raises:
AllDebridError: If request fails or API returns error
"""
if params is None:
params = {}
# Add API key to params
params['apikey'] = self.api_key
url = f"{self.base_url}/{endpoint}"
query_string = urlencode(params)
full_url = f"{url}?{query_string}"
logger.debug(f"[AllDebrid] {endpoint} request to {full_url[:80]}...")
try:
# Pass timeout to HTTPClient init, not to get()
with HTTPClient(timeout=30.0, headers={'User-Agent': 'downlow/1.0'}) as client:
try:
response = client.get(full_url)
response.raise_for_status()
except Exception as req_err:
# Log detailed error info
logger.error(f"[AllDebrid] Request error to {full_url[:80]}: {req_err}", exc_info=True)
if hasattr(req_err, 'response') and req_err.response is not None: # type: ignore
try:
error_body = req_err.response.content.decode('utf-8') # type: ignore
logger.error(f"[AllDebrid] Response body: {error_body[:200]}")
except Exception:
pass
raise
data = json.loads(response.content.decode('utf-8'))
logger.debug(f"[AllDebrid] Response status: {response.status_code}")
# Check for API errors
if data.get('status') == 'error':
error_msg = data.get('error', {}).get('message', 'Unknown error')
logger.error(f"[AllDebrid] API error: {error_msg}")
raise AllDebridError(f"AllDebrid API error: {error_msg}")
return data
except AllDebridError:
raise
except Exception as exc:
error_msg = f"AllDebrid request failed: {exc}"
logger.error(f"[AllDebrid] {error_msg}", exc_info=True)
raise AllDebridError(error_msg)
def unlock_link(self, link: str) -> Optional[str]:
"""Unlock a restricted link and get direct download URL.
Args:
link: Restricted link to unlock
Returns:
Direct download URL, or None if already unrestricted
Raises:
AllDebridError: If unlock fails
"""
if not link.startswith(('http://', 'https://')):
raise AllDebridError(f"Invalid URL: {link}")
try:
response = self._request('link/unlock', {'link': link})
# Check if unlock was successful
if response.get('status') == 'success':
data = response.get('data', {})
# AllDebrid returns the download info in 'link' field
if 'link' in data:
return data['link']
# Alternative: check for 'file' field
if 'file' in data:
return data['file']
# If no direct link, return the input link
return link
return None
except AllDebridError:
raise
except Exception as exc:
raise AllDebridError(f"Failed to unlock link: {exc}")
def check_host(self, hostname: str) -> Dict[str, Any]:
"""Check if a host is supported by AllDebrid.
Args:
hostname: Hostname to check (e.g., "uploadhaven.com")
Returns:
Host information dict with support status
Raises:
AllDebridError: If request fails
"""
try:
response = self._request('host', {'name': hostname})
if response.get('status') == 'success':
return response.get('data', {})
return {}
except AllDebridError:
raise
except Exception as exc:
raise AllDebridError(f"Failed to check host: {exc}")
def get_user_info(self) -> Dict[str, Any]:
"""Get current user account information.
Returns:
User information dict
Raises:
AllDebridError: If request fails
"""
try:
response = self._request('user/profile')
if response.get('status') == 'success':
return response.get('data', {})
return {}
except AllDebridError:
raise
except Exception as exc:
raise AllDebridError(f"Failed to get user info: {exc}")
def get_supported_hosters(self) -> Dict[str, Dict[str, Any]]:
"""Get list of all supported hosters from AllDebrid API.
Returns:
Dict mapping domain to host info (status, name, etc)
Raises:
AllDebridError: If request fails
"""
try:
response = self._request('hosts/domains')
if response.get('status') == 'success':
data = response.get('data', {})
# The API returns hosts keyed by domain
return data if isinstance(data, dict) else {}
return {}
except AllDebridError:
raise
except Exception as exc:
raise AllDebridError(f"Failed to get supported hosters: {exc}")
def magnet_add(self, magnet_uri: str) -> Dict[str, Any]:
"""Submit a magnet link or torrent hash to AllDebrid for processing.
AllDebrid will download the torrent content and store it in the account.
Processing time varies based on torrent size and availability.
Args:
magnet_uri: Magnet URI (magnet:?xt=urn:btih:...) or torrent hash
Returns:
Dict with magnet info:
- id: Magnet ID (int) - needed for status checks
- name: Torrent name
- hash: Torrent hash
- size: Total file size (bytes)
- ready: Boolean - True if already available
Raises:
AllDebridError: If submit fails (requires premium, invalid magnet, etc)
"""
if not magnet_uri:
raise AllDebridError("Magnet URI is empty")
try:
# API endpoint: POST /v4/magnet/upload
# Format: /magnet/upload?apikey=key&magnets[]=magnet:?xt=...
response = self._request('magnet/upload', {'magnets[]': magnet_uri})
if response.get('status') == 'success':
data = response.get('data', {})
magnets = data.get('magnets', [])
if magnets and len(magnets) > 0:
magnet_info = magnets[0]
# Check for errors in the magnet response
if 'error' in magnet_info:
error = magnet_info['error']
error_msg = error.get('message', 'Unknown error')
raise AllDebridError(f"Magnet error: {error_msg}")
return magnet_info
raise AllDebridError("No magnet data in response")
raise AllDebridError(f"API error: {response.get('error', 'Unknown')}")
except AllDebridError:
raise
except Exception as exc:
raise AllDebridError(f"Failed to submit magnet: {exc}")
def magnet_status(self, magnet_id: int, include_files: bool = False) -> Dict[str, Any]:
"""Get status of a magnet currently being processed or stored.
Status codes:
0-3: Processing (in queue, downloading, compressing, uploading)
4: Ready (files available for download)
5-15: Error (upload failed, not downloaded in 20min, too big, etc)
Args:
magnet_id: Magnet ID from magnet_add()
include_files: If True, includes file list in response
Returns:
Dict with status info:
- id: Magnet ID
- filename: Torrent name
- size: Total size (bytes)
- status: Human-readable status
- statusCode: Numeric code (0-15)
- downloaded: Bytes downloaded so far
- uploaded: Bytes uploaded so far
- seeders: Number of seeders
- downloadSpeed: Current speed (bytes/sec)
- uploadSpeed: Current speed (bytes/sec)
- files: (optional) Array of file objects when include_files=True
Each file: {n: name, s: size, l: download_link}
Raises:
AllDebridError: If status check fails
"""
if not isinstance(magnet_id, int) or magnet_id <= 0:
raise AllDebridError(f"Invalid magnet ID: {magnet_id}")
try:
# Use v4.1 endpoint for better response format
# Temporarily override base_url for this request
old_base = self.base_url
self.base_url = "https://api.alldebrid.com/v4.1"
try:
response = self._request('magnet/status', {'id': str(magnet_id)})
finally:
self.base_url = old_base
if response.get('status') == 'success':
data = response.get('data', {})
magnets = data.get('magnets', {})
# Handle both list and dict responses
if isinstance(magnets, list) and len(magnets) > 0:
return magnets[0]
elif isinstance(magnets, dict) and magnets:
return magnets
raise AllDebridError(f"No magnet found with ID {magnet_id}")
raise AllDebridError(f"API error: {response.get('error', 'Unknown')}")
except AllDebridError:
raise
except Exception as exc:
raise AllDebridError(f"Failed to get magnet status: {exc}")
def magnet_status_live(self, magnet_id: int, session: Optional[int] = None, counter: int = 0) -> Dict[str, Any]:
"""Get live status of a magnet using delta sync mode.
The live mode endpoint provides real-time progress by only sending
deltas (changed fields) instead of full status on each call. This
reduces bandwidth and server load compared to regular polling.
Note: The "live" designation refers to the delta-sync mode where you
maintain state locally and apply diffs from the API, not a streaming
endpoint. Regular magnet_status() polling is simpler for single magnets.
Docs: https://docs.alldebrid.com/#get-status-live-mode
Args:
magnet_id: Magnet ID from magnet_add()
session: Session ID (use same ID across multiple calls). If None, will query current status
counter: Counter value from previous response (starts at 0)
Returns:
Dict with magnet status. May contain only changed fields if counter > 0.
For single-magnet tracking, use magnet_status() instead.
Raises:
AllDebridError: If request fails
"""
if not isinstance(magnet_id, int) or magnet_id <= 0:
raise AllDebridError(f"Invalid magnet ID: {magnet_id}")
try:
# For single magnet queries, just use regular endpoint with ID
# The "live mode" with session/counter is for multi-magnet dashboards
# where bandwidth savings from diffs matter
response = self._request('magnet/status', {'id': str(magnet_id)})
if response.get('status') == 'success':
data = response.get('data', {})
magnets = data.get('magnets', [])
# Handle list response
if isinstance(magnets, list) and len(magnets) > 0:
return magnets[0]
raise AllDebridError(f"No magnet found with ID {magnet_id}")
raise AllDebridError(f"API error: {response.get('error', 'Unknown')}")
except AllDebridError:
raise
except Exception as exc:
raise AllDebridError(f"Failed to get magnet live status: {exc}")
def magnet_links(self, magnet_ids: list) -> Dict[str, Any]:
"""Get files and download links for one or more magnets.
Use this after magnet_status shows statusCode == 4 (Ready).
Returns the file tree structure with direct download links.
Args:
magnet_ids: List of magnet IDs to get files for
Returns:
Dict mapping magnet_id (as string) -> magnet_info:
- id: Magnet ID
- files: Array of file/folder objects
File: {n: name, s: size, l: direct_download_link}
Folder: {n: name, e: [sub_items]}
Raises:
AllDebridError: If request fails
"""
if not magnet_ids:
raise AllDebridError("No magnet IDs provided")
try:
# Build parameter: id[]=123&id[]=456 style
params = {}
for i, magnet_id in enumerate(magnet_ids):
params[f'id[{i}]'] = str(magnet_id)
response = self._request('magnet/files', params)
if response.get('status') == 'success':
data = response.get('data', {})
magnets = data.get('magnets', [])
# Convert list to dict keyed by ID (as string) for easier access
result = {}
for magnet_info in magnets:
magnet_id = magnet_info.get('id')
if magnet_id:
result[str(magnet_id)] = magnet_info
return result
raise AllDebridError(f"API error: {response.get('error', 'Unknown')}")
except AllDebridError:
raise
except Exception as exc:
raise AllDebridError(f"Failed to get magnet files: {exc}")
def instant_available(self, magnet_hash: str) -> Optional[List[Dict[str, Any]]]:
"""Check if magnet is available for instant streaming without downloading.
AllDebrid's "instant" feature checks if a magnet can be streamed directly
without downloading all the data. Returns available video/audio files.
Args:
magnet_hash: Torrent hash (with or without magnet: prefix)
Returns:
List of available files for streaming, or None if not available
Each file: {n: name, s: size, e: extension, t: type}
Returns empty list if torrent not found or not available
Raises:
AllDebridError: If API request fails
"""
try:
# Parse magnet hash if needed
if magnet_hash.startswith('magnet:'):
# Extract hash from magnet URI
import re
match = re.search(r'xt=urn:btih:([a-fA-F0-9]+)', magnet_hash)
if not match:
return None
hash_value = match.group(1)
else:
hash_value = magnet_hash.strip()
if not hash_value or len(hash_value) < 32:
return None
response = self._request('magnet/instant', {'magnet': hash_value})
if response.get('status') == 'success':
data = response.get('data', {})
# Returns 'files' array if available, or empty
return data.get('files', [])
# Not available is not an error, just return empty list
return []
except AllDebridError:
raise
except Exception as exc:
logger.debug(f"[AllDebrid] instant_available check failed: {exc}")
return None
def magnet_delete(self, magnet_id: int) -> bool:
"""Delete a magnet from the AllDebrid account.
Args:
magnet_id: Magnet ID to delete
Returns:
True if deletion was successful
Raises:
AllDebridError: If deletion fails
"""
if not isinstance(magnet_id, int) or magnet_id <= 0:
raise AllDebridError(f"Invalid magnet ID: {magnet_id}")
try:
response = self._request('magnet/delete', {'id': str(magnet_id)})
if response.get('status') == 'success':
return True
raise AllDebridError(f"API error: {response.get('error', 'Unknown')}")
except AllDebridError:
raise
except Exception as exc:
raise AllDebridError(f"Failed to delete magnet: {exc}")
def _get_cached_supported_hosters(api_key: str) -> Set[str]:
"""Get cached list of supported hoster domains.
Uses AllDebrid API to fetch the list once per hour,
caching the result to avoid repeated API calls.
Args:
api_key: AllDebrid API key
Returns:
Set of supported domain names (lowercased)
"""
global _SUPPORTED_HOSTERS_CACHE, _CACHE_TIMESTAMP
now = time.time()
# Return cached result if still valid
if _SUPPORTED_HOSTERS_CACHE is not None and (now - _CACHE_TIMESTAMP) < _CACHE_DURATION:
return set(_SUPPORTED_HOSTERS_CACHE.keys())
# Fetch fresh list from API
try:
client = AllDebridClient(api_key)
hosters_dict = client.get_supported_hosters()
if hosters_dict:
# API returns: hosts (list), streams (list), redirectors (list)
# Combine all into a single set
all_domains: Set[str] = set()
# Add hosts
if 'hosts' in hosters_dict and isinstance(hosters_dict['hosts'], list):
all_domains.update(hosters_dict['hosts'])
# Add streams
if 'streams' in hosters_dict and isinstance(hosters_dict['streams'], list):
all_domains.update(hosters_dict['streams'])
# Add redirectors
if 'redirectors' in hosters_dict and isinstance(hosters_dict['redirectors'], list):
all_domains.update(hosters_dict['redirectors'])
# Cache as dict for consistency
_SUPPORTED_HOSTERS_CACHE = {domain: {} for domain in all_domains}
_CACHE_TIMESTAMP = now
if all_domains:
debug(f"✓ Cached {len(all_domains)} supported hosters")
return all_domains
except Exception as exc:
log(f"⚠ Failed to fetch supported hosters: {exc}", file=sys.stderr)
# Return any cached hosters even if expired
if _SUPPORTED_HOSTERS_CACHE:
return set(_SUPPORTED_HOSTERS_CACHE.keys())
# Fallback: empty set if no cache available
return set()
def is_link_restrictable_hoster(url: str, api_key: str) -> bool:
"""Check if a URL is from a hoster that AllDebrid can unlock.
Intelligently queries the AllDebrid API to detect if the URL is
from a supported restricted hoster.
Args:
url: URL to check
api_key: AllDebrid API key
Returns:
True if URL is from a supported restrictable hoster
"""
if not url or not api_key:
return False
try:
# Extract domain from URL
parsed = urlparse(url)
domain = parsed.netloc.lower()
# Remove www. prefix for comparison
if domain.startswith('www.'):
domain = domain[4:]
# Get supported hosters (cached)
supported = _get_cached_supported_hosters(api_key)
if not supported:
# API check failed, fall back to manual detection
# Check for common restricted hosters
common_hosters = {
'uploadhaven.com', 'uploaded.to', 'uploaded.net',
'datafile.com', 'rapidfile.io', 'nitroflare.com',
'1fichier.com', 'mega.nz', 'mediafire.com'
}
return any(host in url.lower() for host in common_hosters)
# Check if domain is in supported list
# Need to check exact match and with/without www
return domain in supported or f"www.{domain}" in supported
except Exception as exc:
log(f"⚠ Hoster detection failed: {exc}", file=sys.stderr)
return False
def convert_link_with_debrid(link: str, api_key: str) -> Optional[str]:
"""Convert a restricted link to a direct download URL using AllDebrid.
Args:
link: Restricted link
api_key: AllDebrid API key
Returns:
Direct download URL, or original link if already unrestricted
"""
if not api_key:
return None
try:
client = AllDebridClient(api_key)
direct_link = client.unlock_link(link)
if direct_link and direct_link != link:
debug(f"✓ Converted link: {link[:60]}... → {direct_link[:60]}...")
return direct_link
return None
except AllDebridError as exc:
log(f"⚠ Failed to convert link: {exc}", file=sys.stderr)
return None
except Exception as exc:
log(f"⚠ Unexpected error: {exc}", file=sys.stderr)
return None
def is_magnet_link(uri: str) -> bool:
"""Check if a URI is a magnet link.
Magnet links start with 'magnet:?xt=urn:btih:' or just 'magnet:'
Args:
uri: URI to check
Returns:
True if URI is a magnet link
"""
if not uri:
return False
return uri.lower().startswith('magnet:')
def is_torrent_hash(text: str) -> bool:
"""Check if text looks like a torrent hash (40 or 64 hex characters).
Common formats:
- Info hash v1: 40 hex chars (SHA-1)
- Info hash v2: 64 hex chars (SHA-256)
Args:
text: Text to check
Returns:
True if text matches torrent hash format
"""
if not text or not isinstance(text, str):
return False
text = text.strip()
# Check if it's 40 hex chars (SHA-1) or 64 hex chars (SHA-256)
if len(text) not in (40, 64):
return False
try:
# Try to parse as hex
int(text, 16)
return True
except ValueError:
return False
def is_torrent_file(path: str) -> bool:
"""Check if a file path is a .torrent file.
Args:
path: File path to check
Returns:
True if file has .torrent extension
"""
if not path:
return False
return path.lower().endswith('.torrent')
def parse_magnet_or_hash(uri: str) -> Optional[str]:
"""Parse a magnet URI or hash into a format for AllDebrid API.
AllDebrid's magnet/upload endpoint accepts:
- Full magnet URIs: magnet:?xt=urn:btih:...
- Info hashes: 40 or 64 hex characters
Args:
uri: Magnet URI or hash
Returns:
Normalized input for AllDebrid API, or None if invalid
"""
if not uri:
return None
uri = uri.strip()
# Already a magnet link - just return it
if is_magnet_link(uri):
return uri
# Check if it's a valid hash
if is_torrent_hash(uri):
return uri
# Not a recognized format
return None
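
A few illustrative checks for the helpers above (all values are synthetic):

assert is_magnet_link("magnet:?xt=urn:btih:abcdef0123456789")
assert is_torrent_hash("a" * 40) and is_torrent_hash("b" * 64)   # v1 / v2 hashes
assert not is_torrent_hash("not-a-hash")
assert is_torrent_file("Some.Book.torrent")
assert parse_magnet_or_hash("  " + "a" * 40 + "  ") == "a" * 40
assert parse_magnet_or_hash("https://example.com/file") is None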
# ============================================================================
# Cmdlet: unlock_link
# ============================================================================
def unlock_link_cmdlet(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"""Unlock a restricted link using AllDebrid.
Converts free hosters and restricted links to direct download URLs.
Usage:
unlock-link <link>
unlock-link # Uses URL from pipeline result
Requires:
- AllDebrid API key in config under Debrid.All-debrid
Args:
result: Pipeline result object
args: Command arguments
config: Configuration dictionary
Returns:
0 on success, 1 on failure
"""
try:
from .link_utils import (
extract_link,
get_api_key,
add_direct_link_to_result,
)
except ImportError as e:
log(f"Required modules unavailable: {e}", file=sys.stderr)
return 1
# Get link from args or result
link = extract_link(result, args)
if not link:
log("No valid URL provided", file=sys.stderr)
return 1
# Get AllDebrid API key from config
api_key = get_api_key(config, "AllDebrid", "Debrid.All-debrid")
if not api_key:
log("AllDebrid API key not configured in Debrid.All-debrid", file=sys.stderr)
return 1
# Try to unlock the link
debug(f"Unlocking: {link}")
direct_link = convert_link_with_debrid(link, api_key)
if direct_link:
debug(f"✓ Direct link: {direct_link}")
# Update result with direct link
add_direct_link_to_result(result, direct_link, link)
# Return the updated result via pipeline context
# Note: The cmdlet wrapper will handle emitting to pipeline
return 0
else:
log(f"❌ Failed to unlock link or already unrestricted", file=sys.stderr)
return 1
# ============================================================================
# Cmdlet Registration
# ============================================================================
def _register_unlock_link():
"""Register unlock-link command with cmdlet registry if available."""
try:
from cmdlets import register
@register(["unlock-link"])
def unlock_link_wrapper(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"""Wrapper to make unlock_link_cmdlet available as cmdlet."""
import pipeline as ctx
ret_code = unlock_link_cmdlet(result, args, config)
# If successful, emit the result
if ret_code == 0:
ctx.emit(result)
return ret_code
return unlock_link_wrapper
except ImportError:
# If cmdlets module not available, just return None
return None
# Register when module is imported
_unlock_link_registration = _register_unlock_link()
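
The cmdlet resolves its key via get_api_key(config, "AllDebrid", "Debrid.All-debrid"), so the expected config nesting is inferred from that call rather than documented here. A hypothetical shape, roughly:

# Hypothetical config shape; the real loader and key lookup may differ.
config = {
    "Debrid": {
        "All-debrid": "YOUR_ALLDEBRID_API_KEY",   # placeholder value
    },
}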

helper/archive_client.py (new file, 567 lines)

@@ -0,0 +1,567 @@
"""Archive.org API client for borrowing and downloading books.
This module provides low-level functions for interacting with Archive.org:
- Authentication (login, credential management)
- Borrowing (loan, return_loan)
- Book metadata extraction (get_book_infos, get_book_metadata)
- Image downloading and deobfuscation
- PDF creation with metadata
Used by unified_book_downloader.py for the borrowing workflow.
"""
from __future__ import annotations
import base64
import hashlib
import logging
import os
import re
import sys
import time
from concurrent import futures
from typing import Any, Dict, List, Optional, Sequence, Tuple
import requests
from helper.logger import log, debug
try:
from Crypto.Cipher import AES # type: ignore
from Crypto.Util import Counter # type: ignore
except ImportError:
AES = None # type: ignore
Counter = None # type: ignore
try:
from tqdm import tqdm # type: ignore
except ImportError:
tqdm = None # type: ignore
def credential_openlibrary(config: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
"""Get OpenLibrary/Archive.org email and password from config.
Supports both formats:
- New: {"provider": {"openlibrary": {"email": "...", "password": "..."}}}
- Old: {"Archive": {"email": "...", "password": "..."}}
{"archive_org_email": "...", "archive_org_password": "..."}
Returns: (email, password) tuple, each can be None
"""
if not isinstance(config, dict):
return None, None
# Try new format first
provider_config = config.get("provider", {})
if isinstance(provider_config, dict):
openlibrary_config = provider_config.get("openlibrary", {})
if isinstance(openlibrary_config, dict):
email = openlibrary_config.get("email")
password = openlibrary_config.get("password")
if email or password:
return email, password
# Try old nested format
archive_config = config.get("Archive")
if isinstance(archive_config, dict):
email = archive_config.get("email")
password = archive_config.get("password")
if email or password:
return email, password
# Fall back to old flat format
email = config.get("archive_org_email")
password = config.get("archive_org_password")
return email, password
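
A small sanity sketch exercising the three config shapes the docstring lists (credential values are placeholders):

new_style = {"provider": {"openlibrary": {"email": "me@example.com", "password": "hunter2"}}}
old_nested = {"Archive": {"email": "me@example.com", "password": "hunter2"}}
old_flat = {"archive_org_email": "me@example.com", "archive_org_password": "hunter2"}

assert credential_openlibrary(new_style) == ("me@example.com", "hunter2")
assert credential_openlibrary(old_nested) == ("me@example.com", "hunter2")
assert credential_openlibrary(old_flat) == ("me@example.com", "hunter2")
assert credential_openlibrary({}) == (None, None)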
def display_error(response: requests.Response, message: str) -> None:
"""Display error and exit."""
log(message, file=sys.stderr)
log(response.text, file=sys.stderr)
sys.exit(1)
def login(email: str, password: str) -> requests.Session:
"""Login to archive.org.
Args:
email: Archive.org email
password: Archive.org password
Returns:
Authenticated requests.Session
Raises:
SystemExit on login failure
"""
session = requests.Session()
session.get("https://archive.org/account/login", timeout=30)
data = {"username": email, "password": password}
response = session.post("https://archive.org/account/login", data=data, timeout=30)
if "bad_login" in response.text:
log("Invalid credentials!", file=sys.stderr)
sys.exit(1)
if "Successful login" in response.text:
debug("Successful login")
return session
display_error(response, "[-] Error while login:")
sys.exit(1) # Unreachable but satisfies type checker
def loan(session: requests.Session, book_id: str, verbose: bool = True) -> requests.Session:
"""Borrow a book from archive.org (14-day loan).
Args:
session: Authenticated requests.Session from login()
book_id: Archive.org book identifier (e.g., 'ia_book_id')
verbose: Whether to log messages
Returns:
Session with active loan
Raises:
SystemExit on loan failure
"""
data = {"action": "grant_access", "identifier": book_id}
response = session.post("https://archive.org/services/loans/loan/searchInside.php", data=data, timeout=30)
data["action"] = "browse_book"
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
if response.status_code == 400:
try:
if response.json()["error"] == "This book is not available to borrow at this time. Please try again later.":
debug("This book doesn't need to be borrowed")
return session
display_error(response, "Something went wrong when trying to borrow the book.")
except Exception:
display_error(response, "The book cannot be borrowed")
data["action"] = "create_token"
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
if "token" in response.text:
if verbose:
debug("Successful loan")
return session
display_error(response, "Something went wrong when trying to borrow the book.")
sys.exit(1) # Unreachable but satisfies type checker
def return_loan(session: requests.Session, book_id: str) -> None:
"""Return a borrowed book.
Args:
session: Authenticated requests.Session with active loan
book_id: Archive.org book identifier
"""
data = {"action": "return_loan", "identifier": book_id}
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
if response.status_code == 200 and response.json()["success"]:
debug("Book returned")
else:
display_error(response, "Something went wrong when trying to return the book")
def get_book_infos(session: requests.Session, url: str) -> Tuple[str, List[str], Dict[str, Any]]:
"""Extract book information and page links from archive.org viewer.
Args:
session: Authenticated requests.Session
url: Book URL (e.g., https://archive.org/borrow/book_id or /details/book_id)
Returns:
Tuple of (title, page_links, metadata)
Raises:
RuntimeError: If page data cannot be extracted
"""
r = session.get(url, timeout=30).text
# Try to extract the infos URL from the response
try:
# Look for the "url" field in the response
if '"url":"' not in r:
raise ValueError("No 'url' field found in response")
infos_url = "https:" + r.split('"url":"')[1].split('"')[0].replace("\\u0026", "&")
except (IndexError, ValueError) as e:
# If URL extraction fails, raise with better error message
raise RuntimeError(f"Failed to extract book info URL from response: {e}")
response = session.get(infos_url, timeout=30)
data = response.json()["data"]
title = data["brOptions"]["bookTitle"].strip().replace(" ", "_")
title = "".join(c for c in title if c not in '<>:"/\\|?*') # Filter forbidden chars
title = title[:150] # Trim to avoid long file names
metadata = data["metadata"]
links = []
# Safely extract page links from brOptions data
try:
br_data = data.get("brOptions", {}).get("data", [])
for item in br_data:
if isinstance(item, list):
for page in item:
if isinstance(page, dict) and "uri" in page:
links.append(page["uri"])
elif isinstance(item, dict) and "uri" in item:
links.append(item["uri"])
except (KeyError, IndexError, TypeError) as e:
log(f"Warning: Error parsing page links: {e}", file=sys.stderr)
# Continue with whatever links we found
if len(links) > 1:
debug(f"Found {len(links)} pages")
return title, links, metadata
elif len(links) == 1:
debug(f"Found {len(links)} page")
return title, links, metadata
else:
log("Error while getting image links - no pages found", file=sys.stderr)
raise RuntimeError("No pages found in book data")
def image_name(pages: int, page: int, directory: str) -> str:
"""Generate image filename for page.
Args:
pages: Total number of pages
page: Current page number (0-indexed)
directory: Directory to save to
Returns:
Full path to image file
"""
return f"{directory}/{(len(str(pages)) - len(str(page))) * '0'}{page}.jpg"
def deobfuscate_image(image_data: bytes, link: str, obf_header: str) -> bytes:
"""Decrypt obfuscated image data using AES-CTR.
This handles Archive.org's image obfuscation for borrowed books.
Based on: https://github.com/justimm
Args:
image_data: Encrypted image bytes
link: Image URL (used to derive AES key)
obf_header: X-Obfuscate header value (format: "1|BASE64_COUNTER")
Returns:
Decrypted image bytes
"""
if not AES or not Counter:
raise RuntimeError("Crypto library not available")
try:
version, counter_b64 = obf_header.split("|")
except Exception as e:
raise ValueError("Invalid X-Obfuscate header format") from e
if version != "1":
raise ValueError("Unsupported obfuscation version: " + version)
# Derive AES key from URL
aesKey = re.sub(r"^https?:\/\/.*?\/", "/", link)
sha1_digest = hashlib.sha1(aesKey.encode("utf-8")).digest()
key = sha1_digest[:16]
# Decode counter
counter_bytes = base64.b64decode(counter_b64)
if len(counter_bytes) != 16:
raise ValueError(f"Expected counter to be 16 bytes, got {len(counter_bytes)}")
prefix = counter_bytes[:8]
initial_value = int.from_bytes(counter_bytes[8:], byteorder="big")
# Create AES-CTR cipher
ctr = Counter.new(64, prefix=prefix, initial_value=initial_value, little_endian=False) # type: ignore
cipher = AES.new(key, AES.MODE_CTR, counter=ctr) # type: ignore
decrypted_part = cipher.decrypt(image_data[:1024])
new_data = decrypted_part + image_data[1024:]
return new_data
def download_one_image(
session: requests.Session,
link: str,
i: int,
directory: str,
book_id: str,
pages: int,
) -> None:
"""Download a single book page image.
Handles obfuscated images and re-borrowing on 403 errors.
Args:
session: Authenticated requests.Session
link: Direct image URL
i: Page index (0-based)
directory: Directory to save to
book_id: Archive.org book ID (for re-borrowing on 403)
pages: Total number of pages
"""
headers = {
"Referer": "https://archive.org/",
"Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
"Sec-Fetch-Site": "same-site",
"Sec-Fetch-Mode": "no-cors",
"Sec-Fetch-Dest": "image",
}
retry = True
response = None
while retry:
try:
response = session.get(link, headers=headers, timeout=30)
if response.status_code == 403:
session = loan(session, book_id, verbose=False)
raise Exception("Borrow again")
if response.status_code == 200:
retry = False
except Exception:
time.sleep(1)
image = image_name(pages, i, directory)
if response is None:
log(f"Failed to download page {i}", file=sys.stderr)
return
obf_header = response.headers.get("X-Obfuscate")
image_content = None
if obf_header:
try:
image_content = deobfuscate_image(response.content, link, obf_header)
except Exception as e:
log(f"Deobfuscation failed: {e}", file=sys.stderr)
return
else:
image_content = response.content
with open(image, "wb") as f:
f.write(image_content)
def download(
session: requests.Session,
n_threads: int,
directory: str,
links: List[str],
scale: int,
book_id: str,
) -> List[str]:
"""Download all book pages as images.
Uses thread pool for parallel downloads.
Args:
session: Authenticated requests.Session
n_threads: Number of download threads
directory: Directory to save images to
links: List of image URLs
scale: Image resolution (0=highest, 10=lowest)
book_id: Archive.org book ID (for re-borrowing)
Returns:
List of downloaded image file paths
"""
debug("Downloading pages...")
links = [f"{link}&rotate=0&scale={scale}" for link in links]
pages = len(links)
tasks = []
with futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
for i, link in enumerate(links):
tasks.append(
executor.submit(
download_one_image,
session=session,
link=link,
i=i,
directory=directory,
book_id=book_id,
pages=pages,
)
)
if tqdm:
for _ in tqdm(futures.as_completed(tasks), total=len(tasks)): # type: ignore
pass
else:
for _ in futures.as_completed(tasks):
pass
images = [image_name(pages, i, directory) for i in range(len(links))]
return images
def check_direct_download(book_id: str) -> Tuple[bool, str]:
"""Check if a book can be downloaded directly without borrowing.
Searches Archive.org metadata for downloadable PDF files.
Args:
book_id: Archive.org book identifier
Returns:
Tuple of (can_download: bool, pdf_url: str)
"""
try:
# First, try to get the metadata to find the actual PDF filename
metadata_url = f"https://archive.org/metadata/{book_id}"
response = requests.get(metadata_url, timeout=10)
response.raise_for_status()
metadata = response.json()
# Find PDF file in files list
if "files" in metadata:
for file_info in metadata["files"]:
filename = file_info.get("name", "")
if filename.endswith(".pdf") and file_info.get("source") == "original":
# Found the original PDF
pdf_filename = filename
pdf_url = f"https://archive.org/download/{book_id}/{pdf_filename.replace(' ', '%20')}"
# Verify it's accessible
check_response = requests.head(pdf_url, timeout=5, allow_redirects=True)
if check_response.status_code == 200:
return True, pdf_url
return False, ""
except Exception as e:
log(f"Error checking direct download: {e}", file=sys.stderr)
return False, ""
def get_openlibrary_by_isbn(isbn: str) -> Dict[str, Any]:
"""Fetch book data from OpenLibrary using ISBN.
Args:
isbn: ISBN-10 or ISBN-13 to search for
Returns:
Dictionary with book metadata from OpenLibrary
"""
try:
# Try ISBN API first
api_url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn}&jscmd=data&format=json"
response = requests.get(api_url, timeout=10)
response.raise_for_status()
data = response.json()
if data:
# Get first result
key = list(data.keys())[0]
return data[key]
return {}
except Exception as e:
log(f"Error fetching OpenLibrary data by ISBN: {e}", file=sys.stderr)
return {}
def extract_isbn_from_metadata(metadata: Dict[str, Any]) -> str:
"""Extract ISBN from archive.org metadata.
Looks for ISBN in various metadata fields.
Args:
metadata: Archive.org metadata dictionary
Returns:
ISBN string (clean, no hyphens) or empty string if not found
"""
# Try various common metadata fields
isbn_fields = [
"isbn", "ISBN", "isbn_13", "isbn_10", "isbns",
"isbn-10", "isbn-13", "identifer_isbn"
]
for field in isbn_fields:
if field in metadata:
isbn_val = metadata[field]
if isinstance(isbn_val, list):
isbn_val = isbn_val[0] if isbn_val else None
if isbn_val and isinstance(isbn_val, str):
# Clean ISBN (remove hyphens, spaces)
isbn_clean = isbn_val.replace("-", "").replace(" ", "")
if len(isbn_clean) in [10, 13]:
return isbn_clean
return ""
def normalize_url(url: str) -> str:
"""Convert openlibrary.org URL to archive.org URL.
Looks up the actual Archive.org ID from OpenLibrary API.
Args:
url: Book URL (archive.org or openlibrary.org format)
Returns:
Normalized archive.org URL
"""
url = url.strip()
# Already archive.org format
if url.startswith("https://archive.org/details/"):
return url
# Convert openlibrary.org format by querying the OpenLibrary API
if "openlibrary.org/books/" in url:
try:
# Extract the book ID (e.g., OL6796852M)
parts = url.split("/books/")
if len(parts) > 1:
book_id = parts[1].split("/")[0]
# Query OpenLibrary API to get the book metadata
api_url = f"https://openlibrary.org/books/{book_id}.json"
response = requests.get(api_url, timeout=10)
response.raise_for_status()
data = response.json()
# Look for identifiers including internet_archive or ocaid
# First try ocaid (Open Content Alliance ID) - this is most common
if "ocaid" in data:
ocaid = data["ocaid"]
return f"https://archive.org/details/{ocaid}"
# Check for identifiers object
if "identifiers" in data:
identifiers = data["identifiers"]
# Look for internet_archive ID
if "internet_archive" in identifiers:
ia_ids = identifiers["internet_archive"]
if isinstance(ia_ids, list) and ia_ids:
ia_id = ia_ids[0]
else:
ia_id = ia_ids
return f"https://archive.org/details/{ia_id}"
# If no IA identifier found, use the book ID as fallback
log(f"No Internet Archive ID found for {book_id}. Attempting with OpenLibrary ID.", file=sys.stderr)
return f"https://archive.org/details/{book_id}"
except requests.RequestException as e:
log(f"Could not fetch OpenLibrary metadata: {e}", file=sys.stderr)
# Fallback to using the book ID directly
parts = url.split("/books/")
if len(parts) > 1:
book_id = parts[1].split("/")[0]
return f"https://archive.org/details/{book_id}"
except (KeyError, IndexError) as e:
log(f"Error parsing OpenLibrary response: {e}", file=sys.stderr)
# Fallback to using the book ID directly
parts = url.split("/books/")
if len(parts) > 1:
book_id = parts[1].split("/")[0]
return f"https://archive.org/details/{book_id}"
# Return original if can't parse
return url
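
Illustrative behaviour, assuming the OpenLibrary record resolves to an Internet Archive identifier via its ocaid field (the exact identifier depends on the live API response):

# archive.org URLs pass through untouched
normalize_url("https://archive.org/details/someidentifier")
# -> "https://archive.org/details/someidentifier"

# openlibrary.org URLs are resolved through the OpenLibrary JSON API
normalize_url("https://openlibrary.org/books/OL6796852M/Some_Title")
# -> "https://archive.org/details/<ocaid of OL6796852M>", falling back to
#    "https://archive.org/details/OL6796852M" when no IA identifier is found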

helper/download.py (new file, 730 lines)

@@ -0,0 +1,730 @@
"""Download media files using yt-dlp with support for direct file downloads.
Lean, focused downloader without event infrastructure overhead.
- yt-dlp integration for streaming sites
- Direct file download fallback for PDFs, images, documents
- Tag extraction via metadata.extract_ytdlp_tags()
- Logging via helper.logger.log()
"""
from __future__ import annotations
import re # noqa: F401
import sys
import time
import traceback
from pathlib import Path
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urljoin
import httpx
from helper.logger import log, debug
from .utils import ensure_directory, sha256_file
from .http_client import HTTPClient
from models import DownloadError, DownloadOptions, DownloadMediaResult, DebugLogger, ProgressBar
try:
import yt_dlp # type: ignore
from yt_dlp.extractor import gen_extractors # type: ignore
except Exception as exc:
yt_dlp = None # type: ignore
YTDLP_IMPORT_ERROR = exc
else:
YTDLP_IMPORT_ERROR = None
try:
from metadata import extract_ytdlp_tags
except ImportError:
extract_ytdlp_tags = None
_EXTRACTOR_CACHE: List[Any] | None = None
def _ensure_yt_dlp_ready() -> None:
"""Verify yt-dlp is available, raise if not."""
if yt_dlp is not None:
return
detail = str(YTDLP_IMPORT_ERROR or "yt-dlp is not installed")
raise DownloadError(f"yt-dlp module not available: {detail}")
def _progress_callback(status: Dict[str, Any]) -> None:
"""Simple progress callback using logger."""
event = status.get("status")
if event == "downloading":
percent = status.get("_percent_str", "?")
speed = status.get("_speed_str", "?")
debug(f"Downloading {percent} at {speed}")
elif event == "finished":
debug(f"✓ Download finished: {status.get('filename')}")
elif event in ("postprocessing", "processing"):
debug(f"Post-processing: {status.get('postprocessor')}")
def is_url_supported_by_ytdlp(url: str) -> bool:
"""Check if URL is supported by yt-dlp."""
if yt_dlp is None:
return False
global _EXTRACTOR_CACHE
if _EXTRACTOR_CACHE is None:
try:
_EXTRACTOR_CACHE = [ie for ie in gen_extractors()] # type: ignore[arg-type]
except Exception:
_EXTRACTOR_CACHE = []
for extractor in _EXTRACTOR_CACHE:
try:
if not extractor.suitable(url):
continue
except Exception:
continue
name = getattr(extractor, "IE_NAME", "")
if name.lower() == "generic":
continue
return True
return False
def list_formats(url: str, no_playlist: bool = False, playlist_items: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
"""Get list of available formats for a URL using yt-dlp.
Args:
url: URL to get formats for
no_playlist: If True, ignore playlists and list formats for single video
playlist_items: If specified, only list formats for these playlist items (e.g., "1,3,5-8")
Returns:
List of format dictionaries with keys: format_id, format, resolution, fps, vcodec, acodec, filesize, etc.
Returns None if yt-dlp is not available or format listing fails.
"""
_ensure_yt_dlp_ready()
try:
ydl_opts = {
"quiet": False,
"no_warnings": False,
"socket_timeout": 30,
}
# Add no_playlist option if specified
if no_playlist:
ydl_opts["noplaylist"] = True
# Add playlist_items filter if specified
if playlist_items:
ydl_opts["playlist_items"] = playlist_items
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
debug(f"Fetching format list for: {url}")
info = ydl.extract_info(url, download=False)
formats = info.get("formats", [])
if not formats:
log("No formats available", file=sys.stderr)
return None
# Parse and extract relevant format info
result_formats = []
for fmt in formats:
format_info = {
"format_id": fmt.get("format_id", ""),
"format": fmt.get("format", ""),
"ext": fmt.get("ext", ""),
"resolution": fmt.get("resolution", ""),
"width": fmt.get("width"),
"height": fmt.get("height"),
"fps": fmt.get("fps"),
"vcodec": fmt.get("vcodec", "none"),
"acodec": fmt.get("acodec", "none"),
"filesize": fmt.get("filesize"),
"tbr": fmt.get("tbr"), # Total bitrate
}
result_formats.append(format_info)
debug(f"Found {len(result_formats)} available formats")
return result_formats
except Exception as e:
log(f"✗ Error fetching formats: {e}", file=sys.stderr)
return None
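
A minimal usage sketch for list_formats; the URL is a placeholder and the printed columns mirror the keys assembled above:

formats = list_formats("https://www.youtube.com/watch?v=...", no_playlist=True)
if formats:
    for fmt in formats:
        print(fmt["format_id"], fmt["ext"], fmt["resolution"],
              fmt["vcodec"], fmt["acodec"], fmt.get("filesize"))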
def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]:
"""Build yt-dlp download options."""
ensure_directory(opts.output_dir)
outtmpl = str((opts.output_dir / "%(title)s.%(ext)s").resolve())
base_options: Dict[str, Any] = {
"outtmpl": outtmpl,
"quiet": False,
"no_warnings": False,
"noprogress": False,
"socket_timeout": 30,
"retries": 10,
"fragment_retries": 10,
"http_chunk_size": 10_485_760,
"restrictfilenames": True,
"progress_hooks": [_progress_callback],
}
if opts.cookies_path and opts.cookies_path.is_file():
base_options["cookiefile"] = str(opts.cookies_path)
# Add no-playlist option if specified (for single video from playlist URLs)
if opts.no_playlist:
base_options["noplaylist"] = True
# Configure based on mode
if opts.mode == "audio":
base_options["format"] = opts.ytdl_format or "251/140/bestaudio"
base_options["postprocessors"] = [{"key": "FFmpegExtractAudio"}]
else: # video
base_options["format"] = opts.ytdl_format or "bestvideo+bestaudio/best"
base_options["format_sort"] = [
"res:4320", "res:2880", "res:2160", "res:1440", "res:1080", "res:720", "res"
]
# Add clip sections if provided
if opts.clip_sections:
base_options["download_sections"] = opts.clip_sections
# Add playlist items selection if provided
if opts.playlist_items:
base_options["playlist_items"] = opts.playlist_items
debug(f"yt-dlp: mode={opts.mode}, format={base_options.get('format')}")
return base_options
def _iter_download_entries(info: Dict[str, Any]) -> Iterator[Dict[str, Any]]:
"""Iterate through download entries, handling playlists."""
queue: List[Dict[str, Any]] = [info]
seen: set[int] = set()
while queue:
current = queue.pop(0)
obj_id = id(current)
if obj_id in seen:
continue
seen.add(obj_id)
entries = current.get("entries")
if isinstance(entries, list):
for entry in entries:
if isinstance(entry, dict):
queue.append(entry)
if current.get("requested_downloads") or not entries:
yield current
def _candidate_paths(entry: Dict[str, Any], output_dir: Path) -> Iterator[Path]:
"""Get candidate file paths for downloaded media."""
requested = entry.get("requested_downloads")
if isinstance(requested, list):
for item in requested:
if isinstance(item, dict):
for key in ("filepath", "_filename", "filename"):
value = item.get(key)
if value:
yield Path(value)
for key in ("filepath", "_filename", "filename"):
value = entry.get(key)
if value:
yield Path(value)
if entry.get("filename"):
yield output_dir / entry["filename"]
def _resolve_entry_and_path(info: Dict[str, Any], output_dir: Path) -> tuple[Dict[str, Any], Path]:
"""Find downloaded file in yt-dlp metadata."""
for entry in _iter_download_entries(info):
for candidate in _candidate_paths(entry, output_dir):
if candidate.is_file():
return entry, candidate
if not candidate.is_absolute():
resolved = output_dir / candidate
if resolved.is_file():
return entry, resolved
raise FileNotFoundError("yt-dlp did not report a downloaded media file")
def _extract_sha256(info: Dict[str, Any]) -> Optional[str]:
"""Extract SHA256 hash from yt-dlp metadata."""
for payload in [info] + info.get("entries", []):
if not isinstance(payload, dict):
continue
hashes = payload.get("hashes")
if isinstance(hashes, dict):
for key in ("sha256", "sha-256", "sha_256"):
value = hashes.get(key)
if isinstance(value, str) and value.strip():
return value.strip().lower()
for key in ("sha256", "sha-256", "sha_256"):
value = payload.get(key)
if isinstance(value, str) and value.strip():
return value.strip().lower()
return None
def _get_libgen_download_url(libgen_url: str) -> Optional[str]:
"""Extract the actual download link from LibGen redirect URL.
LibGen URLs like https://libgen.gl/file.php?id=123456 redirect to
actual mirror URLs. This follows the redirect chain to get the real file.
Args:
libgen_url: LibGen file.php URL
Returns:
Actual download URL or None if extraction fails
"""
try:
import requests
from urllib.parse import urlparse
# Check if this is a LibGen URL
parsed = urlparse(libgen_url)
if 'libgen' not in parsed.netloc.lower():
return None
if '/file.php' not in parsed.path.lower():
return None
# LibGen redirects to actual mirrors, follow redirects to get final URL
session = requests.Session()
session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
})
debug(f"Following LibGen redirect chain for: {libgen_url}")
# First, get the page and look for direct download link
try:
response = session.get(libgen_url, timeout=10, allow_redirects=True)
final_url = response.url
# Try to find actual download link in the page
try:
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')
# Look for download links - LibGen typically has forms with download buttons
# Look for all links and forms that might lead to download
for link in soup.find_all('a'):
href = link.get('href')
if href and isinstance(href, str):
# Look for direct file links or get.php redirects
if 'get.php' in href.lower() or href.endswith(('.pdf', '.epub', '.djvu', '.mobi')):
download_url = href if href.startswith('http') else urljoin(final_url, href)
debug(f"Found download link: {download_url}")
return download_url
except ImportError:
pass # BeautifulSoup not available
# If we followed redirects successfully, return the final URL
# This handles cases where libgen redirects to a direct download mirror
if final_url != libgen_url:
debug(f"LibGen resolved to mirror: {final_url}")
return final_url
except requests.RequestException as e:
log(f"Error following LibGen redirects: {e}", file=sys.stderr)
# Try head request as fallback
try:
response = session.head(libgen_url, allow_redirects=True, timeout=10)
if response.url != libgen_url:
debug(f"LibGen HEAD resolved to: {response.url}")
return response.url
except Exception:
pass
return None
except Exception as e:
log(f"Error resolving LibGen URL: {e}", file=sys.stderr)
return None
def _download_direct_file(
url: str,
output_dir: Path,
debug_logger: Optional[DebugLogger] = None,
) -> DownloadMediaResult:
"""Download a direct file (PDF, image, document, etc.) without yt-dlp."""
ensure_directory(output_dir)
from urllib.parse import unquote, urlparse, parse_qs
import re
# Extract filename from URL
parsed_url = urlparse(url)
url_path = parsed_url.path
# Try to get filename from query parameters first (for LibGen and similar services)
# e.g., ?filename=Book+Title.pdf or &download=filename.pdf
filename = None
if parsed_url.query:
query_params = parse_qs(parsed_url.query)
for param_name in ('filename', 'download', 'file', 'name'):
if param_name in query_params and query_params[param_name]:
filename = query_params[param_name][0]
filename = unquote(filename)
break
# If not found in query params, extract from URL path
if not filename or not filename.strip():
filename = url_path.split("/")[-1] if url_path else ""
filename = unquote(filename)
# Remove query strings from filename if any
if "?" in filename:
filename = filename.split("?")[0]
# Try to get real filename from Content-Disposition header (HEAD request)
try:
with HTTPClient(timeout=10.0) as client:
response = client._request("HEAD", url, follow_redirects=True)
content_disposition = response.headers.get("content-disposition", "")
if content_disposition:
# Extract filename from Content-Disposition header
# Format: attachment; filename="filename.pdf" or filename=filename.pdf
match = re.search(r'filename\*?=(?:"([^"]*)"|([^;\s]*))', content_disposition)
if match:
extracted_name = match.group(1) or match.group(2)
if extracted_name:
filename = unquote(extracted_name)
debug(f"Filename from Content-Disposition: {filename}")
except Exception as e:
log(f"Could not get filename from headers: {e}", file=sys.stderr)
# Fallback if we still don't have a good filename
if not filename or "." not in filename:
filename = "downloaded_file.bin"
file_path = output_dir / filename
progress_bar = ProgressBar()
debug(f"Direct download: {filename}")
try:
start_time = time.time()
downloaded_bytes = [0]
total_bytes = [0]
last_progress_time = [start_time]
def progress_callback(bytes_downloaded: int, content_length: int) -> None:
downloaded_bytes[0] = bytes_downloaded
total_bytes[0] = content_length
now = time.time()
if now - last_progress_time[0] >= 0.5 and total_bytes[0] > 0:
elapsed = now - start_time
percent = (bytes_downloaded / content_length) * 100 if content_length > 0 else 0
speed = bytes_downloaded / elapsed if elapsed > 0 else 0
eta_seconds = (content_length - bytes_downloaded) / speed if speed > 0 else 0
speed_str = progress_bar.format_bytes(speed) + "/s"
minutes, seconds = divmod(int(eta_seconds), 60)
hours, minutes = divmod(minutes, 60)
eta_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
progress_line = progress_bar.format_progress(
percent_str=f"{percent:.1f}%",
downloaded=bytes_downloaded,
total=content_length,
speed_str=speed_str,
eta_str=eta_str,
)
debug(progress_line)
last_progress_time[0] = now
with HTTPClient(timeout=30.0) as client:
client.download(url, str(file_path), progress_callback=progress_callback)
elapsed = time.time() - start_time
avg_speed_str = progress_bar.format_bytes(downloaded_bytes[0] / elapsed if elapsed > 0 else 0) + "/s"
debug(f"✓ Downloaded in {elapsed:.1f}s at {avg_speed_str}")
# For direct file downloads, create minimal info dict without filename as title
# This prevents creating duplicate title: tags when filename gets auto-generated
# We'll add title back later only if we couldn't extract meaningful tags
info = {
"id": filename.rsplit(".", 1)[0],
"ext": filename.rsplit(".", 1)[1] if "." in filename else "bin",
"webpage_url": url,
}
hash_value = None
try:
hash_value = sha256_file(file_path)
except Exception:
pass
tags = []
if extract_ytdlp_tags:
try:
tags = extract_ytdlp_tags(info)
except Exception as e:
log(f"Error extracting tags: {e}", file=sys.stderr)
# Only use filename as a title tag if we couldn't extract any meaningful tags
# This prevents duplicate title: tags when the filename could be mistaken for metadata
if not any(t.startswith('title:') for t in tags):
# Re-extract tags with filename as title only if needed
info['title'] = filename
tags = []
if extract_ytdlp_tags:
try:
tags = extract_ytdlp_tags(info)
except Exception as e:
log(f"Error extracting tags with filename: {e}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"direct-file-downloaded",
{"url": url, "path": str(file_path), "hash": hash_value},
)
return DownloadMediaResult(
path=file_path,
info=info,
tags=tags,
source_url=url,
hash_value=hash_value,
)
except (httpx.HTTPError, httpx.RequestError) as exc:
log(f"Download error: {exc}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"exception",
{"phase": "direct-file", "url": url, "error": str(exc)},
)
raise DownloadError(f"Failed to download {url}: {exc}") from exc
except Exception as exc:
log(f"Error downloading file: {exc}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"exception",
{
"phase": "direct-file",
"url": url,
"error": str(exc),
"traceback": traceback.format_exc(),
},
)
raise DownloadError(f"Error downloading file: {exc}") from exc
def probe_url(url: str, no_playlist: bool = False) -> Optional[Dict[str, Any]]:
"""Probe URL to extract metadata WITHOUT downloading.
Args:
url: URL to probe
no_playlist: If True, ignore playlists and probe only the single video
Returns:
Dict with keys: extractor, title, entries (if playlist), duration, etc.
Returns None if not supported by yt-dlp.
"""
if not is_url_supported_by_ytdlp(url):
return None
_ensure_yt_dlp_ready()
assert yt_dlp is not None
try:
# Extract info without downloading
# Use extract_flat='in_playlist' to get full metadata for playlist items
ydl_opts = {
"quiet": True, # Suppress all output
"no_warnings": True,
"socket_timeout": 10,
"retries": 3,
"skip_download": True, # Don't actually download
"extract_flat": "in_playlist", # Get playlist with metadata for each entry
"noprogress": True, # No progress bars
"quiet": True,
}
# Add no_playlist option if specified
if no_playlist:
ydl_opts["noplaylist"] = True
with yt_dlp.YoutubeDL(ydl_opts) as ydl: # type: ignore[arg-type]
info = ydl.extract_info(url, download=False)
if not isinstance(info, dict):
return None
# Extract relevant fields
return {
"extractor": info.get("extractor", ""),
"title": info.get("title", ""),
"entries": info.get("entries", []), # Will be populated if playlist
"duration": info.get("duration"),
"uploader": info.get("uploader"),
"description": info.get("description"),
"url": url,
}
except Exception as exc:
log(f"Probe failed for {url}: {exc}")
return None
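# Usage sketch (illustrative, not part of the module): callers typically probe a
# URL first to decide between single-item and playlist handling. The URL below is
# a placeholder.
#
#     probed = probe_url("https://example.com/watch?v=abc", no_playlist=False)
#     if probed is None:
#         pass  # not supported by yt-dlp; fall back to a direct download
#     elif probed.get("entries"):
#         pass  # playlist: each entry carries flat metadata
#     else:
#         pass  # single item: title, duration, uploader, description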
def download_media(
opts: DownloadOptions,
*,
debug_logger: Optional[DebugLogger] = None,
) -> DownloadMediaResult:
"""Download media from URL using yt-dlp or direct HTTP download.
Args:
opts: DownloadOptions with url, mode, output_dir, etc.
debug_logger: Optional debug logger for troubleshooting
Returns:
DownloadMediaResult with path, info, tags, hash
Raises:
DownloadError: If download fails
"""
# Handle LibGen URLs specially
# file.php redirects to mirrors, get.php is direct from modern API
if 'libgen' in opts.url.lower():
if '/get.php' in opts.url.lower():
# Modern API get.php links are direct downloads from mirrors (not file redirects)
log(f"Detected LibGen get.php URL, downloading directly...")
if debug_logger is not None:
debug_logger.write_record("libgen-direct", {"url": opts.url})
return _download_direct_file(opts.url, opts.output_dir, debug_logger)
elif '/file.php' in opts.url.lower():
# Old-style file.php links redirect to mirrors, so resolve the real URL first
log("Detected LibGen file.php URL, resolving to actual mirror...")
actual_url = _get_libgen_download_url(opts.url)
if actual_url and actual_url != opts.url:
log(f"Resolved LibGen URL to mirror: {actual_url}")
if debug_logger is not None:
debug_logger.write_record("libgen-resolved", {"original": opts.url, "resolved": actual_url})
# The resolved target is typically an onion link or a direct file;
# yt-dlp won't handle these mirrors, so download it directly.
opts.url = actual_url
return _download_direct_file(opts.url, opts.output_dir, debug_logger)
else:
log(f"Could not resolve LibGen URL, trying direct download anyway", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record("libgen-resolve-failed", {"url": opts.url})
return _download_direct_file(opts.url, opts.output_dir, debug_logger)
# Try yt-dlp first if URL is supported
if not is_url_supported_by_ytdlp(opts.url):
log(f"URL not supported by yt-dlp, trying direct download: {opts.url}")
if debug_logger is not None:
debug_logger.write_record("direct-file-attempt", {"url": opts.url})
return _download_direct_file(opts.url, opts.output_dir, debug_logger)
_ensure_yt_dlp_ready()
ytdl_options = _build_ytdlp_options(opts)
log(f"Starting yt-dlp download: {opts.url}")
if debug_logger is not None:
debug_logger.write_record("ytdlp-start", {"url": opts.url})
assert yt_dlp is not None
try:
with yt_dlp.YoutubeDL(ytdl_options) as ydl: # type: ignore[arg-type]
info = ydl.extract_info(opts.url, download=True)
except Exception as exc:
log(f"yt-dlp failed: {exc}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"exception",
{
"phase": "yt-dlp",
"error": str(exc),
"traceback": traceback.format_exc(),
},
)
raise DownloadError("yt-dlp download failed") from exc
if not isinstance(info, dict):
log(f"Unexpected yt-dlp response: {type(info)}", file=sys.stderr)
raise DownloadError("Unexpected yt-dlp response type")
info_dict: Dict[str, Any] = info
if debug_logger is not None:
debug_logger.write_record(
"ytdlp-info",
{
"keys": sorted(info_dict.keys()),
"is_playlist": bool(info_dict.get("entries")),
},
)
try:
entry, media_path = _resolve_entry_and_path(info_dict, opts.output_dir)
except FileNotFoundError as exc:
log(f"Error: {exc}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"exception",
{"phase": "resolve-path", "error": str(exc)},
)
raise DownloadError(str(exc)) from exc
if debug_logger is not None:
debug_logger.write_record(
"resolved-media",
{"path": str(media_path), "entry_keys": sorted(entry.keys())},
)
# Extract hash from metadata or compute
hash_value = _extract_sha256(entry) or _extract_sha256(info_dict)
if not hash_value:
try:
hash_value = sha256_file(media_path)
except OSError as exc:
if debug_logger is not None:
debug_logger.write_record(
"hash-error",
{"path": str(media_path), "error": str(exc)},
)
# Extract tags using metadata.py
tags = []
if extract_ytdlp_tags:
try:
tags = extract_ytdlp_tags(entry)
except Exception as e:
log(f"Error extracting tags: {e}", file=sys.stderr)
source_url = (
entry.get("webpage_url")
or entry.get("original_url")
or entry.get("url")
)
log(f"✓ Downloaded: {media_path.name} ({len(tags)} tags)")
if debug_logger is not None:
debug_logger.write_record(
"downloaded",
{
"path": str(media_path),
"tag_count": len(tags),
"source_url": source_url,
"sha256": hash_value,
},
)
return DownloadMediaResult(
path=media_path,
info=entry,
tags=tags,
source_url=source_url,
hash_value=hash_value,
)
__all__ = [
"download_media",
"is_url_supported_by_ytdlp",
"DownloadError",
"DownloadOptions",
"DownloadMediaResult",
]
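# Usage sketch (illustrative; DownloadOptions is defined earlier in this module
# and may require more fields than the ones shown here):
#
#     opts = DownloadOptions(url="https://example.com/video", output_dir=Path("downloads"))
#     try:
#         result = download_media(opts)
#         print(result.path, result.hash_value, len(result.tags))
#     except DownloadError as exc:
#         print(f"download failed: {exc}")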

180
helper/file_server.py Normal file
View File

@@ -0,0 +1,180 @@
"""Simple HTTP file server for serving files in web mode."""
import threading
import socket
import logging
from http.server import HTTPServer, SimpleHTTPRequestHandler
from pathlib import Path
from typing import Optional
import mimetypes
import urllib.parse
logger = logging.getLogger(__name__)
# Global server instance
_file_server: Optional[HTTPServer] = None
_server_thread: Optional[threading.Thread] = None
_server_port: int = 8001
class FileServerHandler(SimpleHTTPRequestHandler):
"""HTTP request handler for file serving."""
def do_GET(self):
"""Handle GET requests."""
# Parse the path
parsed_path = urllib.parse.urlparse(self.path)
file_path = urllib.parse.unquote(parsed_path.path)
# Remove leading slash
if file_path.startswith('/'):
file_path = file_path[1:]
# Decode the file path (it's URL encoded)
try:
full_path = Path(file_path).resolve()
# Security check: ensure the path is within allowed directories
# For now, allow all paths (can be restricted later)
if full_path.is_file() and full_path.exists():
# Serve the file
logger.debug(f"Serving file: {full_path}")
# Determine content type
content_type, _ = mimetypes.guess_type(str(full_path))
if content_type is None:
content_type = 'application/octet-stream'
try:
with open(full_path, 'rb') as f:
file_content = f.read()
self.send_response(200)
self.send_header('Content-type', content_type)
self.send_header('Content-Length', str(len(file_content)))
self.send_header('Content-Disposition', f'attachment; filename="{full_path.name}"')
self.end_headers()
self.wfile.write(file_content)
logger.info(f"Successfully served file: {full_path.name}")
return
except Exception as e:
logger.error(f"Error serving file: {e}")
self.send_error(500, "Internal server error")
return
else:
logger.warning(f"File not found: {full_path}")
self.send_error(404, "File not found")
return
except Exception as e:
logger.error(f"Error handling request: {e}")
self.send_error(400, "Bad request")
def log_message(self, format, *args):
"""Override to use our logger instead of stderr."""
logger.debug(format % args)
def get_local_ip() -> Optional[str]:
"""Get the local IP address that's accessible from other devices."""
try:
# Connect to a remote server to determine local IP
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
s.connect(("8.8.8.8", 80))
ip = s.getsockname()[0]
s.close()
return ip
except Exception as e:
logger.warning(f"Failed to determine local IP: {e}")
return None
def start_file_server(port: int = 8001) -> Optional[str]:
"""Start the HTTP file server.
Args:
port: Port to serve on
Returns:
Server URL if successful, None otherwise
"""
global _file_server, _server_thread, _server_port
if _file_server is not None:
logger.debug(f"File server already running on port {_server_port}")
local_ip = get_local_ip()
if local_ip:
return f"http://{local_ip}:{_server_port}"
return None
try:
_server_port = port
# Create server
server_address = ('', port)
_file_server = HTTPServer(server_address, FileServerHandler)
# Start in daemon thread
_server_thread = threading.Thread(target=_file_server.serve_forever, daemon=True)
_server_thread.start()
logger.info(f"File server started on port {port}")
# Get local IP
local_ip = get_local_ip()
if local_ip:
server_url = f"http://{local_ip}:{port}"
logger.info(f"File server accessible at: {server_url}")
return server_url
else:
logger.warning("Could not determine local IP")
return None
except Exception as e:
logger.error(f"Failed to start file server: {e}")
_file_server = None
_server_thread = None
return None
def stop_file_server():
"""Stop the HTTP file server."""
global _file_server, _server_thread
if _file_server is not None:
try:
_file_server.shutdown()
_file_server.server_close()
logger.info("File server stopped")
except Exception as e:
logger.error(f"Error stopping file server: {e}")
finally:
_file_server = None
_server_thread = None
def get_file_url(file_path: Path, server_url: Optional[str] = None) -> Optional[str]:
"""Get the HTTP URL for a file.
Args:
file_path: Path to the file
server_url: Base server URL (gets determined if None)
Returns:
HTTP URL to the file, or None if server not running
"""
if not file_path.exists():
logger.warning(f"File does not exist: {file_path}")
return None
if server_url is None:
local_ip = get_local_ip()
if not local_ip:
logger.error("Cannot determine local IP for file URL")
return None
server_url = f"http://{local_ip}:{_server_port}"
# URL encode the file path
encoded_path = urllib.parse.quote(str(file_path.resolve()))
return f"{server_url}/{encoded_path}"

1039
helper/file_storage.py Normal file

File diff suppressed because it is too large

579
helper/http_client.py Normal file
View File

@@ -0,0 +1,579 @@
"""
Unified HTTP client for downlow using httpx.
Provides synchronous and asynchronous HTTP operations with:
- Automatic retries on transient failures
- Configurable timeouts and headers
- Built-in progress tracking for downloads
- Request/response logging support
"""
import httpx
import asyncio
from typing import Optional, Dict, Any, Callable, BinaryIO
from pathlib import Path
import logging
logger = logging.getLogger(__name__)
# Default configuration
DEFAULT_TIMEOUT = 30.0
DEFAULT_RETRIES = 3
DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
class HTTPClient:
"""Unified HTTP client with sync support."""
def __init__(
self,
timeout: float = DEFAULT_TIMEOUT,
retries: int = DEFAULT_RETRIES,
user_agent: str = DEFAULT_USER_AGENT,
verify_ssl: bool = True,
headers: Optional[Dict[str, str]] = None,
):
"""
Initialize HTTP client.
Args:
timeout: Request timeout in seconds
retries: Number of retries on transient failures
user_agent: User-Agent header value
verify_ssl: Whether to verify SSL certificates
headers: Additional headers to include in all requests
"""
self.timeout = timeout
self.retries = retries
self.user_agent = user_agent
self.verify_ssl = verify_ssl
self.base_headers = headers or {}
self._client: Optional[httpx.Client] = None
def __enter__(self):
"""Context manager entry."""
self._client = httpx.Client(
timeout=self.timeout,
verify=self.verify_ssl,
headers=self._get_headers(),
)
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit."""
if self._client:
self._client.close()
self._client = None
def _get_headers(self) -> Dict[str, str]:
"""Get request headers with user-agent."""
headers = {"User-Agent": self.user_agent}
headers.update(self.base_headers)
return headers
def get(
self,
url: str,
params: Optional[Dict[str, Any]] = None,
headers: Optional[Dict[str, str]] = None,
allow_redirects: bool = True,
) -> httpx.Response:
"""
Make a GET request.
Args:
url: Request URL
params: Query parameters
headers: Additional headers
allow_redirects: Follow redirects
Returns:
httpx.Response object
"""
return self._request(
"GET",
url,
params=params,
headers=headers,
follow_redirects=allow_redirects,
)
def post(
self,
url: str,
data: Optional[Any] = None,
json: Optional[Dict] = None,
files: Optional[Dict] = None,
headers: Optional[Dict[str, str]] = None,
) -> httpx.Response:
"""
Make a POST request.
Args:
url: Request URL
data: Form data
json: JSON data
files: Files to upload
headers: Additional headers
Returns:
httpx.Response object
"""
return self._request(
"POST",
url,
data=data,
json=json,
files=files,
headers=headers,
)
def put(
self,
url: str,
data: Optional[Any] = None,
json: Optional[Dict] = None,
content: Optional[Any] = None,
files: Optional[Dict] = None,
headers: Optional[Dict[str, str]] = None,
) -> httpx.Response:
"""
Make a PUT request.
Args:
url: Request URL
data: Form data
json: JSON data
content: Raw content
files: Files to upload
headers: Additional headers
Returns:
httpx.Response object
"""
return self._request(
"PUT",
url,
data=data,
json=json,
content=content,
files=files,
headers=headers,
)
def delete(
self,
url: str,
headers: Optional[Dict[str, str]] = None,
) -> httpx.Response:
"""
Make a DELETE request.
Args:
url: Request URL
headers: Additional headers
Returns:
httpx.Response object
"""
return self._request(
"DELETE",
url,
headers=headers,
)
def request(
self,
method: str,
url: str,
**kwargs
) -> httpx.Response:
"""
Make a generic HTTP request.
Args:
method: HTTP method
url: Request URL
**kwargs: Additional arguments
Returns:
httpx.Response object
"""
return self._request(method, url, **kwargs)
def download(
self,
url: str,
file_path: str,
chunk_size: int = 8192,
progress_callback: Optional[Callable[[int, int], None]] = None,
headers: Optional[Dict[str, str]] = None,
) -> Path:
"""
Download a file from URL with optional progress tracking.
Args:
url: File URL
file_path: Local file path to save to
chunk_size: Download chunk size
progress_callback: Callback(bytes_downloaded, total_bytes)
headers: Additional headers
Returns:
Path object of downloaded file
"""
path = Path(file_path)
path.parent.mkdir(parents=True, exist_ok=True)
with self._request_stream("GET", url, headers=headers, follow_redirects=True) as response:
response.raise_for_status()
total_bytes = int(response.headers.get("content-length", 0))
bytes_downloaded = 0
with open(path, "wb") as f:
for chunk in response.iter_bytes(chunk_size):
if chunk:
f.write(chunk)
bytes_downloaded += len(chunk)
if progress_callback:
progress_callback(bytes_downloaded, total_bytes)
return path
def _request(
self,
method: str,
url: str,
**kwargs
) -> httpx.Response:
"""
Make an HTTP request with automatic retries.
Args:
method: HTTP method
url: Request URL
**kwargs: Additional arguments for httpx.Client.request()
Returns:
httpx.Response object
"""
if not self._client:
raise RuntimeError("HTTPClient must be used with context manager (with statement)")
# Merge headers
if "headers" in kwargs and kwargs["headers"]:
headers = self._get_headers()
headers.update(kwargs["headers"])
kwargs["headers"] = headers
else:
kwargs["headers"] = self._get_headers()
last_exception = None
for attempt in range(self.retries):
try:
response = self._client.request(method, url, **kwargs)
response.raise_for_status()
return response
except httpx.TimeoutException as e:
last_exception = e
logger.warning(f"Timeout on attempt {attempt + 1}/{self.retries}: {url}")
if attempt < self.retries - 1:
continue
except httpx.HTTPStatusError as e:
# Don't retry on 4xx errors
if 400 <= e.response.status_code < 500:
try:
response_text = e.response.text[:500]
except Exception:
response_text = "<unable to read response>"
logger.error(f"HTTP {e.response.status_code} from {url}: {response_text}")
raise
last_exception = e
try:
response_text = e.response.text[:200]
except Exception:
response_text = "<unable to read response>"
logger.warning(f"HTTP {e.response.status_code} on attempt {attempt + 1}/{self.retries}: {url} - {response_text}")
if attempt < self.retries - 1:
continue
except (httpx.RequestError, httpx.ConnectError) as e:
last_exception = e
logger.warning(f"Connection error on attempt {attempt + 1}/{self.retries}: {url} - {e}")
if attempt < self.retries - 1:
continue
if last_exception:
logger.error(f"Request failed after {self.retries} attempts: {url} - {last_exception}")
raise last_exception
raise RuntimeError("Request failed after retries")
def _request_stream(self, method: str, url: str, **kwargs):
"""Make a streaming request."""
if not self._client:
raise RuntimeError("HTTPClient must be used with context manager (with statement)")
# Merge headers
if "headers" in kwargs and kwargs["headers"]:
headers = self._get_headers()
headers.update(kwargs["headers"])
kwargs["headers"] = headers
else:
kwargs["headers"] = self._get_headers()
return self._client.stream(method, url, **kwargs)
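# Usage sketch (illustrative, not part of the public API): stream a download with
# progress reporting. The URL and destination path are placeholders.
def _example_download_with_progress() -> Path:
    def on_progress(done: int, total: int) -> None:
        if total > 0:
            print(f"\r{done / total * 100:5.1f}%", end="")
    with HTTPClient(timeout=60.0) as client:
        return client.download(
            "https://example.com/archive.zip",
            "downloads/archive.zip",
            progress_callback=on_progress,
        )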
class AsyncHTTPClient:
"""Unified async HTTP client with asyncio support."""
def __init__(
self,
timeout: float = DEFAULT_TIMEOUT,
retries: int = DEFAULT_RETRIES,
user_agent: str = DEFAULT_USER_AGENT,
verify_ssl: bool = True,
headers: Optional[Dict[str, str]] = None,
):
"""
Initialize async HTTP client.
Args:
timeout: Request timeout in seconds
retries: Number of retries on transient failures
user_agent: User-Agent header value
verify_ssl: Whether to verify SSL certificates
headers: Additional headers to include in all requests
"""
self.timeout = timeout
self.retries = retries
self.user_agent = user_agent
self.verify_ssl = verify_ssl
self.base_headers = headers or {}
self._client: Optional[httpx.AsyncClient] = None
async def __aenter__(self):
"""Async context manager entry."""
self._client = httpx.AsyncClient(
timeout=self.timeout,
verify=self.verify_ssl,
headers=self._get_headers(),
)
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
"""Async context manager exit."""
if self._client:
await self._client.aclose()
self._client = None
def _get_headers(self) -> Dict[str, str]:
"""Get request headers with user-agent."""
headers = {"User-Agent": self.user_agent}
headers.update(self.base_headers)
return headers
async def get(
self,
url: str,
params: Optional[Dict[str, Any]] = None,
headers: Optional[Dict[str, str]] = None,
allow_redirects: bool = True,
) -> httpx.Response:
"""
Make an async GET request.
Args:
url: Request URL
params: Query parameters
headers: Additional headers
allow_redirects: Follow redirects
Returns:
httpx.Response object
"""
return await self._request(
"GET",
url,
params=params,
headers=headers,
follow_redirects=allow_redirects,
)
async def post(
self,
url: str,
data: Optional[Any] = None,
json: Optional[Dict] = None,
headers: Optional[Dict[str, str]] = None,
) -> httpx.Response:
"""
Make an async POST request.
Args:
url: Request URL
data: Form data
json: JSON data
headers: Additional headers
Returns:
httpx.Response object
"""
return await self._request(
"POST",
url,
data=data,
json=json,
headers=headers,
)
async def download(
self,
url: str,
file_path: str,
chunk_size: int = 8192,
progress_callback: Optional[Callable[[int, int], None]] = None,
headers: Optional[Dict[str, str]] = None,
) -> Path:
"""
Download a file from URL asynchronously with optional progress tracking.
Args:
url: File URL
file_path: Local file path to save to
chunk_size: Download chunk size
progress_callback: Callback(bytes_downloaded, total_bytes)
headers: Additional headers
Returns:
Path object of downloaded file
"""
path = Path(file_path)
path.parent.mkdir(parents=True, exist_ok=True)
async with self._request_stream("GET", url, headers=headers) as response:
response.raise_for_status()
total_bytes = int(response.headers.get("content-length", 0))
bytes_downloaded = 0
with open(path, "wb") as f:
async for chunk in response.aiter_bytes(chunk_size):
if chunk:
f.write(chunk)
bytes_downloaded += len(chunk)
if progress_callback:
progress_callback(bytes_downloaded, total_bytes)
return path
async def _request(
self,
method: str,
url: str,
**kwargs
) -> httpx.Response:
"""
Make an async HTTP request with automatic retries.
Args:
method: HTTP method
url: Request URL
**kwargs: Additional arguments for httpx.AsyncClient.request()
Returns:
httpx.Response object
"""
if not self._client:
raise RuntimeError("AsyncHTTPClient must be used with async context manager")
# Merge headers
if "headers" in kwargs and kwargs["headers"]:
headers = self._get_headers()
headers.update(kwargs["headers"])
kwargs["headers"] = headers
else:
kwargs["headers"] = self._get_headers()
last_exception = None
for attempt in range(self.retries):
try:
response = await self._client.request(method, url, **kwargs)
response.raise_for_status()
return response
except httpx.TimeoutException as e:
last_exception = e
logger.warning(f"Timeout on attempt {attempt + 1}/{self.retries}: {url}")
if attempt < self.retries - 1:
await asyncio.sleep(0.5) # Brief delay before retry
continue
except httpx.HTTPStatusError as e:
# Don't retry on 4xx errors
if 400 <= e.response.status_code < 500:
try:
response_text = e.response.text[:500]
except Exception:
response_text = "<unable to read response>"
logger.error(f"HTTP {e.response.status_code} from {url}: {response_text}")
raise
last_exception = e
try:
response_text = e.response.text[:200]
except Exception:
response_text = "<unable to read response>"
logger.warning(f"HTTP {e.response.status_code} on attempt {attempt + 1}/{self.retries}: {url} - {response_text}")
if attempt < self.retries - 1:
await asyncio.sleep(0.5)
continue
except (httpx.RequestError, httpx.ConnectError) as e:
last_exception = e
logger.warning(f"Connection error on attempt {attempt + 1}/{self.retries}: {url} - {e}")
if attempt < self.retries - 1:
await asyncio.sleep(0.5)
continue
if last_exception:
logger.error(f"Request failed after {self.retries} attempts: {url} - {last_exception}")
raise last_exception
raise RuntimeError("Request failed after retries")
def _request_stream(self, method: str, url: str, **kwargs):
"""Make a streaming request."""
if not self._client:
raise RuntimeError("AsyncHTTPClient must be used with async context manager")
# Merge headers
if "headers" in kwargs and kwargs["headers"]:
headers = self._get_headers()
headers.update(kwargs["headers"])
kwargs["headers"] = headers
else:
kwargs["headers"] = self._get_headers()
return self._client.stream(method, url, **kwargs)
# Convenience function for quick sync requests
def get(url: str, **kwargs) -> httpx.Response:
"""Quick GET request without context manager."""
with HTTPClient() as client:
return client.get(url, **kwargs)
def post(url: str, **kwargs) -> httpx.Response:
"""Quick POST request without context manager."""
with HTTPClient() as client:
return client.post(url, **kwargs)
def download(
url: str,
file_path: str,
progress_callback: Optional[Callable[[int, int], None]] = None,
**kwargs
) -> Path:
"""Quick file download without context manager."""
with HTTPClient() as client:
return client.download(url, file_path, progress_callback=progress_callback, **kwargs)
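# Usage sketch (illustrative): the async client mirrors the sync API and is driven
# with asyncio, e.g. asyncio.run(_example_async_get("https://example.com")).
async def _example_async_get(url: str) -> int:
    async with AsyncHTTPClient(timeout=15.0) as client:
        response = await client.get(url)
        return response.status_code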

1553
helper/hydrus.py Normal file

File diff suppressed because it is too large

377
helper/libgen_service.py Normal file
View File

@@ -0,0 +1,377 @@
"""Shared Library Genesis search and download helpers."""
from __future__ import annotations
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional
import logging
import requests
from urllib.parse import quote, urljoin
from libgen import search_sync, LibgenError
LogFn = Optional[Callable[[str], None]]
ErrorFn = Optional[Callable[[str], None]]
DEFAULT_TIMEOUT = 10.0
DEFAULT_LIMIT = 50
logging.getLogger(__name__).setLevel(logging.WARNING)
def _call(logger: LogFn, message: str) -> None:
if logger:
logger(message)
def search_libgen_no_ads(query: str, session: Optional[requests.Session] = None) -> List[Dict[str, Any]]:
"""Search Libgen without triggering ads.php requests."""
try:
from bs4 import BeautifulSoup
except ImportError: # pragma: no cover
logging.warning("BeautifulSoup not available; falling back to standard search")
return []
mirrors = [
"https://libgen.gl",
"https://libgen.vg",
"https://libgen.la",
"https://libgen.bz",
"https://libgen.gs",
]
session = session or requests.Session()
session.headers.setdefault(
"User-Agent",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
)
for mirror in mirrors:
try:
search_url = f"{mirror}/index.php?req={quote(query)}&res=100&covers=on&filesuns=all"
response = session.get(search_url, timeout=DEFAULT_TIMEOUT)
if response.status_code != 200:
continue
soup = BeautifulSoup(response.content, "html.parser")
table = soup.find("table", {"class": "catalog"})
if table is None:
for candidate in soup.find_all("table"):
rows = candidate.find_all("tr")
if len(rows) > 2:
table = candidate
break
if table is None:
logging.debug("[libgen_no_ads] No results table on %s", mirror)
continue
rows = table.find_all("tr")[1:]
results: List[Dict[str, Any]] = []
for row in rows:
try:
cells = row.find_all("td")
if len(cells) < 9:
continue
size_cell = cells[7]
file_link = size_cell.find("a")
mirror_link = ""
if file_link:
href = str(file_link.get("href", ""))
if href.startswith("/"):
mirror_link = mirror + href
elif href:
mirror_link = urljoin(mirror, href)
if not mirror_link:
title_link = cells[1].find("a") if len(cells) > 1 else None
if title_link:
href = str(title_link.get("href", ""))
if href.startswith("/"):
mirror_link = mirror + href
elif href:
mirror_link = urljoin(mirror, href)
if not mirror_link:
continue
results.append(
{
"id": "",
"mirror": mirror_link,
"cover": "",
"title": cells[1].get_text(strip=True) if len(cells) > 1 else "Unknown",
"authors": [cells[2].get_text(strip=True)]
if len(cells) > 2
else ["Unknown"],
"publisher": cells[3].get_text(strip=True) if len(cells) > 3 else "",
"year": cells[4].get_text(strip=True) if len(cells) > 4 else "",
"pages": cells[6].get_text(strip=True) if len(cells) > 6 else "",
"language": cells[5].get_text(strip=True) if len(cells) > 5 else "",
"size": cells[7].get_text(strip=True) if len(cells) > 7 else "",
"extension": cells[8].get_text(strip=True) if len(cells) > 8 else "",
"isbn": "",
}
)
except Exception as exc: # pragma: no cover - defensive
logging.debug("[libgen_no_ads] Error parsing row: %s", exc)
continue
if results:
logging.info("[libgen_no_ads] %d results from %s", len(results), mirror)
return results
except Exception as exc: # pragma: no cover - mirror issues
logging.debug("[libgen_no_ads] Mirror %s failed: %s", mirror, exc)
continue
return []
def format_book_info(book: Any) -> Dict[str, Any]:
"""Format Libgen search result into a consistent dictionary."""
filesize_bytes = 0
size_str = getattr(book, "size", "") or ""
if size_str:
parts = size_str.strip().split()
try:
value = float(parts[0])
unit = parts[1].upper() if len(parts) > 1 else "B"
if unit in {"MB", "M"}:
filesize_bytes = int(value * 1024 * 1024)
elif unit in {"GB", "G"}:
filesize_bytes = int(value * 1024 * 1024 * 1024)
elif unit in {"KB", "K"}:
filesize_bytes = int(value * 1024)
else:
filesize_bytes = int(value)
except (ValueError, IndexError): # pragma: no cover - defensive
filesize_bytes = 0
title = getattr(book, "title", "") or ""
isbn = getattr(book, "isbn", "") or ""
if not isbn and title:
import re
match = re.search(
r"((?:[\d]{10,13}(?:\s*[;,]\s*[\d]{10,13})+)|(?:[\d]{10,13})(?:\s*[;,]?\s*[\d\-]{0,50})?)\s*(?:\b|$)",
title,
)
if match:
potential_isbn = match.group(0).strip()
if re.search(r"\d{10,13}", potential_isbn):
isbn = potential_isbn
title = re.sub(r"\s+[a-z]\s*$", "", title[: match.start()].strip(), flags=re.IGNORECASE)
authors_value = getattr(book, "authors", None)
if isinstance(authors_value, Iterable) and not isinstance(authors_value, str):
authors_str = ", ".join(str(author) for author in authors_value)
else:
authors_str = str(authors_value or "Unknown")
download_links = getattr(book, "download_links", None)
mirror_url = None
if download_links and getattr(download_links, "get_link", None):
mirror_url = download_links.get_link
return {
"title": title or "Unknown",
"author": authors_str,
"publisher": getattr(book, "publisher", "") or "",
"year": getattr(book, "year", "") or "",
"pages": getattr(book, "pages", "") or "",
"language": getattr(book, "language", "") or "",
"filesize": filesize_bytes,
"filesize_str": size_str or "Unknown",
"extension": getattr(book, "extension", "") or "",
"isbn": isbn,
"mirror_url": mirror_url,
}
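# Worked example of the size handling above: a result whose size reads "2.5 MB"
# yields filesize == int(2.5 * 1024 * 1024) == 2621440, while an unparseable size
# falls back to 0 and is still reported verbatim via filesize_str.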
def search_libgen(
query: str,
limit: int = DEFAULT_LIMIT,
*,
log_info: LogFn = None,
log_error: ErrorFn = None,
session: Optional[requests.Session] = None,
) -> List[Dict[str, Any]]:
"""Search Libgen returning formatted dictionaries with multiple mirrors.
Uses HTML scraper (search_libgen_no_ads) to find books quickly.
Returns mirror URLs and book IDs that can be used to generate alternative mirrors.
"""
try:
_call(log_info, f"[search] Searching Libgen for: {query}")
session = session or requests.Session()
# Use HTML scraper - more reliable and doesn't hang on mirror resolution
_call(log_info, "[search] Using HTML scraper (search_libgen_no_ads)...")
results: List[Any] = search_libgen_no_ads(query, session=session)
if not results:
_call(log_info, "[search] No results from HTML scraper")
return []
formatted: List[Dict[str, Any]] = []
mirrors_list = [
"https://libgen.gl",
"https://libgen.vg",
"https://libgen.la",
"https://libgen.bz",
"https://libgen.gs",
]
for book in results[:limit]:
if isinstance(book, dict):
# Result from search_libgen_no_ads (HTML scraper)
authors = book.get("authors", ["Unknown"])
if isinstance(authors, list):
author_value = ", ".join(str(a) for a in authors)
else:
author_value = str(authors)
# Extract book ID from mirror URL if available
mirror = book.get("mirror", "")
book_id = ""
if mirror and "/file.php?id=" in mirror:
try:
book_id = mirror.split("/file.php?id=")[1].split("&")[0]
except (IndexError, ValueError):
pass
# Build list of alternative mirrors based on book ID
mirrors_dict = {}
if book_id:
for mirror_base in mirrors_list:
mirrors_dict[mirror_base] = f"{mirror_base}/file.php?id={book_id}"
elif mirror:
# Fallback: use the mirror we found
mirrors_dict["primary"] = mirror
formatted.append(
{
"title": book.get("title", "Unknown"),
"author": author_value,
"publisher": book.get("publisher", ""),
"year": book.get("year", ""),
"pages": book.get("pages", ""),
"language": book.get("language", ""),
"filesize": 0,
"filesize_str": book.get("size", "Unknown"),
"extension": book.get("extension", ""),
"isbn": book.get("isbn", ""),
"mirror_url": mirror, # Primary mirror
"mirrors": mirrors_dict, # Alternative mirrors
"book_id": book_id,
}
)
else:
# Fallback: try to format as book object
try:
formatted.append(format_book_info(book))
except Exception:
pass
_call(log_info, f"[search] Found {len(formatted)} result(s)")
return formatted
except LibgenError as exc:
_call(log_error, f"[search] Libgen error: {exc}")
return []
except Exception as exc: # pragma: no cover - defensive
_call(log_error, f"[search] Error: {exc}")
return []
def download_from_mirror(
mirror_url: str,
output_path: str | Path,
*,
log_info: LogFn = None,
log_error: ErrorFn = None,
session: Optional[requests.Session] = None,
) -> bool:
"""Download a Libgen file and write it to disk.
Handles Libgen redirects and ensures proper file download by:
- Following all redirects (default behavior)
- Setting User-Agent header (required by some mirrors)
- Validating that we're downloading binary content, not HTML
- Attempting alternative download method if HTML is returned
"""
session = session or requests.Session()
try:
output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
_call(log_info, f"[download] Downloading from mirror: {mirror_url}")
# Ensure session has proper headers for Libgen
if 'User-Agent' not in session.headers:
session.headers['User-Agent'] = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
)
# Download with redirects enabled (default) and referer
session.headers['Referer'] = 'https://libgen.gs/'
response = session.get(mirror_url, stream=True, timeout=30, allow_redirects=True)
response.raise_for_status()
# Check if we got HTML instead of a file (common Libgen issue)
content_type = response.headers.get('content-type', '').lower()
if 'text/html' in content_type:
_call(log_error, f"[download] Server returned HTML. Trying alternative method...")
# Try to extract file ID and use alternative CDN
try:
# Parse the HTML to extract MD5 or file ID
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')
# Look for download link in the HTML
# Common patterns: md5 hash in form, or direct link in anchor tags
download_link = None
# Try to find forms that might contain download functionality
forms = soup.find_all('form')
for form in forms:
action = form.get('action', '')
if 'download' in action.lower() or 'get' in action.lower():
download_link = action
break
if not download_link:
_call(log_error, "[download] Could not extract alternative download link from HTML")
return False
# Form actions are often relative; resolve against the page we just fetched
download_link = urljoin(response.url, download_link)
_call(log_info, f"[download] Using alternative download method: {download_link[:100]}")
# Try downloading from the alternative link
response2 = session.get(download_link, stream=True, timeout=30, allow_redirects=True)
response2.raise_for_status()
response = response2 # Use the new response
except Exception as alt_error:
_call(log_error, f"[download] Alternative method failed: {alt_error}")
return False
total_size = int(response.headers.get("content-length", 0))
downloaded = 0
with open(output_path, "wb") as handle:
for chunk in response.iter_content(chunk_size=8192):
if not chunk:
continue
handle.write(chunk)
downloaded += len(chunk)
if total_size > 0:
percent = downloaded / total_size * 100
_call(
log_info,
f"[download] {percent:.1f}% - {downloaded // (1024*1024)}MB / {total_size // (1024*1024)}MB",
)
_call(log_info, f"[download] Downloaded successfully to: {output_path}")
return True
except Exception as exc: # pragma: no cover - defensive
_call(log_error, f"[download] Error: {exc}")
return False
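# Usage sketch (illustrative): search, then download the first hit's primary
# mirror. The query and output path are placeholders.
if __name__ == "__main__":
    hits = search_libgen("example query", limit=5, log_info=print, log_error=print)
    if hits and hits[0].get("mirror_url"):
        extension = hits[0].get("extension") or "bin"
        ok = download_from_mirror(
            hits[0]["mirror_url"],
            Path("downloads") / f"book.{extension}",
            log_info=print,
            log_error=print,
        )
        print("download ok" if ok else "download failed")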

1395
helper/local_library.py Normal file

File diff suppressed because it is too large

70
helper/logger.py Normal file
View File

@@ -0,0 +1,70 @@
"""Unified logging utility for automatic file and function name tracking."""
import sys
import inspect
from pathlib import Path
_DEBUG_ENABLED = False
def set_debug(enabled: bool) -> None:
"""Enable or disable debug logging."""
global _DEBUG_ENABLED
_DEBUG_ENABLED = enabled
def debug(*args, **kwargs) -> None:
"""Print debug message if debug logging is enabled.
Automatically prepends [filename.function_name] to all output.
"""
if not _DEBUG_ENABLED:
return
# Default debug messages to stderr
if 'file' not in kwargs:
kwargs['file'] = sys.stderr
# Resolve the caller here; delegating to log() would report logger.debug as the caller.
frame = inspect.currentframe()
caller = frame.f_back if frame is not None else None
try:
file_name = Path(caller.f_code.co_filename).stem if caller else "unknown"
func_name = caller.f_code.co_name if caller else "unknown"
print(f"[{file_name}.{func_name}]", "DEBUG:", *args, **kwargs)
finally:
del frame
del caller
def log(*args, **kwargs) -> None:
"""Print with automatic file.function prefix.
Automatically prepends [filename.function_name] to all output.
Defaults to stdout if not specified.
Example:
log("Upload started") # Output: [add_file.run] Upload started
"""
# Get the calling frame
frame = inspect.currentframe()
if frame is None:
print(*args, **kwargs)
return
caller_frame = frame.f_back
if caller_frame is None:
print(*args, **kwargs)
return
try:
# Get file name without extension
file_name = Path(caller_frame.f_code.co_filename).stem
# Get function name
func_name = caller_frame.f_code.co_name
# Set default to stdout if not specified
if 'file' not in kwargs:
kwargs['file'] = sys.stdout
# Build prefix
prefix = f"[{file_name}.{func_name}]"
# Print with prefix
print(prefix, *args, **kwargs)
finally:
del frame
del caller_frame
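# Usage sketch (illustrative): callers import these helpers instead of print();
# the prefix is derived automatically from the calling file and function.
if __name__ == "__main__":
    set_debug(True)
    log("upload started")          # e.g. [logger.<module>] upload started
    debug("payload size:", 1024)   # e.g. [logger.<module>] DEBUG: payload size: 1024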

951
helper/mpv_file.py Normal file
View File

@@ -0,0 +1,951 @@
"""MPV file metadata aggregation helpers."""
from __future__ import annotations
import os
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence
from urllib.parse import parse_qs, urlparse, unquote
from config import get_hydrus_url
from helper.utils import sha256_file, unique_preserve_order
from helper.hydrus import HydrusClient, HydrusRequestError
import metadata
class MPVFileError(RuntimeError):
"""Raised when we cannot construct an MPV file snapshot."""
@dataclass(slots=True)
class DebridMagnet:
"""Represents a magnet result from AllDebrid search.
This class matches the structure expected by the TUI (like Hydrus results)
with title, target, media_kind attributes for compatibility.
"""
magnet_id: str
title: str
size: int
status_code: int
status_text: str
progress: float
downloaded: int
seeders: int
dl_speed: int
tag_summary: Optional[str] = None
metadata: Optional[Dict[str, Any]] = None # Complete magnet file metadata from AllDebrid API
@property
def target(self) -> str:
"""Return the target URI for this magnet (used by TUI for access operations)."""
return f"alldebrid://{self.magnet_id}"
@property
def media_kind(self) -> str:
"""Return media kind for display."""
return "magnet"
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for metadata display."""
return {
"magnet_id": self.magnet_id,
"title": self.title,
"size": self.size,
"status_code": self.status_code,
"status_text": self.status_text,
"progress": f"{self.progress:.1f}%",
"downloaded": self.downloaded,
"seeders": self.seeders,
"dl_speed": self.dl_speed,
}
@dataclass(slots=True)
class HydrusSettings:
base_url: Optional[str]
access_key: Optional[str]
timeout: float
prefer_service_name: Optional[str]
include_relationships: bool
def as_metadata_options(self) -> Dict[str, Any]:
options: Dict[str, Any] = {
"timeout": self.timeout,
"include_relationships": self.include_relationships,
}
if self.prefer_service_name:
options["prefer_service_name"] = self.prefer_service_name
return options
@dataclass(slots=True)
class MPVfile:
path: Optional[str] = None
filename: Optional[str] = None
type: str = "unknown"
hash: Optional[str] = None
local_path: Optional[str] = None
mpv_metadata: Dict[str, Any] = field(default_factory=dict)
metadata: Dict[str, Any] = field(default_factory=dict)
remote_metadata: Optional[Dict[str, Any]] = None
relationships: Optional[Dict[str, Any]] = None
relationship_metadata: Dict[str, Any] = field(default_factory=dict)
tags: List[str] = field(default_factory=list)
original_tags: Dict[str, str] = field(default_factory=dict)
known_urls: List[str] = field(default_factory=list)
title: Optional[str] = None
source_url: Optional[str] = None
clip_time: Optional[str] = None
duration: Optional[float] = None
filesize_mb: Optional[float] = None
is_video: bool = False
is_audio: bool = False
is_deleted: Optional[bool] = None
is_local: Optional[bool] = None
has_current_file_service: Optional[bool] = None
tag_service_key: Optional[str] = None
swap_recommended: bool = False
warnings: List[str] = field(default_factory=list)
# New relationship fields for menu
king: Optional[str] = None
alts: List[str] = field(default_factory=list)
def to_dict(self) -> Dict[str, Any]:
payload: Dict[str, Any] = {
"path": self.path,
"filename": self.filename,
"type": self.type,
"hash": self.hash,
"local_path": self.local_path,
"mpv_metadata": self.mpv_metadata,
"metadata": self.metadata,
"remote_metadata": self.remote_metadata,
"relationships": self.relationships,
"relationship_metadata": self.relationship_metadata,
"tags": self.tags,
"original_tags": self.original_tags,
"known_urls": self.known_urls,
"title": self.title,
"source_url": self.source_url,
"clip_time": self.clip_time,
"duration": self.duration,
"filesize_mb": self.filesize_mb,
"is_video": self.is_video,
"is_audio": self.is_audio,
"is_deleted": self.is_deleted,
"is_local": self.is_local,
"has_current_file_service": self.has_current_file_service,
"tag_service_key": self.tag_service_key,
"swap_recommended": self.swap_recommended,
"warnings": self.warnings,
# relationship summary fields for easier Lua consumption
"king": self.king,
"alts": self.alts,
}
# Remove empty optional values for terser payloads.
for key in list(payload.keys()):
value = payload[key]
if value in (None, [], {}, ""):
del payload[key]
return payload
def _normalise_string_list(values: Optional[Iterable[Any]]) -> List[str]:
if not values:
return []
seen: set[str] = set()
result: List[str] = []
for value in values:
if value is None:
continue
text = str(value).strip()
if not text or text in seen:
continue
seen.add(text)
result.append(text)
return result
def _looks_like_hash(value: Optional[str]) -> bool:
if not value:
return False
candidate = value.strip().lower()
return len(candidate) == 64 and all(ch in "0123456789abcdef" for ch in candidate)
class MPVFileBuilder:
def __init__(self, payload: Dict[str, Any], config: Dict[str, Any]):
self.payload = payload or {}
self.config = config or {}
self.state = MPVfile()
self.hydrus_settings = self._resolve_hydrus_settings()
self.remote_options = self._resolve_remote_options()
self.include_relationships = bool(self.payload.get("include_relationships", True))
self.last_url = self._normalise_url(self.payload.get("last_url"))
self._initialise_identity()
# ------------------------------------------------------------------
# public API
# ------------------------------------------------------------------
def build(self) -> Dict[str, Any]:
if self.state.type == "hydrus":
self._populate_hydrus_by_hash()
elif self.state.type == "local":
self._populate_local()
elif self.state.type == "remote":
self._populate_remote()
else:
# Attempt best effort resolution even for unknown types.
self._populate_local(best_effort=True)
self._finalise()
result = self.state.to_dict()
# Append King and Alts info to mpv_metadata for info menu
king = self.state.king
alts = self.state.alts
if king:
result.setdefault("mpv_metadata", {})["King"] = king
if alts:
result.setdefault("mpv_metadata", {})["Alts"] = ", ".join(alts)
return result
# ------------------------------------------------------------------
# configuration helpers
# ------------------------------------------------------------------
def _resolve_hydrus_settings(self) -> HydrusSettings:
overrides = self.payload.get("hydrus")
overrides = overrides if isinstance(overrides, dict) else {}
base_url = overrides.get("url") or overrides.get("base_url")
access_key = overrides.get("access_key")
timeout_raw = overrides.get("timeout") or overrides.get("hydrus_timeout")
prefer_service = overrides.get("prefer_service_name")
include_relationships = overrides.get("include_relationships")
if base_url is None:
base_url = get_hydrus_url(self.config)
if access_key is None:
raw_key = self.config.get("HydrusNetwork_Access_Key")
access_key = str(raw_key) if raw_key is not None else None
if timeout_raw is None:
timeout_raw = self.config.get("HydrusNetwork_Request_Timeout")
try:
timeout = float(timeout_raw) if timeout_raw is not None else 60.0
except (TypeError, ValueError):
timeout = 60.0
if prefer_service is None:
prefer_service = self.config.get("Hydrus_Tag_Service")
if isinstance(prefer_service, str):
prefer_service = prefer_service.strip() or None
if include_relationships is None:
include_relationships = self.payload.get("include_relationships")
include_relationships = bool(True if include_relationships is None else include_relationships)
base_url = base_url.strip() if isinstance(base_url, str) else None
access_key = access_key.strip() if isinstance(access_key, str) else None
return HydrusSettings(
base_url=base_url or None,
access_key=access_key or None,
timeout=timeout,
prefer_service_name=prefer_service,
include_relationships=include_relationships,
)
def _resolve_remote_options(self) -> Dict[str, Any]:
remote_payload = self.payload.get("remote")
remote_payload = remote_payload if isinstance(remote_payload, dict) else {}
options = remote_payload.get("options")
options = options if isinstance(options, dict) else {}
ytdlp_args = options.get("ytdlp_args")
if not ytdlp_args:
options["ytdlp_args"] = ["--no-playlist", "--skip-download", "--no-warnings"]
existing_timeout = options.get("timeout")
if existing_timeout is None:
options["timeout"] = min(90.0, max(10.0, float(self.payload.get("remote_timeout") or 45.0)))
return options
# ------------------------------------------------------------------
# initialisation
# ------------------------------------------------------------------
def _initialise_identity(self) -> None:
s = self.state
p = self.payload
def _str_or_none(v):
return str(v) if v is not None and v != "" else None
def _copy_dict_if_dict(v):
return dict(v) if isinstance(v, dict) else {}
# path and filename
s.path = _str_or_none(p.get("path"))
s.filename = _str_or_none(p.get("filename"))
# mpv metadata
s.mpv_metadata = _copy_dict_if_dict(p.get("mpv_metadata"))
# tags (support both "tags" and legacy "existing_tags")
existing_tags = p.get("tags") or p.get("existing_tags")
s.tags = _normalise_string_list(existing_tags)
if s.tags:
s.original_tags = {tag: tag for tag in s.tags}
# known URLs + last_url
s.known_urls = _normalise_string_list(p.get("known_urls"))
if self.last_url and self.last_url not in s.known_urls:
s.known_urls.append(self.last_url)
# source URL (explicit or fallback to last_url)
explicit_source = p.get("source_url")
s.source_url = self._normalise_url(explicit_source) or self.last_url
# hash (validate looks-like-hash)
hash_candidate = p.get("hash")
if isinstance(hash_candidate, str):
candidate = hash_candidate.strip().lower()
if _looks_like_hash(candidate):
s.hash = candidate
# local_path (non-empty string)
local_path_override = p.get("local_path")
if isinstance(local_path_override, str):
lp = local_path_override.strip()
if lp:
s.local_path = lp
# derive remaining fields from path/filename/type
self._derive_filename_from_path()
self._determine_type()
def _derive_filename_from_path(self) -> None:
if self.state.filename or not self.state.path:
return
parsed = urlparse(self.state.path)
if parsed.scheme in ("http", "https", "ytdl") and parsed.path:
candidate = Path(parsed.path).name
if candidate:
self.state.filename = candidate
elif parsed.scheme == "file":
decoded = self._decode_file_url(self.state.path)
if decoded:
self.state.filename = Path(decoded).name
else:
try:
self.state.filename = Path(self.state.path).name
except Exception:
pass
def _determine_type(self) -> None:
s = self.state
p = self.payload
def _set_local_from_path(pth: str | None):
if not pth:
return
# Prefer resolved local path when available
resolved = self._resolve_local_path(pth)
s.local_path = resolved if resolved else pth
s.type = "local"
# 1) Respect explicit type when valid
explicit = p.get("type")
if isinstance(explicit, str):
lowered = explicit.strip().lower()
if lowered in {"local", "hydrus", "remote"}:
s.type = lowered
if lowered == "local":
s.local_path = self._resolve_local_path(s.path)
return
# 2) Work from path
path = s.path or ""
if not path:
s.type = "unknown"
return
# 3) Hydrus-specific quick checks
if self._looks_like_hydrus_url(path):
s.type = "hydrus"
return
parsed = urlparse(path)
scheme = (parsed.scheme or "").lower()
# 4) scheme-based handling
if scheme == "hydrus":
s.type = "hydrus"
return
if scheme in {"http", "https", "rtmp", "rtsp", "magnet", "ytdl"}:
s.type = "hydrus" if self._looks_like_hydrus_url(path) else "remote"
return
if scheme == "file":
decoded = self._decode_file_url(path)
if decoded:
s.local_path = decoded
s.type = "local"
return
# 5) Windows/UNC absolute paths
if re.match(r"^[A-Za-z]:[\\/]", path) or path.startswith(("\\\\", "//")):
s.type = "local"
s.local_path = path
return
# 6) Fallback: if it looks like a URL with a scheme separator treat as remote/hydrus
if "://" in path:
s.type = "hydrus" if self._looks_like_hydrus_url(path) else "remote"
return
# 7) Otherwise treat as a local path
_set_local_from_path(path)
# ------------------------------------------------------------------
# population helpers
# ------------------------------------------------------------------
def _populate_local(self, best_effort: bool = False) -> None:
local_path = self.state.local_path or self._resolve_local_path(self.state.path)
if local_path:
self.state.local_path = local_path
self._load_sidecar_tags(local_path)
if not self.state.hash:
self._compute_local_hash(local_path)
# If Hydrus is configured and we have a hash, enrich from Hydrus; otherwise keep local tags only
if self.state.hash and self.hydrus_settings.base_url and self.hydrus_settings.access_key:
self._populate_hydrus_by_hash()
elif best_effort and self.hydrus_settings.base_url and self.state.source_url and self.hydrus_settings.access_key:
self._populate_hydrus_by_url(self.state.source_url)
# (helpers for resolving local path and loading sidecars already exist below)
def _populate_remote(self) -> None:
source_url = self.state.source_url or self.last_url or self.state.path
source_url = self._normalise_url(source_url)
if source_url:
self.state.source_url = source_url
remote_payload = {
"source_url": self.state.source_url,
"existing_tags": self.state.tags,
"metadata": self.payload.get("remote_metadata"),
"mpv_metadata": self.state.mpv_metadata,
"options": self.remote_options,
}
try:
remote_result = metadata.resolve_remote_metadata(remote_payload)
except Exception as exc: # pragma: no cover - surfaced to the caller
self.state.warnings.append(str(exc))
remote_result = None
if remote_result:
tags = remote_result.get("tags") or []
self._merge_tags(tags)
self.state.remote_metadata = remote_result.get("metadata")
self.state.title = remote_result.get("title") or self.state.title
self.state.duration = remote_result.get("duration") or self.state.duration
self.state.source_url = remote_result.get("source_url") or self.state.source_url
warnings = remote_result.get("warnings") or []
if warnings:
self.state.warnings.extend(warnings)
if self.hydrus_settings.base_url and self.state.source_url:
self._populate_hydrus_by_url(self.state.source_url)
def _populate_hydrus_by_hash(self) -> None:
hash_hex = self.state.hash or self._extract_hash_from_path(self.state.path)
if hash_hex and not _looks_like_hash(hash_hex):
hash_hex = None
if not hash_hex:
return
self.state.hash = hash_hex
if not self.hydrus_settings.base_url:
return
payload: Dict[str, Any] = {
"api_url": self.hydrus_settings.base_url,
"access_key": self.hydrus_settings.access_key or "",
"options": self.hydrus_settings.as_metadata_options(),
"hash": hash_hex,
}
try:
result = metadata.fetch_hydrus_metadata(payload)
except Exception as exc: # pragma: no cover - surfaced to caller
self.state.warnings.append(str(exc))
return
self._apply_hydrus_result(result)
# Enrich relationships using the dedicated Hydrus endpoint (robust GET)
if self.include_relationships and self.state.hash and self.hydrus_settings.base_url:
self._enrich_relationships_from_api(self.state.hash)
def _populate_hydrus_by_url(self, url: str) -> None:
if not self.hydrus_settings.base_url:
return
payload: Dict[str, Any] = {
"api_url": self.hydrus_settings.base_url,
"access_key": self.hydrus_settings.access_key or "",
"options": self.hydrus_settings.as_metadata_options(),
"url": url,
}
try:
result = metadata.fetch_hydrus_metadata_by_url(payload)
except Exception as exc: # pragma: no cover - surfaced to caller
self.state.warnings.append(str(exc))
return
if result.get("error") == "not_found":
self.state.warnings.extend(result.get("warnings") or [])
return
self._apply_hydrus_result(result)
self.state.type = "hydrus"
matched_url = result.get("matched_url") or result.get("url")
if matched_url and matched_url not in self.state.known_urls:
self.state.known_urls.append(matched_url)
# Enrich relationships once we know the hash
if self.include_relationships and self.state.hash and self.hydrus_settings.base_url:
self._enrich_relationships_from_api(self.state.hash)
# ------------------------------------------------------------------
# state modification helpers
# ------------------------------------------------------------------
def _apply_hydrus_result(self, result: Dict[str, Any]) -> None:
metadata_payload = result.get("metadata")
if isinstance(metadata_payload, dict):
# Process mime into type for Lua
mime = metadata_payload.get("mime")
if isinstance(mime, str):
if mime.startswith("video/"):
metadata_payload["type"] = "video"
elif mime.startswith("audio/"):
metadata_payload["type"] = "audio"
elif mime.startswith("image/"):
metadata_payload["type"] = "image"
else:
metadata_payload["type"] = "other"
self.state.metadata = metadata_payload
# Do NOT overwrite MPVfile.type with metadata.type
self._merge_known_urls(metadata_payload.get("known_urls") or metadata_payload.get("known_urls_set"))
source_url = metadata_payload.get("original_url") or metadata_payload.get("source_url")
if source_url and not self.state.source_url:
self.state.source_url = self._normalise_url(source_url)
# If file_relationships are embedded in metadata, capture as relationships when missing
if self.state.relationships is None:
embedded = metadata_payload.get("file_relationships")
if isinstance(embedded, dict) and embedded:
self.state.relationships = embedded
tags = result.get("tags") or []
self._merge_tags(tags)
hash_value = result.get("hash") or result.get("matched_hash")
if isinstance(hash_value, str) and _looks_like_hash(hash_value):
self.state.hash = hash_value.lower()
self.state.tag_service_key = result.get("tag_service_key") or self.state.tag_service_key
self.state.duration = result.get("duration") or self.state.duration
self.state.filesize_mb = result.get("filesize_mb") or self.state.filesize_mb
self.state.is_video = bool(result.get("is_video") or self.state.is_video)
self.state.is_audio = bool(result.get("is_audio") or self.state.is_audio)
if result.get("is_deleted") is not None:
self.state.is_deleted = bool(result.get("is_deleted"))
if result.get("is_local") is not None:
self.state.is_local = bool(result.get("is_local"))
if result.get("has_current_file_service") is not None:
self.state.has_current_file_service = bool(result.get("has_current_file_service"))
# Consolidate relationships from explicit result or embedded metadata
relationships_obj: Optional[Dict[str, Any]] = None
if isinstance(result.get("relationships"), dict):
relationships_obj = result["relationships"]
self.state.relationships = relationships_obj
elif isinstance(self.state.relationships, dict):
relationships_obj = self.state.relationships
# Helper to flatten any hashes from the relationships object
def _collect_hashes(obj: Any, acc: set[str]) -> None:
if obj is None:
return
if isinstance(obj, dict):
for v in obj.values():
_collect_hashes(v, acc)
elif isinstance(obj, (list, tuple, set)):
for v in obj:
_collect_hashes(v, acc)
elif isinstance(obj, str) and _looks_like_hash(obj):
acc.add(obj.lower())
# Derive king and alts robustly from available data
king: Optional[str] = None
alts: list[str] = []
# 1) Try direct king fields on relationships object
rels = relationships_obj or {}
if isinstance(rels, dict):
# Common variants
for key in ("king", "king_hash", "duplicate_king", "best", "best_hash"):
val = rels.get(key)
if isinstance(val, str) and _looks_like_hash(val):
king = val.lower()
break
if isinstance(val, list):
for h in val:
if isinstance(h, str) and _looks_like_hash(h):
king = h.lower()
break
if king:
break
# 2) Extract alternates from known fields: numeric "3" (clips), or textual synonyms
for alt_key in ("3", "alternates", "alts", "clips"):
val = rels.get(alt_key)
if isinstance(val, list):
for h in val:
if isinstance(h, str) and _looks_like_hash(h):
h_low = h.lower()
if not king or h_low != king:
alts.append(h_low)
# some APIs might nest
elif isinstance(val, dict):
tmp: set[str] = set()
_collect_hashes(val, tmp)
for h in sorted(tmp):
if not king or h != king:
alts.append(h)
# 3) Use relationship_metadata keys as additional alternates and king hint
rel_meta = result.get("relationship_metadata")
if isinstance(rel_meta, dict):
# prefer king candidate with no clip_time if not set
if not king:
for h, meta in rel_meta.items():
if isinstance(h, str) and _looks_like_hash(h) and isinstance(meta, dict):
if not meta.get("clip_time"):
king = h.lower()
break
for h in rel_meta.keys():
if isinstance(h, str) and _looks_like_hash(h):
h_low = h.lower()
if not king or h_low != king:
alts.append(h_low)
# 4) As a last resort, flatten all relationship hashes
if not alts and relationships_obj:
tmp: set[str] = set()
_collect_hashes(relationships_obj, tmp)
for h in sorted(tmp):
if not king or h != king:
alts.append(h)
# 5) Include current file when appropriate
if self.state.hash and (not king or self.state.hash != king) and self.state.hash not in alts:
alts.append(self.state.hash)
# 6) Sort alternates by clip start time when available
rel_meta_all = result.get("relationship_metadata") if isinstance(result.get("relationship_metadata"), dict) else {}
def _clip_start_for(h: str) -> float:
meta = rel_meta_all.get(h) if isinstance(rel_meta_all, dict) else None
clip = meta.get("clip_time") if isinstance(meta, dict) else None
if isinstance(clip, str):
m = re.match(r"^(\d+)-(\d+)$", clip)
if m:
try:
return float(m.group(1))
except Exception:
return float("inf")
return float("inf")
if alts:
# de-duplicate while preserving earliest clip time ordering
seen: set[str] = set()
alts = [h for h in sorted(alts, key=_clip_start_for) if (h not in seen and not seen.add(h))]
self.state.king = king
self.state.alts = alts
if isinstance(result.get("relationship_metadata"), dict):
self.state.relationship_metadata = result["relationship_metadata"]
self.state.title = result.get("title") or self.state.title
self.state.clip_time = result.get("clip_time") or self.state.clip_time
if result.get("swap_recommended"):
self.state.swap_recommended = True
warnings = result.get("warnings") or []
if warnings:
self.state.warnings.extend(warnings)
# ------------------------------------------------------------------
# relationships enrichment (Hydrus endpoint + alt metadata)
# ------------------------------------------------------------------
def _enrich_relationships_from_api(self, file_hash: str) -> None:
"""Fetch relationships for the given hash and enrich state's king/alts and alt metadata.
- Uses GET /manage_file_relationships/get_file_relationships?hash=...
- If alts exist, batch-fetch their metadata via GET /get_files/file_metadata?hashes=[...]
- Extracts title, duration, size, tags (cleaned: title: kept with namespace, others stripped)
"""
base_url = self.hydrus_settings.base_url or ""
access_key = self.hydrus_settings.access_key or ""
if not base_url:
return
try:
client = HydrusClient(base_url, access_key, timeout=self.hydrus_settings.timeout)
except Exception as exc: # pragma: no cover - construction should rarely fail
self.state.warnings.append(f"Hydrus client init failed: {exc}")
return
try:
rel_resp = client.get_file_relationships(file_hash)
except HydrusRequestError as hre: # pragma: no cover - surfaced but non-fatal
self.state.warnings.append(f"relationships api: {hre}")
return
except Exception as exc: # pragma: no cover
self.state.warnings.append(f"relationships api: {exc}")
return
rel_map = rel_resp.get("file_relationships") or {}
rel_obj = None
if isinstance(rel_map, dict):
rel_obj = rel_map.get(file_hash) or next((v for v in rel_map.values() if isinstance(v, dict)), None)
if isinstance(rel_obj, dict):
# Preserve the full relationships object
self.state.relationships = rel_obj
# Update king and alts from canonical fields
king = rel_obj.get("king")
alts = rel_obj.get("3") or []
if isinstance(king, str) and _looks_like_hash(king):
self.state.king = king.lower()
if isinstance(alts, list):
self.state.alts = [h.lower() for h in alts if isinstance(h, str) and _looks_like_hash(h)]
# Fetch alt metadata if we have alts
if not self.state.alts:
return
try:
meta_resp = client.fetch_file_metadata(
hashes=self.state.alts,
include_service_keys_to_tags=True,
include_duration=True,
include_size=True,
include_file_urls=False,
include_mime=False,
)
except HydrusRequestError as hre: # pragma: no cover
self.state.warnings.append(f"metadata api: {hre}")
return
except Exception as exc: # pragma: no cover
self.state.warnings.append(f"metadata api: {exc}")
return
if not isinstance(meta_resp, dict):
return
entries = meta_resp.get("metadata") or []
if not isinstance(entries, list):
return
def _extract_tags(meta: Dict[str, Any]) -> list[str]:
tags: list[str] = []
tag_root = meta.get("tags") or meta.get("service_keys_to_statuses_to_tags") or {}
if isinstance(tag_root, dict):
for service_dict in tag_root.values():
if not isinstance(service_dict, dict):
continue
# Prefer storage_tags but fall back to any list values under known keys
storage = service_dict.get("storage_tags")
if isinstance(storage, dict):
for vals in storage.values():
if isinstance(vals, list):
tags.extend([str(t) for t in vals if isinstance(t, str)])
else:
# fall back: inspect lists directly under service_dict
for vals in service_dict.values():
if isinstance(vals, list):
tags.extend([str(t) for t in vals if isinstance(t, str)])
return tags
def _clean_tags_and_title(all_tags: list[str]) -> tuple[Optional[str], list[str]]:
title_val: Optional[str] = None
cleaned: list[str] = []
for tag in all_tags:
if not isinstance(tag, str):
continue
if tag.startswith("title:"):
if title_val is None:
title_val = tag.split(":", 1)[1]
cleaned.append(tag) # keep namespaced title
else:
if ":" in tag:
cleaned.append(tag.split(":", 1)[1])
else:
cleaned.append(tag)
return title_val, cleaned
for meta in entries:
if not isinstance(meta, dict):
continue
h = meta.get("hash")
if not (isinstance(h, str) and _looks_like_hash(h)):
continue
tags_all = _extract_tags(meta)
title_val, tags_clean = _clean_tags_and_title(tags_all)
alt_info = {
"title": title_val,
"duration": meta.get("duration"),
"size": meta.get("size"),
"tags": tags_clean,
}
self.state.relationship_metadata[h.lower()] = alt_info
def _merge_tags(self, tags: Sequence[Any]) -> None:
incoming = _normalise_string_list(tags)
if not incoming:
return
combined = list(self.state.tags or []) + incoming
self.state.tags = unique_preserve_order(combined)
for tag in incoming:
if tag not in self.state.original_tags:
self.state.original_tags[tag] = tag
def _merge_known_urls(self, urls: Optional[Iterable[Any]]) -> None:
if not urls:
return
combined = list(self.state.known_urls or []) + _normalise_string_list(urls)
self.state.known_urls = unique_preserve_order(combined)
def _load_sidecar_tags(self, local_path: str) -> None:
try:
media_path = Path(local_path)
except Exception:
return
if not media_path.exists():
return
candidates = [media_path.with_suffix(".tags"), media_path.with_suffix(".tags.txt")]
for candidate in candidates:
if candidate.exists():
hash_value, tags, known = self._read_sidecar(candidate)
if hash_value and not self.state.hash and _looks_like_hash(hash_value):
self.state.hash = hash_value.lower()
self._merge_tags(tags)
self._merge_known_urls(known)
break
def _read_sidecar(self, sidecar_path: Path) -> tuple[Optional[str], List[str], List[str]]:
try:
raw = sidecar_path.read_text(encoding="utf-8", errors="ignore")
except OSError:
return None, [], []
hash_value: Optional[str] = None
tags: List[str] = []
known_urls: List[str] = []
for line in raw.splitlines():
trimmed = line.strip()
if not trimmed:
continue
lowered = trimmed.lower()
if lowered.startswith("hash:"):
candidate = trimmed.split(":", 1)[1].strip() if ":" in trimmed else ""
if candidate:
hash_value = candidate
elif lowered.startswith("known_url:") or lowered.startswith("url:"):
candidate = trimmed.split(":", 1)[1].strip() if ":" in trimmed else ""
if candidate:
known_urls.append(candidate)
else:
tags.append(trimmed)
return hash_value, tags, known_urls
def _compute_local_hash(self, local_path: str) -> None:
try:
digest = sha256_file(Path(local_path))
except OSError as exc:
self.state.warnings.append(f"sha256 failed: {exc}")
return
self.state.hash = digest.lower()
# ------------------------------------------------------------------
# finalisation helpers
# ------------------------------------------------------------------
def _finalise(self) -> None:
if self.state.tags:
self.state.tags = unique_preserve_order(self.state.tags)
if self.state.known_urls:
self.state.known_urls = unique_preserve_order(self.state.known_urls)
# Derive a display title from metadata or filename, but do NOT overwrite an explicit title
if not self.state.title:
if self.state.metadata.get("title"):
self.state.title = str(self.state.metadata["title"]).strip()
elif self.state.filename:
self.state.title = self.state.filename
if self.state.hash and not _looks_like_hash(self.state.hash):
self.state.hash = None
if self.state.relationship_metadata is None:
self.state.relationship_metadata = {}
if self.state.relationships is not None and not isinstance(self.state.relationships, dict):
self.state.relationships = None
if self.state.original_tags is None:
self.state.original_tags = {}
# ------------------------------------------------------------------
# util helpers
# ------------------------------------------------------------------
@staticmethod
def _normalise_url(value: Any) -> Optional[str]:
if value is None:
return None
text = str(value).strip()
if not text:
return None
return text
@staticmethod
def _resolve_local_path(path: Optional[str]) -> Optional[str]:
if not path:
return None
parsed = urlparse(path)
if parsed.scheme == "file":
decoded = MPVFileBuilder._decode_file_url(path)
return decoded
return path
@staticmethod
def _decode_file_url(value: str) -> Optional[str]:
parsed = urlparse(value)
if parsed.scheme != "file":
return None
netloc = parsed.netloc or ""
path = unquote(parsed.path or "")
if netloc:
path = f"//{netloc}{path}"
if os.name == "nt" and path.startswith("/") and re.match(r"/[A-Za-z]:", path):
path = path[1:]
path = path.replace("/", os.sep)
return path
def _looks_like_hydrus_url(self, url: str) -> bool:
if not url:
return False
if url.startswith("hydrus://"):
return True
if "Hydrus-Client-API-Access-Key=" in url:
return True
base = self.hydrus_settings.base_url
if base and url.startswith(base) and "/get_files/" in url:
return True
return False
@staticmethod
def _extract_hash_from_path(path: Optional[str]) -> Optional[str]:
if not path:
return None
parsed = urlparse(path)
query = parse_qs(parsed.query)
if "hash" in query and query["hash"]:
candidate = query["hash"][0].strip()
if candidate:
return candidate.lower()
match = re.search(r"hash=([0-9a-fA-F]{64})", path)
if match:
return match.group(1).lower()
return None
def build_mpv_file_state(payload: Dict[str, Any], config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
builder = MPVFileBuilder(payload or {}, config or {})
return builder.build()
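# --- Usage sketch (illustrative only) ---------------------------------------
# A minimal sketch of calling build_mpv_file_state(); the payload keys shown
# here ("path", "metadata") and the empty config are assumptions for the
# example, not a documented contract - a real payload comes from the Lua side
# of the mpv integration.
def _example_build_state() -> Dict[str, Any]:
    payload = {
        "path": "/tmp/example.mp4",            # hypothetical local file
        "metadata": {"title": "Example clip"},
    }
    return build_mpv_file_state(payload, config={})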

143
helper/progress.py Normal file
View File

@@ -0,0 +1,143 @@
#!/usr/bin/env python3
"""Text-based progress bar utilities for consistent display across all downloads."""
import sys
from helper.logger import log, debug
def format_progress_bar(current: int, total: int, width: int = 40, label: str = "") -> str:
"""Create a text-based progress bar.
Args:
current: Current progress (bytes/items)
total: Total to complete (bytes/items)
width: Width of the bar in characters (default 40)
label: Optional label prefix
Returns:
Formatted progress bar string
Examples:
format_progress_bar(50, 100)
# Returns: "[████████████████░░░░░░░░░░░░░░░░░░░░] 50.0%"
format_progress_bar(256*1024*1024, 1024*1024*1024, label="download.zip")
# Returns: "download.zip: [████████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 25.0%"
"""
if total <= 0:
percentage = 0
filled = 0
else:
percentage = (current / total) * 100
filled = int((current / total) * width)
# Create bar: filled blocks + empty blocks
bar = "" * filled + "" * (width - filled)
# Format percentage
pct_str = f"{percentage:.1f}%"
# Build result
if label:
result = f"{label}: [{bar}] {pct_str}"
else:
result = f"[{bar}] {pct_str}"
return result
def format_size(bytes_val: float) -> str:
"""Format bytes to human-readable size.
Examples:
format_size(1024) -> "1.00 KB"
format_size(1024*1024) -> "1.00 MB"
format_size(1024*1024*1024) -> "1.00 GB"
"""
for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
if bytes_val < 1024:
return f"{bytes_val:.2f} {unit}"
bytes_val /= 1024
return f"{bytes_val:.2f} PB"
def format_download_status(filename: str, current: int, total: int, speed: float = 0) -> str:
"""Format download status with progress bar and details.
Args:
filename: Name of file being downloaded
current: Current bytes downloaded
total: Total file size
speed: Download speed in bytes/sec
Returns:
Formatted status line
Examples:
format_download_status("movie.mkv", 512*1024*1024, 2*1024*1024*1024, 10*1024*1024)
# Returns: "movie.mkv: [████████████░░░░░░░░░░░░░░░░░░░░░░░░░░] 25.0% (512.00 MB / 2.00 GB @ 10.00 MB/s)"
"""
bar = format_progress_bar(current, total, width=30, label=filename)
size_current = format_size(current)
size_total = format_size(total)
if speed > 0:
speed_str = f" @ {format_size(speed)}/s"
else:
speed_str = ""
return f"{bar} ({size_current} / {size_total}{speed_str})"
def print_progress(filename: str, current: int, total: int, speed: float = 0, end: str = "\r") -> None:
"""Print download progress to stderr (doesn't interfere with piped output).
Args:
filename: File being downloaded
current: Current bytes
total: Total bytes
speed: Speed in bytes/sec
end: Line ending (default "\r" for overwriting, use "\n" for final)
"""
status = format_download_status(filename, current, total, speed)
debug(status, end=end, flush=True)
def print_final_progress(filename: str, total: int, elapsed: float) -> None:
"""Print final progress line (100%) with time elapsed.
Args:
filename: File that was downloaded
total: Total size
elapsed: Time elapsed in seconds
"""
bar = format_progress_bar(total, total, width=30, label=filename)
size_str = format_size(total)
# Format elapsed time
if elapsed < 60:
time_str = f"{elapsed:.1f}s"
elif elapsed < 3600:
minutes = elapsed / 60
time_str = f"{minutes:.1f}m"
else:
hours = elapsed / 3600
time_str = f"{hours:.2f}h"
debug(f"{bar} ({size_str}) - {time_str}")
if __name__ == "__main__":
# Demo
import time
log("Progress Bar Demo:", file=sys.stderr)
# Demo 1: Simple progress
for i in range(101):
print_progress("demo.bin", i * 10 * 1024 * 1024, 1024 * 1024 * 1024)
time.sleep(0.02)
print_final_progress("demo.bin", 1024 * 1024 * 1024, 2.0)
log()

159
helper/query_parser.py Normal file
View File

@@ -0,0 +1,159 @@
"""Dynamic query parser for filtering and field extraction.
Supports query syntax like:
- isbn:0557677203
- author:"Albert Pike"
- title:"Morals and Dogma"
- year:2010
- isbn:0557677203 author:"Albert Pike"
- Mixed with free text: "Morals" isbn:0557677203
This allows flexible query strings that can be parsed by any search provider
to extract specific fields for filtering and searching.
"""
from typing import Dict, List, Tuple, Optional, Any
import re
def parse_query(query: str) -> Dict[str, Any]:
"""Parse a query string into field:value pairs and free text.
Args:
query: Query string like 'isbn:0557677203 author:"Albert Pike" Morals'
Returns:
Dictionary with:
- 'fields': Dict[field_name, field_value] for structured fields
- 'text': str with remaining free text
- 'raw': str original query
"""
result = {
'fields': {},
'text': '',
'raw': query,
}
if not query or not query.strip():
return result
query = query.strip()
remaining_parts = []
# Pattern to match: field:value or field:"quoted value"
# Matches: word: followed by either quoted string or unquoted word
pattern = r'(\w+):(?:"([^"]*)"|(\S+))'
pos = 0
for match in re.finditer(pattern, query):
# Add any text before this match
if match.start() > pos:
before_text = query[pos:match.start()].strip()
if before_text:
remaining_parts.append(before_text)
field_name = match.group(1).lower()
field_value = match.group(2) if match.group(2) is not None else match.group(3)
result['fields'][field_name] = field_value
pos = match.end()
# Add any remaining text after last match
if pos < len(query):
remaining_text = query[pos:].strip()
if remaining_text:
remaining_parts.append(remaining_text)
result['text'] = ' '.join(remaining_parts)
return result
def get_field(parsed_query: Dict[str, Any], field_name: str, default: Optional[str] = None) -> Optional[str]:
"""Get a field value from parsed query, with optional default.
Args:
parsed_query: Result from parse_query()
field_name: Field name to look up (case-insensitive)
default: Default value if field not found
Returns:
Field value or default
"""
return parsed_query.get('fields', {}).get(field_name.lower(), default)
def has_field(parsed_query: Dict[str, Any], field_name: str) -> bool:
"""Check if a field exists in parsed query.
Args:
parsed_query: Result from parse_query()
field_name: Field name to check (case-insensitive)
Returns:
True if field exists
"""
return field_name.lower() in parsed_query.get('fields', {})
def get_free_text(parsed_query: Dict[str, Any]) -> str:
"""Get the free text portion of a parsed query.
Args:
parsed_query: Result from parse_query()
Returns:
Free text or empty string
"""
return parsed_query.get('text', '')
def build_query_for_provider(
parsed_query: Dict[str, Any],
provider: str,
extraction_map: Optional[Dict[str, str]] = None
) -> Tuple[str, Dict[str, str]]:
"""Build a search query and filters dict for a specific provider.
Different providers have different search syntax. This function
extracts the appropriate fields for each provider.
Args:
parsed_query: Result from parse_query()
provider: Provider name ('libgen', 'openlibrary', 'soulseek')
extraction_map: Optional mapping of field names to provider-specific names
e.g. {'isbn': 'isbn', 'author': 'author', 'title': 'title'}
Returns:
Tuple of (search_query: str, extracted_fields: Dict[field, value])
"""
extraction_map = extraction_map or {}
extracted = {}
free_text = get_free_text(parsed_query)
# Extract fields based on map
for field_name, provider_key in extraction_map.items():
if has_field(parsed_query, field_name):
extracted[provider_key] = get_field(parsed_query, field_name)
# If provider-specific extraction needed, providers can implement it
# For now, return the free text as query
return free_text, extracted
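# --- Illustrative example ----------------------------------------------------
# A small sketch of build_query_for_provider(); the provider label 'libgen' is
# only an example value here (no provider-specific logic is applied), and the
# extraction_map mirrors the mapping shown in the docstring above.
def _example_provider_query() -> None:
    parsed = parse_query('title:"Morals and Dogma" author:"Albert Pike" esoterica')
    query, fields = build_query_for_provider(
        parsed,
        provider='libgen',
        extraction_map={'isbn': 'isbn', 'author': 'author', 'title': 'title'},
    )
    # query == 'esoterica'
    # fields == {'author': 'Albert Pike', 'title': 'Morals and Dogma'}
    print(query, fields)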
if __name__ == '__main__':
# Test cases
test_queries = [
'isbn:0557677203',
'isbn:0557677203 author:"Albert Pike"',
'Morals and Dogma isbn:0557677203',
'title:"Morals and Dogma" author:"Albert Pike" year:2010',
'search term without fields',
'author:"John Smith" title:"A Book"',
]
for query in test_queries:
print(f"\nQuery: {query}")
parsed = parse_query(query)
print(f" Fields: {parsed['fields']}")
print(f" Text: {parsed['text']}")

1777
helper/search_provider.py Normal file

File diff suppressed because it is too large Load Diff

155
helper/tasks.py Normal file
View File

@@ -0,0 +1,155 @@
"""Background task handling and IPC helpers for mpv integration."""
from __future__ import annotations
import errno
import json
import os
import socket
import subprocess
import sys
from helper.logger import log
import threading
import time
from typing import IO, Iterable
def connect_ipc(path: str, timeout: float = 5.0) -> IO[bytes] | None:
"""Connect to the mpv IPC server located at *path*."""
deadline = time.time() + timeout
if not path:
return None
if os.name == 'nt':
# mpv exposes a named pipe on Windows. Keep retrying until it is ready.
while True:
try:
return open(path, 'r+b', buffering=0)
except FileNotFoundError:
if time.time() > deadline:
return None
time.sleep(0.05)
except OSError as exc: # Pipe busy
if exc.errno not in (errno.ENOENT, errno.EPIPE, errno.EBUSY):
raise
if time.time() > deadline:
return None
time.sleep(0.05)
else:
sock = socket.socket(socket.AF_UNIX)
while True:
try:
sock.connect(path)
return sock.makefile('r+b', buffering=0)
except FileNotFoundError:
if time.time() > deadline:
return None
time.sleep(0.05)
except OSError as exc:
if exc.errno not in (errno.ENOENT, errno.ECONNREFUSED):
raise
if time.time() > deadline:
return None
time.sleep(0.05)
def ipc_sender(ipc: IO[bytes] | None):
"""Create a helper function for sending script messages via IPC."""
if ipc is None:
def _noop(_event: str, _payload: dict) -> None:
return None
return _noop
lock = threading.Lock()
def _send(event: str, payload: dict) -> None:
message = json.dumps({'command': ['script-message', event, json.dumps(payload)]}, ensure_ascii=False)
encoded = message.encode('utf-8') + b'\n'
with lock:
try:
ipc.write(encoded)
ipc.flush()
except OSError:
pass
return _send
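# --- Illustrative sketch ------------------------------------------------------
# Shows how connect_ipc() and ipc_sender() compose; the socket path below is a
# placeholder for Unix-like systems, while on Windows mpv exposes a named pipe
# path instead. ipc_sender() degrades to a no-op when the connection fails.
def _example_notify() -> None:
    ipc = connect_ipc('/tmp/mpv-socket', timeout=1.0)   # hypothetical path
    send = ipc_sender(ipc)
    send('downlow-task-event', {'id': 'demo', 'event': 'start'})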
def iter_stream(stream: Iterable[str]) -> Iterable[str]:
for raw in stream:
yield raw.rstrip('\r\n')
def _run_task(args, parser) -> int:
if not args.command:
parser.error('run-task requires a command to execute (use "--" before the command).')
env = os.environ.copy()
for entry in args.env:
key, sep, value = entry.partition('=')
if not sep:
parser.error(f'Invalid environment variable definition: {entry!r}')
env[key] = value
command = list(args.command)
if command and command[0] == '--':
command.pop(0)
notifier = ipc_sender(connect_ipc(args.ipc, timeout=args.ipc_timeout))
if not command:
notifier('downlow-task-event', {
'id': args.task_id,
'event': 'error',
'message': 'No command provided after separator',
})
log('[downlow.py] No command provided for run-task', file=sys.stderr)
return 1
if command and isinstance(command[0], str) and sys.executable:
first = command[0].lower()
if first in {'python', 'python3', 'py', 'python.exe', 'python3.exe', 'py.exe'}:
command[0] = sys.executable
if os.environ.get('DOWNLOW_DEBUG'):
log(f"Launching command: {command}", file=sys.stderr)
notifier('downlow-task-event', {
'id': args.task_id,
'event': 'start',
'command': command,
'cwd': args.cwd or os.getcwd(),
})
try:
process = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
cwd=args.cwd or None,
env=env,
text=True,
bufsize=1,
universal_newlines=True,
)
except FileNotFoundError as exc:
notifier('downlow-task-event', {
'id': args.task_id,
'event': 'error',
'message': f'Executable not found: {exc.filename}',
})
log(f"{exc}", file=sys.stderr)
return 1
stdout_lines: list[str] = []
stderr_lines: list[str] = []
def pump(stream: IO[str], label: str, sink: list[str]) -> None:
for line in iter_stream(stream):
sink.append(line)
notifier('downlow-task-event', {
'id': args.task_id,
'event': label,
'line': line,
})
threads = []
if process.stdout:
t_out = threading.Thread(target=pump, args=(process.stdout, 'stdout', stdout_lines), daemon=True)
t_out.start()
threads.append(t_out)
if process.stderr:
t_err = threading.Thread(target=pump, args=(process.stderr, 'stderr', stderr_lines), daemon=True)
t_err.start()
threads.append(t_err)
return_code = process.wait()
for t in threads:
t.join(timeout=0.1)
notifier('downlow-task-event', {
'id': args.task_id,
'event': 'exit',
'returncode': return_code,
'success': return_code == 0,
})
# Also mirror aggregated output to stdout/stderr for compatibility when IPC is unavailable.
if stdout_lines:
log('\n'.join(stdout_lines))
if stderr_lines:
log('\n'.join(stderr_lines), file=sys.stderr)
return return_code

View File

@@ -0,0 +1,706 @@
"""Unified book downloader - handles Archive.org borrowing and Libgen fallback.
This module provides a single interface for downloading books from multiple sources:
1. Try Archive.org direct download (if available)
2. Try Archive.org borrowing (if user has credentials)
3. Fallback to Libgen search by ISBN
4. Attempt Libgen download
All sources integrated with proper metadata scraping and error handling.
"""
import logging
import asyncio
import requests
from typing import Optional, Dict, Any, Tuple, List, Callable, cast
from pathlib import Path
from helper.logger import debug
logger = logging.getLogger(__name__)
class UnifiedBookDownloader:
"""Unified interface for downloading books from multiple sources."""
def __init__(self, config: Optional[Dict[str, Any]] = None, output_dir: Optional[str] = None):
"""Initialize the unified book downloader.
Args:
config: Configuration dict with credentials
output_dir: Default output directory
"""
self.config = config or {}
self.output_dir = output_dir
self.session = requests.Session()
# Import download functions from their modules
self._init_downloaders()
def _init_downloaders(self) -> None:
"""Initialize downloader functions from their modules."""
try:
from helper.archive_client import (
check_direct_download,
get_openlibrary_by_isbn,
loan
)
self.check_direct_download = check_direct_download
self.get_openlibrary_by_isbn = get_openlibrary_by_isbn
self.loan_func = loan
logger.debug("[UnifiedBookDownloader] Loaded archive.org downloaders from archive_client")
except Exception as e:
logger.warning(f"[UnifiedBookDownloader] Failed to load archive.org functions: {e}")
self.check_direct_download = None
self.get_openlibrary_by_isbn = None
self.loan_func = None
try:
from helper.libgen_service import (
DEFAULT_LIMIT as _LIBGEN_DEFAULT_LIMIT,
download_from_mirror as _libgen_download,
search_libgen as _libgen_search,
)
def _log_info(message: str) -> None:
debug(f"[UnifiedBookDownloader] {message}")
def _log_error(message: str) -> None:
logger.error(f"[UnifiedBookDownloader] {message}")
self.search_libgen = lambda query, limit=_LIBGEN_DEFAULT_LIMIT: _libgen_search(
query,
limit=limit,
log_info=_log_info,
log_error=_log_error,
)
self.download_from_mirror = lambda mirror_url, output_path: _libgen_download(
mirror_url,
output_path,
log_info=_log_info,
log_error=_log_error,
)
logger.debug("[UnifiedBookDownloader] Loaded Libgen helpers")
except Exception as e:
logger.warning(f"[UnifiedBookDownloader] Failed to load Libgen helpers: {e}")
self.search_libgen = None
self.download_from_mirror = None
def get_download_options(self, book_data: Dict[str, Any]) -> Dict[str, Any]:
"""Get all available download options for a book.
Checks in priority order:
1. Archive.org direct download (public domain)
2. Archive.org borrowing (if credentials available and book is borrowable)
3. Libgen fallback (by ISBN)
Args:
book_data: Book metadata dict with at least 'openlibrary_id' or 'isbn'
Returns:
Dict with available download methods and metadata
"""
options = {
'book_title': book_data.get('title', 'Unknown'),
'book_author': book_data.get('author', 'Unknown'),
'isbn': book_data.get('isbn', ''),
'openlibrary_id': book_data.get('openlibrary_id', ''),
'methods': [], # Will be sorted by priority
'metadata': {}
}
# Extract book ID from openlibrary_id (e.g., OL8513721M -> 8513721, OL8513721W -> 8513721)
ol_id = book_data.get('openlibrary_id', '')
book_id = None
if ol_id.startswith('OL') and len(ol_id) > 2:
# Remove 'OL' prefix (keep everything after it including the suffix letter)
# The book_id is all digits after 'OL'
book_id = ''.join(c for c in ol_id[2:] if c.isdigit())
# PRIORITY 1: Check direct download (fastest, no auth needed)
if self.check_direct_download:
try:
can_download, pdf_url = self.check_direct_download(book_id)
if can_download:
options['methods'].append({
'type': 'archive.org_direct',
'label': 'Archive.org Direct Download',
'requires_auth': False,
'pdf_url': pdf_url,
'book_id': book_id,
'priority': 1 # Highest priority
})
logger.info(f"[UnifiedBookDownloader] Direct download available for {book_id}")
except Exception as e:
logger.debug(f"[UnifiedBookDownloader] Direct download check failed: {e}")
# PRIORITY 2: Check borrowing option (requires auth, 14-day loan)
# First verify the book is actually lendable via OpenLibrary API
if self._has_archive_credentials():
is_lendable, status = self._check_book_lendable_status(ol_id)
if is_lendable:
options['methods'].append({
'type': 'archive.org_borrow',
'label': 'Archive.org Borrow',
'requires_auth': True,
'book_id': book_id,
'priority': 2 # Second priority
})
logger.info(f"[UnifiedBookDownloader] Borrow option available for {book_id} (status: {status})")
else:
logger.debug(f"[UnifiedBookDownloader] Borrow not available for {book_id} (status: {status})")
# PRIORITY 3: Check Libgen fallback (by ISBN, no auth needed, most reliable)
isbn = book_data.get('isbn', '')
title = book_data.get('title', '')
author = book_data.get('author', '')
if self.search_libgen:
# Can use Libgen if we have ISBN OR title (or both)
if isbn or title:
options['methods'].append({
'type': 'libgen',
'label': 'Libgen Search & Download',
'requires_auth': False,
'isbn': isbn,
'title': title,
'author': author,
'priority': 3 # Third priority (fallback)
})
logger.info(f"[UnifiedBookDownloader] Libgen fallback available (ISBN: {isbn if isbn else 'N/A'}, Title: {title})")
# Sort by priority (higher priority first)
options['methods'].sort(key=lambda x: x.get('priority', 999))
return options
def _has_archive_credentials(self) -> bool:
"""Check if Archive.org credentials are available."""
try:
from helper.archive_client import credential_openlibrary
email, password = credential_openlibrary(self.config)
return bool(email and password)
except Exception:
return False
def _check_book_lendable_status(self, ol_id: str) -> Tuple[bool, Optional[str]]:
"""Check if a book is lendable via OpenLibrary API.
Queries: https://openlibrary.org/api/volumes/brief/json/OLID:{ol_id}
Note: Only works with Edition IDs (OL...M), not Work IDs (OL...W)
Args:
ol_id: OpenLibrary ID (e.g., OL8513721M for Edition or OL4801915W for Work)
Returns:
Tuple of (is_lendable: bool, status_reason: Optional[str])
"""
try:
if not ol_id.startswith('OL'):
return False, "Invalid OpenLibrary ID format"
# If this is a Work ID (ends with W), we can't query Volumes API
# Work IDs are abstract umbrella records, not specific editions
if ol_id.endswith('W'):
logger.debug(f"[UnifiedBookDownloader] Work ID {ol_id} - skipping Volumes API (not lendable)")
return False, "Work ID not supported by Volumes API (not a specific edition)"
# If it ends with M, it's an Edition ID - proceed with query
if not ol_id.endswith('M'):
logger.debug(f"[UnifiedBookDownloader] Unknown ID type {ol_id} (not M or W)")
return False, "Invalid OpenLibrary ID type"
url = f"https://openlibrary.org/api/volumes/brief/json/OLID:{ol_id}"
response = self.session.get(url, timeout=10)
response.raise_for_status()
data = response.json()
# Empty response means no records found
if not data:
logger.debug(f"[UnifiedBookDownloader] Empty response for {ol_id}")
return False, "No availability data found"
# The response is wrapped in OLID key
olid_key = f"OLID:{ol_id}"
if olid_key not in data:
logger.debug(f"[UnifiedBookDownloader] OLID key not found in response")
return False, "No availability data found"
olid_data = data[olid_key]
# Check items array for lendable status
if 'items' in olid_data and olid_data['items'] and len(olid_data['items']) > 0:
items = olid_data['items']
# Check the first item for lending status
first_item = items[0]
# Handle both dict and string representations (PowerShell converts to string)
if isinstance(first_item, dict):
status = first_item.get('status', '')
else:
# String representation - check if 'lendable' is in it
status = str(first_item).lower()
is_lendable = 'lendable' in str(status).lower()
if is_lendable:
logger.info(f"[UnifiedBookDownloader] Book {ol_id} is lendable")
return True, "LENDABLE"
else:
status_str = status.get('status', 'NOT_LENDABLE') if isinstance(status, dict) else 'NOT_LENDABLE'
logger.debug(f"[UnifiedBookDownloader] Book {ol_id} is not lendable (status: {status_str})")
return False, status_str
else:
# No items array or empty
logger.debug(f"[UnifiedBookDownloader] No items found for {ol_id}")
return False, "Not available for lending"
except requests.exceptions.Timeout:
logger.warning(f"[UnifiedBookDownloader] OpenLibrary API timeout for {ol_id}")
return False, "API timeout"
except Exception as e:
logger.debug(f"[UnifiedBookDownloader] Failed to check lendable status for {ol_id}: {e}")
return False, f"API error"
async def download_book(self, method: Dict[str, Any], output_dir: Optional[str] = None) -> Tuple[bool, str]:
"""Download a book using the specified method.
Args:
method: Download method dict from get_download_options()
output_dir: Directory to save the book
Returns:
Tuple of (success: bool, message: str)
"""
output_dir = output_dir or self.output_dir or str(Path.home() / "Downloads")
method_type = method.get('type', '')
logger.info(f"[UnifiedBookDownloader] Starting download with method: {method_type}")
try:
if method_type == 'archive.org_direct':
return await self._download_archive_direct(method, output_dir)
elif method_type == 'archive.org_borrow':
return await self._download_archive_borrow(method, output_dir)
elif method_type == 'libgen':
return await self._download_libgen(method, output_dir)
else:
return False, f"Unknown download method: {method_type}"
except Exception as e:
logger.error(f"[UnifiedBookDownloader] Download error: {e}", exc_info=True)
return False, f"Download failed: {str(e)}"
async def _download_archive_direct(self, method: Dict[str, Any], output_dir: str) -> Tuple[bool, str]:
"""Download directly from Archive.org."""
try:
pdf_url = method.get('pdf_url', '')
book_id = method.get('book_id', '')
if not pdf_url:
return False, "No PDF URL available"
# Determine output filename
filename = f"{book_id}.pdf"
output_path = Path(output_dir) / filename
logger.info(f"[UnifiedBookDownloader] Downloading PDF from: {pdf_url}")
# Download in a thread to avoid blocking
loop = asyncio.get_event_loop()
success = await loop.run_in_executor(
None,
self._download_file,
pdf_url,
str(output_path)
)
if success:
logger.info(f"[UnifiedBookDownloader] Successfully downloaded to: {output_path}")
return True, f"Downloaded to: {output_path}"
else:
return False, "Failed to download PDF"
except Exception as e:
logger.error(f"[UnifiedBookDownloader] Archive direct download error: {e}")
return False, f"Archive download failed: {str(e)}"
async def _download_archive_borrow(self, method: Dict[str, Any], output_dir: str) -> Tuple[bool, str]:
"""Download via Archive.org borrowing (requires credentials).
Process (follows archive_client.py pattern):
1. Login to Archive.org with credentials
2. Call loan endpoint to borrow the book (14-day loan)
3. Get book info (page links, metadata)
4. Download all pages as images
5. Merge images into PDF
The loan function from archive_client.py handles:
- Checking if book needs borrowing (status 400 = "doesn't need to be borrowed")
- Creating borrow token for access
- Handling borrow failures
get_book_infos() extracts page links from the borrowed book viewer
download() downloads all pages using thread pool
img2pdf merges pages into searchable PDF
"""
try:
from helper.archive_client import credential_openlibrary
book_id = method.get('book_id', '')
# Get credentials
email, password = credential_openlibrary(self.config)
if not email or not password:
return False, "Archive.org credentials not configured"
logger.info(f"[UnifiedBookDownloader] Logging into Archive.org...")
# Login and borrow (in thread, following download_book.py pattern)
loop = asyncio.get_event_loop()
borrow_result = await loop.run_in_executor(
None,
self._archive_borrow_and_download,
email,
password,
book_id,
output_dir
)
if borrow_result and isinstance(borrow_result, tuple):
success, filepath = borrow_result
if success:
logger.info(f"[UnifiedBookDownloader] Borrow succeeded: {filepath}")
return True, filepath
else:
logger.warning(f"[UnifiedBookDownloader] Borrow failed: {filepath}")
return False, filepath
else:
return False, "Failed to borrow book from Archive.org"
except Exception as e:
logger.error(f"[UnifiedBookDownloader] Archive borrow error: {e}")
return False, f"Archive borrow failed: {str(e)}"
async def _download_libgen(self, method: Dict[str, Any], output_dir: str) -> Tuple[bool, str]:
"""Download via Libgen search and download with mirror fallback."""
try:
isbn = method.get('isbn', '')
title = method.get('title', '')
if not isbn and not title:
return False, "Need ISBN or title for Libgen search"
if not self.search_libgen:
return False, "Libgen searcher not available"
# Define wrapper functions to safely call the methods
search_func = self.search_libgen
if search_func is None:
return False, "Search function not available"
preloaded_results = method.get('results')
loop = asyncio.get_event_loop()
if preloaded_results:
results = list(preloaded_results)
if not results:
results = await loop.run_in_executor(None, lambda: search_func(isbn or title, 10))
else:
results = await loop.run_in_executor(None, lambda: search_func(isbn or title, 10))
if not results:
logger.warning(f"[UnifiedBookDownloader] No Libgen results for: {isbn or title}")
return False, f"No Libgen results found for: {isbn or title}"
logger.info(f"[UnifiedBookDownloader] Found {len(results)} Libgen results")
# Determine output filename (use first result for naming)
first_result = results[0]
filename = f"{first_result.get('title', 'book')}"
filename = "".join(c for c in filename if c.isalnum() or c in (' ', '.', '-'))[:100]
# Try each result's mirror until one succeeds
for idx, result in enumerate(results, 1):
mirror_url = result.get('mirror_url', '')
if not mirror_url:
logger.debug(f"[UnifiedBookDownloader] Result {idx}: No mirror URL")
continue
# Use extension from this result if available
extension = result.get('extension', 'pdf')
if extension and not extension.startswith('.'):
extension = f".{extension}"
elif not extension:
extension = '.pdf'
output_path = Path(output_dir) / (filename + extension)
logger.info(f"[UnifiedBookDownloader] Trying mirror {idx}/{len(results)}: {mirror_url}")
download_func = self.download_from_mirror
if download_func is None:
return False, "Download function not available"
download_callable = cast(Callable[[str, str], bool], download_func)
def download_wrapper():
return download_callable(mirror_url, str(output_path))
# Download (in thread)
try:
success = await loop.run_in_executor(None, download_wrapper)
if success:
# Validate downloaded file is not HTML (common Libgen issue)
if output_path.exists():
try:
with open(output_path, 'rb') as f:
file_start = f.read(1024).decode('utf-8', errors='ignore').lower()
if '<!doctype' in file_start or '<html' in file_start:
logger.warning(f"[UnifiedBookDownloader] Mirror {idx} returned HTML instead of file, trying next mirror...")
output_path.unlink() # Delete the HTML file
continue
except Exception as e:
logger.debug(f"[UnifiedBookDownloader] Could not validate file content: {e}")
logger.info(f"[UnifiedBookDownloader] Successfully downloaded from mirror {idx} to: {output_path}")
return True, str(output_path)
else:
logger.warning(f"[UnifiedBookDownloader] Mirror {idx} download failed, trying next...")
except Exception as e:
logger.warning(f"[UnifiedBookDownloader] Mirror {idx} error: {e}, trying next...")
continue
return False, f"All {len(results)} mirrors failed"
except Exception as e:
logger.error(f"[UnifiedBookDownloader] Libgen download error: {e}")
return False, f"Libgen download failed: {str(e)}"
async def download_libgen_selection(
self,
selected: Dict[str, Any],
remaining: Optional[List[Dict[str, Any]]] = None,
output_dir: Optional[str] = None,
) -> Tuple[bool, str]:
"""Download a specific Libgen result with optional fallbacks."""
if not isinstance(selected, dict):
return False, "Selected result must be a dictionary"
ordered_results: List[Dict[str, Any]] = [selected]
if remaining:
for item in remaining:
if isinstance(item, dict) and item is not selected:
ordered_results.append(item)
method: Dict[str, Any] = {
'type': 'libgen',
'isbn': selected.get('isbn', '') or '',
'title': selected.get('title', '') or '',
'author': selected.get('author', '') or '',
'results': ordered_results,
}
return await self.download_book(method, output_dir)
def download_libgen_selection_sync(
self,
selected: Dict[str, Any],
remaining: Optional[List[Dict[str, Any]]] = None,
output_dir: Optional[str] = None,
) -> Tuple[bool, str]:
"""Synchronous helper for downloading a Libgen selection."""
async def _run() -> Tuple[bool, str]:
return await self.download_libgen_selection(selected, remaining, output_dir)
loop = asyncio.new_event_loop()
try:
asyncio.set_event_loop(loop)
return loop.run_until_complete(_run())
finally:
loop.close()
asyncio.set_event_loop(None)
def _download_file(self, url: str, output_path: str) -> bool:
"""Download a file from URL."""
try:
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
with open(output_path, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
return True
except Exception as e:
logger.error(f"[UnifiedBookDownloader] File download error: {e}")
return False
def _archive_borrow_and_download(self, email: str, password: str, book_id: str, output_dir: str) -> Tuple[bool, str]:
"""Borrow a book from Archive.org and download pages as PDF.
This follows the exact process from archive_client.py:
1. Login with credentials
2. Call loan() to create 14-day borrow
3. Get book info (extract page URLs)
4. Download all pages as images
5. Merge images into searchable PDF
Returns tuple of (success: bool, filepath/message: str)
"""
try:
from helper.archive_client import login, loan, get_book_infos, download
import tempfile
import shutil
logger.info(f"[UnifiedBookDownloader] Logging into Archive.org as {email}")
session = login(email, password)
logger.info(f"[UnifiedBookDownloader] Attempting to borrow book: {book_id}")
# Call loan to create the 14-day borrow
session = loan(session, book_id, verbose=True)
# If we get here, borrowing succeeded
logger.info(f"[UnifiedBookDownloader] Successfully borrowed book: {book_id}")
# Now get the book info (page URLs and metadata)
logger.info(f"[UnifiedBookDownloader] Extracting book page information...")
# Try both URL formats: with /borrow and without
book_urls = [
f"https://archive.org/borrow/{book_id}", # Try borrow page first (for borrowed books)
f"https://archive.org/details/{book_id}" # Fallback to details page
]
title = None
links = None
metadata = None
last_error = None
for book_url in book_urls:
try:
logger.debug(f"[UnifiedBookDownloader] Trying to get book info from: {book_url}")
response = session.get(book_url, timeout=10)
# Log response status
if response.status_code != 200:
logger.debug(f"[UnifiedBookDownloader] URL returned {response.status_code}: {book_url}")
# Continue to try next URL
continue
# Try to parse the response
title, links, metadata = get_book_infos(session, book_url)
logger.info(f"[UnifiedBookDownloader] Successfully got info from: {book_url}")
logger.info(f"[UnifiedBookDownloader] Found {len(links)} pages to download")
break
except Exception as e:
logger.debug(f"[UnifiedBookDownloader] Failed with {book_url}: {e}")
last_error = e
continue
if links is None:
logger.error(f"[UnifiedBookDownloader] Failed to get book info from all URLs: {last_error}")
# Borrow extraction failed - return False
return False, "Could not extract borrowed book pages"
# Create temporary directory for images
temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=output_dir)
logger.info(f"[UnifiedBookDownloader] Downloading {len(links)} pages to temporary directory...")
try:
# Download all pages (uses thread pool)
images = download(
session=session,
n_threads=10,
directory=temp_dir,
links=links,
scale=3, # Default resolution
book_id=book_id
)
logger.info(f"[UnifiedBookDownloader] Downloaded {len(images)} pages")
# Try to merge pages into PDF
try:
import img2pdf
logger.info(f"[UnifiedBookDownloader] Merging pages into PDF...")
# Prepare PDF metadata
pdfmeta = {}
if metadata:
if "title" in metadata:
pdfmeta["title"] = metadata["title"]
if "creator" in metadata:
pdfmeta["author"] = metadata["creator"]
pdfmeta["keywords"] = [f"https://archive.org/details/{book_id}"]
pdfmeta["creationdate"] = None # Avoid timezone issues
# Convert images to PDF
pdf_content = img2pdf.convert(images, **pdfmeta) if images else None
if not pdf_content:
logger.error(f"[UnifiedBookDownloader] PDF conversion failed")
return False, "Failed to convert pages to PDF"
# Save the PDF
pdf_filename = f"{title}.pdf" if title else "book.pdf"
pdf_path = Path(output_dir) / pdf_filename
# Handle duplicate filenames
i = 1
while pdf_path.exists():
pdf_path = Path(output_dir) / f"{title or 'book'}({i}).pdf"
i += 1
with open(pdf_path, 'wb') as f:
f.write(pdf_content)
logger.info(f"[UnifiedBookDownloader] Successfully created PDF: {pdf_path}")
return True, str(pdf_path)
except ImportError:
logger.warning(f"[UnifiedBookDownloader] img2pdf not available, saving as JPG collection instead")
# Create JPG collection directory
if not title:
title = f"book_{book_id}"
jpg_dir = Path(output_dir) / title
i = 1
while jpg_dir.exists():
jpg_dir = Path(output_dir) / f"{title}({i})"
i += 1
# Move temporary directory to final location
shutil.move(temp_dir, str(jpg_dir))
temp_dir = None # Mark as already moved
logger.info(f"[UnifiedBookDownloader] Saved as JPG collection: {jpg_dir}")
return True, str(jpg_dir)
finally:
# Clean up temporary directory if it still exists
if temp_dir and Path(temp_dir).exists():
shutil.rmtree(temp_dir)
except SystemExit:
# loan() function calls sys.exit on failure - catch it
logger.error(f"[UnifiedBookDownloader] Borrow process exited (book may not be borrowable)")
return False, "Book could not be borrowed (may not be available for borrowing)"
except Exception as e:
logger.error(f"[UnifiedBookDownloader] Archive borrow error: {e}")
return False, f"Borrow failed: {str(e)}"
def close(self) -> None:
"""Close the session."""
self.session.close()
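# --- Usage sketch (illustrative only) ----------------------------------------
# Demonstrates the intended flow: collect download options for a book record
# and try the highest-priority method. The book_data values are placeholders;
# the keys match what get_download_options() reads above.
def _example_unified_download() -> Tuple[bool, str]:
    downloader = UnifiedBookDownloader(config={}, output_dir="/tmp")
    book_data = {
        "title": "Morals and Dogma",
        "author": "Albert Pike",
        "isbn": "0557677203",
        "openlibrary_id": "OL8513721M",
    }
    try:
        options = downloader.get_download_options(book_data)
        if not options["methods"]:
            return False, "No download method available"
        # download_book() is async; drive it with a temporary event loop.
        return asyncio.run(downloader.download_book(options["methods"][0], "/tmp"))
    finally:
        downloader.close()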

492
helper/utils.py Normal file
View File

@@ -0,0 +1,492 @@
"""General-purpose helpers used across the downlow CLI."""
from __future__ import annotations
import json
import hashlib
import ffmpeg
import base64
import logging
import time
from pathlib import Path
from typing import Any, Iterable
from datetime import datetime
from dataclasses import dataclass, field
from fnmatch import fnmatch
from urllib.parse import urlparse
import helper.utils_constant
try:
import cbor2
except ImportError:
cbor2 = None # type: ignore
CHUNK_SIZE = 1024 * 1024 # 1 MiB
_format_logger = logging.getLogger(__name__)
def ensure_directory(path: Path) -> None:
"""Ensure *path* exists as a directory."""
try:
path.mkdir(parents=True, exist_ok=True)
except OSError as exc: # pragma: no cover - surfaced to caller
raise RuntimeError(f"Failed to create directory {path}: {exc}") from exc
def unique_path(path: Path) -> Path:
"""Return a unique path by appending " (n)" if needed."""
if not path.exists():
return path
stem = path.stem
suffix = path.suffix
parent = path.parent
counter = 1
while True:
candidate = parent / f"{stem} ({counter}){suffix}"
if not candidate.exists():
return candidate
counter += 1
def sanitize_metadata_value(value: Any) -> str | None:
if value is None:
return None
if not isinstance(value, str):
value = str(value)
value = value.replace('\x00', ' ').replace('\r', ' ').replace('\n', ' ').strip()
if not value:
return None
return value
def unique_preserve_order(values: Iterable[str]) -> list[str]:
seen: set[str] = set()
ordered: list[str] = []
for value in values:
if value not in seen:
seen.add(value)
ordered.append(value)
return ordered
def sha256_file(file_path: Path) -> str:
"""Return the SHA-256 hex digest of *path*."""
hasher = hashlib.sha256()
with file_path.open('rb') as handle:
for chunk in iter(lambda: handle.read(CHUNK_SIZE), b''):
hasher.update(chunk)
return hasher.hexdigest()
def create_metadata_sidecar(file_path: Path, metadata: dict) -> None:
"""Create a .metadata sidecar file with JSON metadata.
The metadata dict should contain title. If not present, it will be derived from
the filename. This ensures the .metadata file can be matched during batch import.
Args:
file_path: Path to the exported file
metadata: Dictionary of metadata to save
"""
if not metadata:
return
file_name = file_path.stem
file_ext = file_path.suffix.lower()
# Ensure metadata has a title field that matches the filename (without extension)
# This allows the sidecar to be matched and imported properly during batch import
if 'title' not in metadata or not metadata.get('title'):
metadata['title'] = file_name
metadata['hash'] = sha256_file(file_path)
metadata['size'] = Path(file_path).stat().st_size
format_found = False
for mime_type, ext_map in helper.utils_constant.mime_maps.items():
for key, info in ext_map.items():
if info.get("ext") == file_ext:
metadata['type'] = mime_type
format_found = True
break
if format_found:
break
else:
metadata['type'] = 'unknown'
metadata.update(ffprobe(str(file_path)))
metadata_path = file_path.with_suffix(file_path.suffix + '.metadata')
try:
with open(metadata_path, 'w', encoding='utf-8') as f:
json.dump(metadata, f, ensure_ascii=False, indent=2)
except OSError as exc:
raise RuntimeError(f"Failed to write metadata sidecar {metadata_path}: {exc}") from exc
def create_tags_sidecar(file_path: Path, tags: set) -> None:
"""Create a .tags sidecar file with tags (one per line).
Args:
file_path: Path to the exported file
tags: Set of tag strings
"""
if not tags:
return
tags_path = file_path.with_suffix(file_path.suffix + '.tags')
try:
with open(tags_path, 'w', encoding='utf-8') as f:
for tag in sorted(tags):
f.write(f"{tag}\n")
except Exception as e:
raise RuntimeError(f"Failed to create tags sidecar {tags_path}: {e}") from e
def ffprobe(file_path: str) -> dict:
probe = ffmpeg.probe(file_path)
metadata = {}
# Format-level info
fmt = probe.get("format", {})
metadata["duration"] = float(fmt.get("duration", 0)) if "duration" in fmt else None
metadata["size"] = int(fmt.get("size", 0)) if "size" in fmt else None
metadata["format_name"] = fmt.get("format_name", None)
# Stream-level info
for stream in probe.get("streams", []):
codec_type = stream.get("codec_type")
if codec_type == "audio":
metadata["audio_codec"] = stream.get("codec_name")
metadata["bitrate"] = int(stream.get("bit_rate", 0)) if "bit_rate" in stream else None
metadata["samplerate"] = int(stream.get("sample_rate", 0)) if "sample_rate" in stream else None
metadata["channels"] = int(stream.get("channels", 0)) if "channels" in stream else None
elif codec_type == "video":
metadata["video_codec"] = stream.get("codec_name")
metadata["width"] = int(stream.get("width", 0)) if "width" in stream else None
metadata["height"] = int(stream.get("height", 0)) if "height" in stream else None
elif codec_type == "image":
metadata["image_codec"] = stream.get("codec_name")
metadata["width"] = int(stream.get("width", 0)) if "width" in stream else None
metadata["height"] = int(stream.get("height", 0)) if "height" in stream else None
return metadata
# ============================================================================
# CBOR Utilities - Consolidated from cbor.py
# ============================================================================
"""CBOR utilities backed by the `cbor2` library."""
def decode_cbor(data: bytes) -> Any:
"""Decode *data* from CBOR into native Python objects."""
if not data:
return None
if cbor2 is None:
raise ImportError("cbor2 library is required for CBOR decoding")
return cbor2.loads(data)
def jsonify(value: Any) -> Any:
"""Convert *value* into a JSON-friendly structure."""
if isinstance(value, dict):
return {str(key): jsonify(val) for key, val in value.items()}
if isinstance(value, list):
return [jsonify(item) for item in value]
if isinstance(value, bytes):
return {"__bytes__": base64.b64encode(value).decode("ascii")}
return value
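# --- Illustrative example -----------------------------------------------------
# A minimal round trip through decode_cbor() and jsonify(); this sketch needs
# the optional cbor2 dependency and the payload contents are made up.
def _example_cbor_roundtrip() -> str:
    if cbor2 is None:
        return "cbor2 not installed"
    payload = cbor2.dumps({"hash": b"\x00\x01", "tags": ["title:demo"]})
    decoded = decode_cbor(payload)       # bytes -> native Python objects
    return json.dumps(jsonify(decoded))  # bytes values become {"__bytes__": "..."}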
# ============================================================================
# Format Utilities - Consolidated from format_utils.py
# ============================================================================
"""Formatting utilities for displaying metadata consistently across the application."""
def format_bytes(bytes_value) -> str:
"""Format bytes to human-readable format (e.g., '1.5 MB', '250 KB').
Args:
bytes_value: Size in bytes (int or float)
Returns:
Formatted string like '1.5 MB' or '756 MB'
"""
if bytes_value is None or (isinstance(bytes_value, (int, float)) and bytes_value <= 0):
return "0 B"
if isinstance(bytes_value, (int, float)):
for unit in ("B", "KB", "MB", "GB", "TB"):
if bytes_value < 1024:
if unit == "B":
return f"{int(bytes_value)} {unit}"
return f"{bytes_value:.1f} {unit}"
bytes_value /= 1024
return f"{bytes_value:.1f} PB"
return str(bytes_value)
def format_duration(seconds) -> str:
"""Format duration in seconds to human-readable format (e.g., '1h 23m 5s', '5m 30s').
Args:
seconds: Duration in seconds (int or float)
Returns:
Formatted string like '1:23:45' or '5:30'
"""
if seconds is None or seconds == '':
return "N/A"
if isinstance(seconds, str):
try:
seconds = float(seconds)
except ValueError:
return str(seconds)
if not isinstance(seconds, (int, float)):
return str(seconds)
total_seconds = int(seconds)
if total_seconds < 0:
return "N/A"
hours = total_seconds // 3600
minutes = (total_seconds % 3600) // 60
secs = total_seconds % 60
if hours > 0:
return f"{hours}:{minutes:02d}:{secs:02d}"
elif minutes > 0:
return f"{minutes}:{secs:02d}"
else:
return f"{secs}s"
def format_timestamp(timestamp_str) -> str:
"""Format ISO timestamp to readable format.
Args:
timestamp_str: ISO format timestamp string or None
Returns:
Formatted string like "2025-10-28 19:36:01" or original string if parsing fails
"""
if not timestamp_str:
return "N/A"
try:
# Handle ISO format timestamps
if isinstance(timestamp_str, str):
# Try parsing ISO format
if 'T' in timestamp_str:
dt = datetime.fromisoformat(timestamp_str.replace('Z', '+00:00'))
else:
# Try other common formats
dt = datetime.fromisoformat(timestamp_str)
return dt.strftime("%Y-%m-%d %H:%M:%S")
except Exception as e:
_format_logger.debug(f"Could not parse timestamp '{timestamp_str}': {e}")
return str(timestamp_str)
def format_metadata_value(key: str, value) -> str:
"""Format a metadata value based on its key for display.
This is the central formatting rule for all metadata display.
Args:
key: Metadata field name
value: Value to format
Returns:
Formatted string for display
"""
if value is None or value == '':
return "N/A"
# Apply field-specific formatting
if key in ('size', 'file_size'):
return format_bytes(value)
elif key in ('duration', 'length'):
return format_duration(value)
elif key in ('time_modified', 'time_imported', 'created_at', 'updated_at', 'indexed_at', 'timestamp'):
return format_timestamp(value)
else:
return str(value)
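# --- Illustrative example -----------------------------------------------------
# How the formatting rules above combine for a metadata row; the values are
# made up but exercise the size, duration and timestamp branches.
def _example_format_row() -> list[str]:
    row = {"size": 1536 * 1024, "duration": 125, "time_imported": "2025-10-28T19:36:01"}
    # -> ["size: 1.5 MB", "duration: 2:05", "time_imported: 2025-10-28 19:36:01"]
    return [f"{key}: {format_metadata_value(key, value)}" for key, value in row.items()]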
# ============================================================================
# Link Utilities - Consolidated from link_utils.py
# ============================================================================
"""Link utilities - Extract and process URLs from various sources."""
def extract_link_from_args(args: Iterable[str]) -> Any | None:
"""Extract HTTP/HTTPS URL from command arguments.
Args:
args: Command arguments
Returns:
URL string if found, None otherwise
"""
args_list = list(args) if not isinstance(args, (list, tuple)) else args
if not args_list or len(args_list) == 0:
return None
potential_link = str(args_list[0])
if potential_link.startswith(('http://', 'https://')):
return potential_link
return None
def extract_link_from_result(result: Any) -> Any | None:
"""Extract URL from a result object (dict or object with attributes).
Args:
result: Result object from pipeline (dict or object)
Returns:
URL string if found, None otherwise
"""
if isinstance(result, dict):
return result.get('url') or result.get('link') or result.get('href')
return (
getattr(result, 'url', None) or
getattr(result, 'link', None) or
getattr(result, 'href', None)
)
def extract_link(result: Any, args: Iterable[str]) -> Any | None:
"""Extract link from args or result (args take priority).
Args:
result: Pipeline result object
args: Command arguments
Returns:
URL string if found, None otherwise
"""
# Try args first
link = extract_link_from_args(args)
if link:
return link
# Fall back to result
return extract_link_from_result(result)
def get_api_key(config: dict[str, Any], service: str, key_path: str) -> str | None:
"""Get API key from config with fallback support.
Args:
config: Configuration dictionary
service: Service name for logging
key_path: Dot-notation path to key (e.g., "Debrid.All-debrid")
Returns:
API key if found and not empty, None otherwise
"""
try:
parts = key_path.split('.')
value = config
for part in parts:
if isinstance(value, dict):
value = value.get(part)
else:
return None
if isinstance(value, str):
return value.strip() or None
return None
except Exception:
return None
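# --- Illustrative example -----------------------------------------------------
# Dot-notation lookup as described in the docstring; the config layout is a
# hypothetical sketch mirroring the "Debrid.All-debrid" example above.
def _example_get_api_key() -> str | None:
    config = {"Debrid": {"All-debrid": "  abc123  "}}
    return get_api_key(config, service="debrid", key_path="Debrid.All-debrid")  # -> "abc123"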
def add_direct_link_to_result(result: Any, direct_link: str, original_link: str) -> None:
"""Add direct link information to result object.
Args:
result: Result object to modify (dict or object)
direct_link: The unlocked/direct URL
original_link: The original restricted URL
"""
if isinstance(result, dict):
result['direct_link'] = direct_link
result['original_link'] = original_link
else:
setattr(result, 'direct_link', direct_link)
setattr(result, 'original_link', original_link)
# ============================================================================
# URL Policy Resolution - Consolidated from url_parser.py
# ============================================================================
"""URL policy resolution for downlow workflows."""
@dataclass(slots=True)
class UrlPolicy:
"""Describe how a URL should be handled by download and screenshot flows."""
skip_download: bool = False
skip_metadata: bool = False
force_screenshot: bool = False
extra_tags: list[str] = field(default_factory=list)
def apply_tags(self, sources: Iterable[str]) -> list[str]:
tags = [tag.strip() for tag in self.extra_tags if tag and tag.strip()]
for value in sources:
text = str(value).strip()
if text:
tags.append(text)
return tags
def _normalise_rule(rule: dict[str, Any]) -> dict[str, Any] | None:
pattern = str(rule.get("pattern") or rule.get("host") or "").strip()
if not pattern:
return None
skip_download = bool(rule.get("skip_download"))
skip_metadata = bool(rule.get("skip_metadata"))
force_screenshot = bool(rule.get("force_screenshot"))
extra_tags_raw = rule.get("extra_tags")
if isinstance(extra_tags_raw, str):
extra_tags = [part.strip() for part in extra_tags_raw.split(",") if part.strip()]
elif isinstance(extra_tags_raw, (list, tuple, set)):
extra_tags = [str(item).strip() for item in extra_tags_raw if str(item).strip()]
else:
extra_tags = []
return {
"pattern": pattern,
"skip_download": skip_download,
"skip_metadata": skip_metadata,
"force_screenshot": force_screenshot,
"extra_tags": extra_tags,
}
def resolve_url_policy(config: dict[str, Any], url: str) -> UrlPolicy:
policies_raw = config.get("url_policies")
if not policies_raw:
return UrlPolicy()
if not isinstance(policies_raw, list):
return UrlPolicy()
parsed = urlparse(url)
subject = f"{parsed.netloc}{parsed.path}"
host = parsed.netloc
resolved = UrlPolicy()
for rule_raw in policies_raw:
if not isinstance(rule_raw, dict):
continue
rule = _normalise_rule(rule_raw)
if rule is None:
continue
pattern = rule["pattern"]
if not (fnmatch(host, pattern) or fnmatch(subject, pattern)):
continue
if rule["skip_download"]:
resolved.skip_download = True
if rule["skip_metadata"]:
resolved.skip_metadata = True
if rule["force_screenshot"]:
resolved.force_screenshot = True
if rule["extra_tags"]:
for tag in rule["extra_tags"]:
if tag not in resolved.extra_tags:
resolved.extra_tags.append(tag)
return resolved
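# Illustrative sketch (not part of the original module): shows how a
# config["url_policies"] list is matched against the host and host+path of a
# URL using fnmatch-style patterns. The rule values below are assumptions chosen
# to exercise skip_download, force_screenshot and extra_tags merging.
def _example_resolve_url_policy() -> None:
    config = {
        "url_policies": [
            {"pattern": "*.example.com", "skip_download": True, "extra_tags": "site:example, nsfw"},
            {"pattern": "*.example.com/gallery/*", "force_screenshot": True, "extra_tags": ["gallery"]},
        ]
    }
    policy = resolve_url_policy(config, "https://img.example.com/gallery/42")
    assert policy.skip_download and policy.force_screenshot
    assert policy.extra_tags == ["site:example", "nsfw", "gallery"]
    # apply_tags merges the rule tags with any additional sources.
    assert policy.apply_tags(["artist:someone"]) == ["site:example", "nsfw", "gallery", "artist:someone"]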

79
helper/utils_constant.py Normal file
View File

@@ -0,0 +1,79 @@
mime_maps = {
"image": {
"jpg": { "ext": ".jpg", "mimes": ["image/jpeg", "image/jpg"] },
"png": { "ext": ".png", "mimes": ["image/png"] },
"gif": { "ext": ".gif", "mimes": ["image/gif"] },
"webp": { "ext": ".webp", "mimes": ["image/webp"] },
"avif": { "ext": ".avif", "mimes": ["image/avif"] },
"jxl": { "ext": ".jxl", "mimes": ["image/jxl"] },
"bmp": { "ext": ".bmp", "mimes": ["image/bmp"] },
"heic": { "ext": ".heic", "mimes": ["image/heic"] },
"heif": { "ext": ".heif", "mimes": ["image/heif"] },
"ico": { "ext": ".ico", "mimes": ["image/x-icon", "image/vnd.microsoft.icon"] },
"qoi": { "ext": ".qoi", "mimes": ["image/qoi"] },
"tiff": { "ext": ".tiff", "mimes": ["image/tiff", "image/x-tiff"] },
"svg": { "ext": ".svg", "mimes": ["image/svg+xml"] }
},
"image_sequence": {
"apng": { "ext": ".apng", "mimes": ["image/apng"], "sequence": True },
"avifs": { "ext": ".avifs", "mimes": ["image/avif-sequence"], "sequence": True },
"heics": { "ext": ".heics", "mimes": ["image/heic-sequence"], "sequence": True },
"heifs": { "ext": ".heifs", "mimes": ["image/heif-sequence"], "sequence": True }
},
"video": {
"mp4": { "ext": ".mp4", "mimes": ["video/mp4", "audio/mp4"] },
"webm": { "ext": ".webm", "mimes": ["video/webm", "audio/webm"] },
"mov": { "ext": ".mov", "mimes": ["video/quicktime"] },
"ogv": { "ext": ".ogv", "mimes": ["video/ogg"] },
"mpeg": { "ext": ".mpeg", "mimes": ["video/mpeg"] },
"avi": { "ext": ".avi", "mimes": ["video/x-msvideo", "video/avi"] },
"flv": { "ext": ".flv", "mimes": ["video/x-flv"] },
"mkv": { "ext": ".mkv", "mimes": ["video/x-matroska", "application/x-matroska"], "audio_only_ext": ".mka" },
"wmv": { "ext": ".wmv", "mimes": ["video/x-ms-wmv"] },
"rv": { "ext": ".rv", "mimes": ["video/vnd.rn-realvideo"] }
},
"audio": {
"mp3": { "ext": ".mp3", "mimes": ["audio/mpeg", "audio/mp3"] },
"m4a": { "ext": ".m4a", "mimes": ["audio/mp4", "audio/x-m4a"] },
"ogg": { "ext": ".ogg", "mimes": ["audio/ogg"] },
"flac": { "ext": ".flac", "mimes": ["audio/flac"] },
"wav": { "ext": ".wav", "mimes": ["audio/wav", "audio/x-wav", "audio/vnd.wave"] },
"wma": { "ext": ".wma", "mimes": ["audio/x-ms-wma"] },
"tta": { "ext": ".tta", "mimes": ["audio/x-tta"] },
"wv": { "ext": ".wv", "mimes": ["audio/x-wavpack", "audio/wavpack"] },
"mka": { "ext": ".mka", "mimes": ["audio/x-matroska", "video/x-matroska"] }
},
"document": {
"pdf": { "ext": ".pdf", "mimes": ["application/pdf"] },
"epub": { "ext": ".epub", "mimes": ["application/epub+zip"] },
"djvu": { "ext": ".djvu", "mimes": ["application/vnd.djvu"] },
"rtf": { "ext": ".rtf", "mimes": ["application/rtf"] },
"docx": { "ext": ".docx", "mimes": ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"] },
"xlsx": { "ext": ".xlsx", "mimes": ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"] },
"pptx": { "ext": ".pptx", "mimes": ["application/vnd.openxmlformats-officedocument.presentationml.presentation"] },
"doc": { "ext": ".doc", "mimes": ["application/msword"] },
"xls": { "ext": ".xls", "mimes": ["application/vnd.ms-excel"] },
"ppt": { "ext": ".ppt", "mimes": ["application/vnd.ms-powerpoint"] }
},
"archive": {
"zip": { "ext": ".zip", "mimes": ["application/zip"] },
"7z": { "ext": ".7z", "mimes": ["application/x-7z-compressed"] },
"rar": { "ext": ".rar", "mimes": ["application/x-rar-compressed", "application/vnd.rar"] },
"gz": { "ext": ".gz", "mimes": ["application/gzip", "application/x-gzip"] },
"tar": { "ext": ".tar", "mimes": ["application/x-tar"] },
"cbz": { "ext": ".cbz", "mimes": ["application/zip"], "note": "zip archive of images; prefer extension-based detection for comics" }
},
"project": {
"clip": { "ext": ".clip", "mimes": ["application/clip"] },
"kra": { "ext": ".kra", "mimes": ["application/x-krita"] },
"procreate": { "ext": ".procreate", "mimes": ["application/x-procreate"] },
"psd": { "ext": ".psd", "mimes": ["image/vnd.adobe.photoshop"] },
"swf": { "ext": ".swf", "mimes": ["application/x-shockwave-flash"] }
},
"other": {
"octet-stream": { "ext": "", "mimes": ["application/octet-stream"] },
"json": { "ext": ".json", "mimes": ["application/json"] },
"xml": { "ext": ".xml", "mimes": ["application/xml", "text/xml"] },
"csv": { "ext": ".csv", "mimes": ["text/csv"] }
}
}
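# Illustrative sketch (not part of the original file): one way a caller might
# resolve a MIME type back to a category and extension by scanning mime_maps.
# The helper name and behaviour are assumptions, not part of the table itself.
def _lookup_by_mime(mime: str) -> tuple[str, str] | None:
    """Return (category, extension) for a MIME type, or None if unknown."""
    for category, formats in mime_maps.items():
        for info in formats.values():
            if mime in info["mimes"]:
                return category, info["ext"]
    return None
# Example: _lookup_by_mime("video/x-matroska") -> ("video", ".mkv")
# (the "mka" audio entry also lists this MIME, but the video entry is reached first).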

655
helper/worker_manager.py Normal file
View File

@@ -0,0 +1,655 @@
"""Worker task management with persistent database storage.
Manages worker tasks for downloads, searches, imports, etc. with automatic
persistence to database and optional auto-refresh callbacks.
"""
import logging
from pathlib import Path
from typing import Optional, Dict, Any, List, Callable
from datetime import datetime
from threading import Thread, Lock
import time
from .local_library import LocalLibraryDB
from helper.logger import log
logger = logging.getLogger(__name__)
class Worker:
"""Represents a single worker task with state management."""
def __init__(self, worker_id: str, worker_type: str, title: str = "",
description: str = "", manager: Optional['WorkerManager'] = None):
"""Initialize a worker.
Args:
worker_id: Unique identifier for this worker
worker_type: Type of work (e.g., 'download', 'search', 'import')
title: Human-readable title
description: Detailed description
manager: Reference to parent WorkerManager for state updates
"""
self.id = worker_id
self.type = worker_type
self.title = title or worker_type
self.description = description
self.manager = manager
self.status = "running"
self.progress = ""
self.details = ""
self.error_message = ""
self.result = "pending"
self._stdout_buffer = []
self._steps_buffer = []
def log_step(self, step_text: str) -> None:
"""Log a step for this worker.
Args:
step_text: Text describing the step
"""
try:
if self.manager:
self.manager.log_step(self.id, step_text)
else:
logger.info(f"[{self.id}] {step_text}")
except Exception as e:
logger.error(f"Error logging step for worker {self.id}: {e}")
def append_stdout(self, text: str) -> None:
"""Append text to stdout log.
Args:
text: Text to append
"""
try:
if self.manager:
self.manager.append_worker_stdout(self.id, text)
else:
self._stdout_buffer.append(text)
except Exception as e:
logger.error(f"Error appending stdout for worker {self.id}: {e}")
def get_stdout(self) -> str:
"""Get all stdout for this worker.
Returns:
Complete stdout text
"""
try:
if self.manager:
return self.manager.get_stdout(self.id)
else:
return "\n".join(self._stdout_buffer)
except Exception as e:
logger.error(f"Error getting stdout for worker {self.id}: {e}")
return ""
def get_steps(self) -> str:
"""Get all steps for this worker.
Returns:
Complete steps text
"""
try:
if self.manager:
return self.manager.get_steps(self.id)
else:
return "\n".join(self._steps_buffer)
except Exception as e:
logger.error(f"Error getting steps for worker {self.id}: {e}")
return ""
def update_progress(self, progress: str = "", details: str = "") -> None:
"""Update worker progress.
Args:
progress: Progress string (e.g., "50%")
details: Additional details
"""
self.progress = progress
self.details = details
        try:
            if self.manager:
                # Pass keywords: the worker-level progress string belongs in the
                # manager's current_step slot, not its numeric progress parameter.
                self.manager.update_worker(self.id, current_step=progress, details=details)
except Exception as e:
logger.error(f"Error updating worker {self.id}: {e}")
def finish(self, result: str = "completed", message: str = "") -> None:
"""Mark worker as finished.
Args:
result: Result status ('completed', 'error', 'cancelled')
message: Result message/error details
"""
self.result = result
self.status = "finished"
self.error_message = message
try:
if self.manager:
# Flush and disable logging handler before marking finished
self.manager.disable_logging_for_worker(self.id)
# Then mark as finished in database
self.manager.finish_worker(self.id, result, message)
except Exception as e:
logger.error(f"Error finishing worker {self.id}: {e}")
class WorkerLoggingHandler(logging.StreamHandler):
"""Custom logging handler that captures logs for a worker."""
def __init__(self, worker_id: str, db: LocalLibraryDB,
manager: Optional['WorkerManager'] = None,
buffer_size: int = 50):
"""Initialize the handler.
Args:
worker_id: ID of the worker to capture logs for
db: Reference to LocalLibraryDB for storing logs
buffer_size: Number of logs to buffer before flushing to DB
"""
super().__init__()
self.worker_id = worker_id
self.db = db
self.manager = manager
self.buffer_size = buffer_size
self.buffer = []
self._lock = Lock()
# Set a format that includes timestamp and level
formatter = logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
self.setFormatter(formatter)
def emit(self, record):
"""Emit a log record."""
try:
            # Try to format the record normally; the configured formatter already
            # prepends timestamp, logger name and level.
            try:
                msg = self.format(record)
            except (TypeError, ValueError):
                # If formatting fails (e.g., a %d placeholder with a non-int arg),
                # build the message manually without calling getMessage()
                try:
                    # Try to interpolate args if possible
                    if record.args:
                        msg = record.msg % record.args
                    else:
                        msg = record.msg
                except (TypeError, ValueError):
                    # If that fails too, just use the raw message string
                    msg = str(record.msg)
                # Only this fallback path needs the prefix added by hand;
                # reuse the module-level time import for the timestamp.
                timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(record.created))
                msg = f"{timestamp} - {record.name} - {record.levelname} - {msg}"
with self._lock:
self.buffer.append(msg)
# Flush to DB when buffer reaches size
if len(self.buffer) >= self.buffer_size:
self._flush()
except Exception:
self.handleError(record)
def _flush(self):
"""Flush buffered logs to database."""
if self.buffer:
log_text = '\n'.join(self.buffer)
try:
if self.manager:
self.manager.append_worker_stdout(self.worker_id, log_text, channel='log')
else:
self.db.append_worker_stdout(self.worker_id, log_text, channel='log')
except Exception as e:
# If we can't write to DB, at least log it
log(f"Error flushing worker logs: {e}")
self.buffer = []
def flush(self):
"""Flush any buffered records."""
with self._lock:
self._flush()
super().flush()
def close(self):
"""Close the handler."""
self.flush()
super().close()
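# Illustrative sketch (not part of the original module): attaching a
# WorkerLoggingHandler directly to a logger buffers records and flushes them to
# LocalLibraryDB.append_worker_stdout on close. The library root path and worker
# id are assumptions; in practice the handler is normally created through
# WorkerManager.enable_logging_for_worker.
def _example_direct_handler(library_root: Path) -> None:
    db = LocalLibraryDB(library_root)
    db.insert_worker("demo_worker", "demo", "Direct handler demo", "", 0)
    handler = WorkerLoggingHandler("demo_worker", db, buffer_size=10)
    demo_logger = logging.getLogger("downlow.demo")
    demo_logger.addHandler(handler)
    try:
        demo_logger.warning("captured for worker %s", "demo_worker")
    finally:
        demo_logger.removeHandler(handler)
        handler.close()  # flushes any buffered lines to the database
        db.close()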
class WorkerManager:
"""Manages persistent worker tasks with auto-refresh capability."""
def __init__(self, library_root: Path, auto_refresh_interval: float = 2.0):
"""Initialize the worker manager.
Args:
library_root: Root directory for the local library database
auto_refresh_interval: Seconds between auto-refresh checks (0 = disabled)
"""
self.library_root = Path(library_root)
self.db = LocalLibraryDB(library_root)
self.auto_refresh_interval = auto_refresh_interval
self.refresh_callbacks: List[Callable] = []
self.refresh_thread: Optional[Thread] = None
self._stop_refresh = False
self._lock = Lock()
self.worker_handlers: Dict[str, WorkerLoggingHandler] = {} # Track active handlers
self._worker_last_step: Dict[str, str] = {}
def add_refresh_callback(self, callback: Callable[[List[Dict[str, Any]]], None]) -> None:
"""Register a callback to be called on worker updates.
Args:
callback: Function that receives list of active workers
"""
with self._lock:
self.refresh_callbacks.append(callback)
def expire_running_workers(
self,
older_than_seconds: int = 300,
worker_id_prefix: Optional[str] = None,
reason: Optional[str] = None,
status: str = "error",
) -> int:
"""Mark stale running workers as finished.
Args:
older_than_seconds: Idle threshold before expiring.
worker_id_prefix: Optional wildcard filter (e.g., 'cli_%').
reason: Error message if none already exists.
status: New status to apply.
Returns:
Count of workers updated.
"""
try:
return self.db.expire_running_workers(
older_than_seconds=older_than_seconds,
status=status,
reason=reason,
worker_id_prefix=worker_id_prefix,
)
except Exception as exc:
logger.error(f"Failed to expire stale workers: {exc}", exc_info=True)
return 0
def remove_refresh_callback(self, callback: Callable) -> None:
"""Remove a refresh callback.
Args:
callback: The callback function to remove
"""
with self._lock:
if callback in self.refresh_callbacks:
self.refresh_callbacks.remove(callback)
def enable_logging_for_worker(self, worker_id: str) -> Optional[WorkerLoggingHandler]:
"""Enable logging capture for a worker.
Creates a logging handler that captures all logs for this worker.
Args:
worker_id: ID of the worker to capture logs for
Returns:
The logging handler that was created, or None if there was an error
"""
try:
handler = WorkerLoggingHandler(worker_id, self.db, manager=self)
with self._lock:
self.worker_handlers[worker_id] = handler
# Add the handler to the root logger so it captures all logs
root_logger = logging.getLogger()
root_logger.addHandler(handler)
root_logger.setLevel(logging.DEBUG) # Capture all levels
logger.debug(f"[WorkerManager] Enabled logging for worker: {worker_id}")
return handler
except Exception as e:
logger.error(f"[WorkerManager] Error enabling logging for worker {worker_id}: {e}", exc_info=True)
return None
def disable_logging_for_worker(self, worker_id: str) -> None:
"""Disable logging capture for a worker and flush any pending logs.
Args:
worker_id: ID of the worker to stop capturing logs for
"""
try:
with self._lock:
handler = self.worker_handlers.pop(worker_id, None)
if handler:
# Flush and close the handler
handler.flush()
handler.close()
# Remove from root logger
root_logger = logging.getLogger()
root_logger.removeHandler(handler)
logger.debug(f"[WorkerManager] Disabled logging for worker: {worker_id}")
except Exception as e:
logger.error(f"[WorkerManager] Error disabling logging for worker {worker_id}: {e}", exc_info=True)
def track_worker(self, worker_id: str, worker_type: str, title: str = "",
description: str = "", total_steps: int = 0,
pipe: Optional[str] = None) -> bool:
"""Start tracking a new worker.
Args:
worker_id: Unique identifier for the worker
worker_type: Type of worker (e.g., 'download', 'search', 'import')
title: Worker title/name
description: Worker description
total_steps: Total number of steps for progress tracking
pipe: Text of the originating pipe/prompt, if any
Returns:
True if worker was inserted successfully
"""
try:
result = self.db.insert_worker(worker_id, worker_type, title, description, total_steps, pipe=pipe)
if result > 0:
logger.debug(f"[WorkerManager] Tracking worker: {worker_id} ({worker_type})")
self._start_refresh_if_needed()
return True
return False
except Exception as e:
logger.error(f"[WorkerManager] Error tracking worker: {e}", exc_info=True)
return False
def update_worker(self, worker_id: str, progress: float = 0.0, current_step: str = "",
details: str = "", error: str = "") -> bool:
"""Update worker progress and status.
Args:
worker_id: Unique identifier for the worker
progress: Progress percentage (0-100)
current_step: Current step description
details: Additional details
error: Error message if any
Returns:
True if update was successful
"""
try:
kwargs = {}
if progress > 0:
kwargs['progress'] = progress
if current_step:
kwargs['current_step'] = current_step
if details:
kwargs['description'] = details
if error:
kwargs['error_message'] = error
if kwargs:
kwargs['last_updated'] = datetime.now().isoformat()
if 'current_step' in kwargs and kwargs['current_step']:
self._worker_last_step[worker_id] = str(kwargs['current_step'])
return self.db.update_worker(worker_id, **kwargs)
return True
except Exception as e:
logger.error(f"[WorkerManager] Error updating worker {worker_id}: {e}", exc_info=True)
return False
def finish_worker(self, worker_id: str, result: str = "completed",
error_msg: str = "", result_data: str = "") -> bool:
"""Mark a worker as finished.
Args:
worker_id: Unique identifier for the worker
result: Result status ('completed', 'error', 'cancelled')
error_msg: Error message if any
result_data: Result data as JSON string
Returns:
True if update was successful
"""
try:
kwargs = {
'status': result,
'completed_at': datetime.now().isoformat()
}
if error_msg:
kwargs['error_message'] = error_msg
if result_data:
kwargs['result_data'] = result_data
success = self.db.update_worker(worker_id, **kwargs)
logger.info(f"[WorkerManager] Worker finished: {worker_id} ({result})")
self._worker_last_step.pop(worker_id, None)
return success
except Exception as e:
logger.error(f"[WorkerManager] Error finishing worker {worker_id}: {e}", exc_info=True)
return False
def get_active_workers(self) -> List[Dict[str, Any]]:
"""Get all active (running) workers.
Returns:
List of active worker dictionaries
"""
try:
return self.db.get_active_workers()
except Exception as e:
logger.error(f"[WorkerManager] Error getting active workers: {e}", exc_info=True)
return []
def get_finished_workers(self, limit: int = 100) -> List[Dict[str, Any]]:
"""Get all finished workers (completed, errored, or cancelled).
Args:
limit: Maximum number of workers to retrieve
Returns:
List of finished worker dictionaries
"""
try:
all_workers = self.db.get_all_workers(limit=limit)
# Filter to only finished workers
finished = [w for w in all_workers if w.get('status') in ['completed', 'error', 'cancelled']]
return finished
except Exception as e:
logger.error(f"[WorkerManager] Error getting finished workers: {e}", exc_info=True)
return []
def get_worker(self, worker_id: str) -> Optional[Dict[str, Any]]:
"""Get a specific worker's data.
Args:
worker_id: Unique identifier for the worker
Returns:
Worker data or None if not found
"""
try:
return self.db.get_worker(worker_id)
except Exception as e:
logger.error(f"[WorkerManager] Error getting worker {worker_id}: {e}", exc_info=True)
return None
def get_worker_events(self, worker_id: str, limit: int = 500) -> List[Dict[str, Any]]:
"""Fetch recorded worker timeline events."""
return self.db.get_worker_events(worker_id, limit)
def log_step(self, worker_id: str, step_text: str) -> bool:
"""Log a step to a worker's step history.
Args:
worker_id: Unique identifier for the worker
step_text: Step description to log
Returns:
True if successful
"""
try:
success = self.db.append_worker_steps(worker_id, step_text)
if success:
self._worker_last_step[worker_id] = step_text
return success
except Exception as e:
logger.error(f"[WorkerManager] Error logging step for worker {worker_id}: {e}", exc_info=True)
return False
def _get_last_step(self, worker_id: str) -> Optional[str]:
"""Return the most recent step description for a worker."""
return self._worker_last_step.get(worker_id)
def get_steps(self, worker_id: str) -> str:
"""Get step logs for a worker.
Args:
worker_id: Unique identifier for the worker
Returns:
Steps text or empty string if not found
"""
try:
return self.db.get_worker_steps(worker_id)
except Exception as e:
logger.error(f"[WorkerManager] Error getting steps for worker {worker_id}: {e}", exc_info=True)
return ''
def start_auto_refresh(self) -> None:
"""Start the auto-refresh thread for periodic worker updates."""
if self.auto_refresh_interval <= 0:
logger.debug("[WorkerManager] Auto-refresh disabled (interval <= 0)")
return
if self.refresh_thread and self.refresh_thread.is_alive():
logger.debug("[WorkerManager] Auto-refresh already running")
return
logger.info(f"[WorkerManager] Starting auto-refresh with {self.auto_refresh_interval}s interval")
self._stop_refresh = False
self.refresh_thread = Thread(target=self._auto_refresh_loop, daemon=True)
self.refresh_thread.start()
def stop_auto_refresh(self) -> None:
"""Stop the auto-refresh thread."""
logger.info("[WorkerManager] Stopping auto-refresh")
self._stop_refresh = True
if self.refresh_thread:
self.refresh_thread.join(timeout=5)
self.refresh_thread = None
def _start_refresh_if_needed(self) -> None:
"""Start auto-refresh if we have active workers and callbacks."""
active = self.get_active_workers()
if active and self.refresh_callbacks and not self._stop_refresh:
self.start_auto_refresh()
def _auto_refresh_loop(self) -> None:
"""Main auto-refresh loop that periodically queries and notifies."""
try:
while not self._stop_refresh:
time.sleep(self.auto_refresh_interval)
# Check if there are active workers
active = self.get_active_workers()
if not active:
# No more active workers, stop refreshing
logger.debug("[WorkerManager] No active workers, stopping auto-refresh")
break
# Call all registered callbacks with the active workers
with self._lock:
for callback in self.refresh_callbacks:
try:
callback(active)
except Exception as e:
logger.error(f"[WorkerManager] Error in refresh callback: {e}", exc_info=True)
except Exception as e:
logger.error(f"[WorkerManager] Error in auto-refresh loop: {e}", exc_info=True)
finally:
logger.debug("[WorkerManager] Auto-refresh loop ended")
def cleanup_old_workers(self, days: int = 7) -> int:
"""Clean up completed/errored workers older than specified days.
Args:
days: Delete workers completed more than this many days ago
Returns:
Number of workers deleted
"""
try:
count = self.db.cleanup_old_workers(days)
if count > 0:
logger.info(f"[WorkerManager] Cleaned up {count} old workers")
return count
except Exception as e:
logger.error(f"[WorkerManager] Error cleaning up old workers: {e}", exc_info=True)
return 0
def append_stdout(self, worker_id: str, text: str, channel: str = "stdout") -> bool:
"""Append text to a worker's stdout log.
Args:
worker_id: Unique identifier for the worker
text: Text to append
channel: Logical channel (stdout, stderr, log, etc.)
Returns:
True if append was successful
"""
try:
step_label = self._get_last_step(worker_id)
return self.db.append_worker_stdout(worker_id, text, step=step_label, channel=channel)
except Exception as e:
logger.error(f"[WorkerManager] Error appending stdout: {e}", exc_info=True)
return False
def get_stdout(self, worker_id: str) -> str:
"""Get stdout logs for a worker.
Args:
worker_id: Unique identifier for the worker
Returns:
Worker's stdout or empty string
"""
try:
return self.db.get_worker_stdout(worker_id)
except Exception as e:
logger.error(f"[WorkerManager] Error getting stdout: {e}", exc_info=True)
return ""
def append_worker_stdout(self, worker_id: str, text: str, channel: str = "stdout") -> bool:
"""Compatibility wrapper for append_stdout."""
return self.append_stdout(worker_id, text, channel=channel)
def clear_stdout(self, worker_id: str) -> bool:
"""Clear stdout logs for a worker.
Args:
worker_id: Unique identifier for the worker
Returns:
True if clear was successful
"""
try:
return self.db.clear_worker_stdout(worker_id)
except Exception as e:
logger.error(f"[WorkerManager] Error clearing stdout: {e}", exc_info=True)
return False
def close(self) -> None:
"""Close the worker manager and database connection."""
self.stop_auto_refresh()
self.db.close()
logger.info("[WorkerManager] Closed")