AST

helper/__init__.py (new file, 92 lines)
@@ -0,0 +1,92 @@
"""Helper modules for the downlow mpv integration."""
|
||||
from . import hydrus as _hydrus
|
||||
from . import download as _download
|
||||
from . import tasks as _tasks
|
||||
from . import utils as _utils
|
||||
|
||||
try: # Optional dependency on Playwright
|
||||
from . import webshot as _webshot
|
||||
except Exception as exc: # pragma: no cover - surfaced when Playwright is missing
|
||||
_webshot = None # type: ignore
|
||||
ScreenshotError = None # type: ignore[assignment]
|
||||
ScreenshotOptions = None # type: ignore[assignment]
|
||||
ScreenshotResult = None # type: ignore[assignment]
|
||||
capture_screenshot = None # type: ignore[assignment]
|
||||
ScreenshotImportError = exc # type: ignore[assignment]
|
||||
else:
|
||||
ScreenshotError = _webshot.ScreenshotError
|
||||
ScreenshotOptions = _webshot.ScreenshotOptions
|
||||
ScreenshotResult = _webshot.ScreenshotResult
|
||||
capture_screenshot = _webshot.capture_screenshot
|
||||
ScreenshotImportError = None
|
||||
# CBOR utilities
|
||||
decode_cbor = _utils.decode_cbor
|
||||
jsonify = _utils.jsonify
|
||||
# General utilities
|
||||
CHUNK_SIZE = _utils.CHUNK_SIZE
|
||||
ensure_directory = _utils.ensure_directory
|
||||
unique_path = _utils.unique_path
|
||||
download_hydrus_file = _hydrus.download_hydrus_file
|
||||
sanitize_metadata_value = _utils.sanitize_metadata_value
|
||||
unique_preserve_order = _utils.unique_preserve_order
|
||||
sha256_file = _utils.sha256_file
|
||||
create_metadata_sidecar = _utils.create_metadata_sidecar
|
||||
create_tags_sidecar = _utils.create_tags_sidecar
|
||||
# Format utilities
|
||||
format_bytes = _utils.format_bytes
|
||||
format_duration = _utils.format_duration
|
||||
format_timestamp = _utils.format_timestamp
|
||||
format_metadata_value = _utils.format_metadata_value
|
||||
# Link utilities
|
||||
extract_link = _utils.extract_link
|
||||
extract_link_from_args = _utils.extract_link_from_args
|
||||
extract_link_from_result = _utils.extract_link_from_result
|
||||
get_api_key = _utils.get_api_key
|
||||
add_direct_link_to_result = _utils.add_direct_link_to_result
|
||||
# URL policy utilities
|
||||
resolve_url_policy = _utils.resolve_url_policy
|
||||
UrlPolicy = _utils.UrlPolicy
|
||||
# Download utilities
|
||||
DownloadOptions = _download.DownloadOptions
|
||||
DownloadError = _download.DownloadError
|
||||
DownloadMediaResult = _download.DownloadMediaResult
|
||||
download_media = _download.download_media
|
||||
is_url_supported_by_ytdlp = _download.is_url_supported_by_ytdlp
|
||||
probe_url = _download.probe_url
|
||||
# Hydrus utilities
|
||||
hydrus_request = _hydrus.hydrus_request
|
||||
hydrus_export = _hydrus.hydrus_export
|
||||
HydrusClient = _hydrus.HydrusClient
|
||||
HydrusRequestError = _hydrus.HydrusRequestError
|
||||
connect_ipc = _tasks.connect_ipc
|
||||
ipc_sender = _tasks.ipc_sender
|
||||
__all__ = [
|
||||
'decode_cbor',
|
||||
'jsonify',
|
||||
'CHUNK_SIZE',
|
||||
'ensure_directory',
|
||||
'unique_path',
|
||||
'download_hydrus_file',
|
||||
'sanitize_metadata_value',
|
||||
'unique_preserve_order',
|
||||
'sha256_file',
|
||||
'resolve_url_policy',
|
||||
'UrlPolicy',
|
||||
'ScreenshotError',
|
||||
'ScreenshotOptions',
|
||||
'ScreenshotResult',
|
||||
'capture_screenshot',
|
||||
'ScreenshotImportError',
|
||||
'DownloadOptions',
|
||||
'DownloadError',
|
||||
'DownloadMediaResult',
|
||||
'download_media',
|
||||
'is_url_supported_by_ytdlp',
|
||||
'probe_url',
|
||||
'HydrusClient',
|
||||
'HydrusRequestError',
|
||||
'hydrus_request',
|
||||
'hydrus_export',
|
||||
'connect_ipc',
|
||||
'ipc_sender',
|
||||
]
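
# Example (illustrative; not part of the committed file): consumers are meant
# to import the re-exported names from the package root rather than from the
# private submodules, e.g.
#
#     from helper import download_media, DownloadOptions, format_bytes
#
#     print(format_bytes(1_048_576))  # exact output format is defined in _utils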

helper/adjective.json (new file, 130 lines)
@@ -0,0 +1,130 @@
{
  "Occult": ["esoterica", "ritual", "alchemy", "magic", "hermetic", "divination", "grimoires", "symbolism", "ceremony"],
  "Philosophy": ["ethics", "metaphysics", "epistemology", "logic", "existentialism", "stoicism", "phenomenology", "dialectic", "aesthetics"],
  "Mystery": ["investigation", "crime", "detective", "noir", "thriller", "suspense", "conspiracy", "whodunit", "clues"],
  "Religion": ["scripture", "theology", "worship", "ritual", "doctrine", "faith", "tradition", "liturgy", "sacred"],
  "Mythology": ["gods", "creation", "heroes", "legends", "folklore", "pantheon", "epic", "mythic", "archetype"],
  "Science": ["research", "experiment", "theory", "biology", "physics", "chemistry", "data", "method", "innovation"],
  "Art": ["visual", "painting", "sculpture", "modernism", "technique", "studio", "curation", "expression", "composition"],
  "Literature": ["fiction", "poetry", "novel", "criticism", "narrative", "prose", "drama", "canonical", "translation"],
  "History": ["archaeology", "chronicle", "period", "empire", "revolution", "archive", "heritage", "historiography", "timeline"],
  "Psychology": ["cognition", "behavior", "therapy", "development", "neuroscience", "personality", "perception", "emotion", "motivation"],
  "gnostic": ["religion", "scripture", "gnostic", "gospel", "wisdom", "spirituality", "ancient", "philosophy", "esoteric", "mysticism", "mythology", "theology", "sacred", "divine", "apocrypha", "gnosticism"]
}

helper/alldebrid.py (new file, 829 lines)
@@ -0,0 +1,829 @@
"""AllDebrid API integration for converting free links to direct downloads.
|
||||
|
||||
AllDebrid is a debrid service that unlocks free file hosters and provides direct download links.
|
||||
API docs: https://docs.alldebrid.com/#general-informations
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
|
||||
from helper.logger import log, debug
|
||||
import time
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional, Set, List, Sequence
|
||||
from urllib.parse import urlencode, urlparse
|
||||
from .http_client import HTTPClient
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
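# Example (illustrative sketch; not part of the committed module): the typical
# link-unlock flow using the client defined below. The API key and hoster URL
# are placeholders.
#
#     from helper.alldebrid import AllDebridClient, AllDebridError
#
#     client = AllDebridClient("MY_API_KEY")
#     try:
#         direct = client.unlock_link("https://1fichier.com/?example")
#     except AllDebridError as exc:
#         print(f"unlock failed: {exc}")
#     else:
#         print(direct or "link was already unrestricted")
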
class AllDebridError(Exception):
|
||||
"""Raised when AllDebrid API request fails."""
|
||||
pass
|
||||
|
||||
|
||||
# Cache for supported hosters (domain -> host info)
|
||||
_SUPPORTED_HOSTERS_CACHE: Optional[Dict[str, Dict[str, Any]]] = None
|
||||
_CACHE_TIMESTAMP: float = 0
|
||||
_CACHE_DURATION: float = 3600 # 1 hour
|
||||
|
||||
|
||||
class AllDebridClient:
|
||||
"""Client for AllDebrid API."""
|
||||
|
||||
# Try both v4 and v3 APIs
|
||||
BASE_URLS = [
|
||||
"https://api.alldebrid.com/v4",
|
||||
"https://api.alldebrid.com/v3",
|
||||
]
|
||||
|
||||
def __init__(self, api_key: str):
|
||||
"""Initialize AllDebrid client with API key.
|
||||
|
||||
Args:
|
||||
api_key: AllDebrid API key from config
|
||||
"""
|
||||
self.api_key = api_key.strip()
|
||||
if not self.api_key:
|
||||
raise AllDebridError("AllDebrid API key is empty")
|
||||
self.base_url = self.BASE_URLS[0] # Start with v4
|
||||
|
||||
def _request(self, endpoint: str, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
|
||||
"""Make a request to AllDebrid API.
|
||||
|
||||
Args:
|
||||
endpoint: API endpoint (e.g., "user/profile", "link/unlock")
|
||||
params: Query parameters
|
||||
|
||||
Returns:
|
||||
Parsed JSON response
|
||||
|
||||
Raises:
|
||||
AllDebridError: If request fails or API returns error
|
||||
"""
|
||||
if params is None:
|
||||
params = {}
|
||||
|
||||
# Add API key to params
|
||||
params['apikey'] = self.api_key
|
||||
|
||||
url = f"{self.base_url}/{endpoint}"
|
||||
query_string = urlencode(params)
|
||||
full_url = f"{url}?{query_string}"
|
||||
|
||||
logger.debug(f"[AllDebrid] {endpoint} request to {full_url[:80]}...")
|
||||
|
||||
try:
|
||||
# Pass timeout to HTTPClient init, not to get()
|
||||
with HTTPClient(timeout=30.0, headers={'User-Agent': 'downlow/1.0'}) as client:
|
||||
try:
|
||||
response = client.get(full_url)
|
||||
response.raise_for_status()
|
||||
except Exception as req_err:
|
||||
# Log detailed error info
|
||||
logger.error(f"[AllDebrid] Request error to {full_url[:80]}: {req_err}", exc_info=True)
|
||||
if hasattr(req_err, 'response') and req_err.response is not None: # type: ignore
|
||||
try:
|
||||
error_body = req_err.response.content.decode('utf-8') # type: ignore
|
||||
logger.error(f"[AllDebrid] Response body: {error_body[:200]}")
|
||||
except Exception:
pass
|
||||
raise
|
||||
|
||||
data = json.loads(response.content.decode('utf-8'))
|
||||
logger.debug(f"[AllDebrid] Response status: {response.status_code}")
|
||||
|
||||
# Check for API errors
|
||||
if data.get('status') == 'error':
|
||||
error_msg = data.get('error', {}).get('message', 'Unknown error')
|
||||
logger.error(f"[AllDebrid] API error: {error_msg}")
|
||||
raise AllDebridError(f"AllDebrid API error: {error_msg}")
|
||||
|
||||
return data
|
||||
except AllDebridError:
|
||||
raise
|
||||
except Exception as exc:
|
||||
error_msg = f"AllDebrid request failed: {exc}"
|
||||
logger.error(f"[AllDebrid] {error_msg}", exc_info=True)
|
||||
raise AllDebridError(error_msg)
|
||||
|
||||
def unlock_link(self, link: str) -> Optional[str]:
|
||||
"""Unlock a restricted link and get direct download URL.
|
||||
|
||||
Args:
|
||||
link: Restricted link to unlock
|
||||
|
||||
Returns:
|
||||
Direct download URL, or None if already unrestricted
|
||||
|
||||
Raises:
|
||||
AllDebridError: If unlock fails
|
||||
"""
|
||||
if not link.startswith(('http://', 'https://')):
|
||||
raise AllDebridError(f"Invalid URL: {link}")
|
||||
|
||||
try:
|
||||
response = self._request('link/unlock', {'link': link})
|
||||
|
||||
# Check if unlock was successful
|
||||
if response.get('status') == 'success':
|
||||
data = response.get('data', {})
|
||||
|
||||
# AllDebrid returns the download info in 'link' field
|
||||
if 'link' in data:
|
||||
return data['link']
|
||||
|
||||
# Alternative: check for 'file' field
|
||||
if 'file' in data:
|
||||
return data['file']
|
||||
|
||||
# If no direct link, return the input link
|
||||
return link
|
||||
|
||||
return None
|
||||
except AllDebridError:
|
||||
raise
|
||||
except Exception as exc:
|
||||
raise AllDebridError(f"Failed to unlock link: {exc}")
|
||||
|
||||
def check_host(self, hostname: str) -> Dict[str, Any]:
|
||||
"""Check if a host is supported by AllDebrid.
|
||||
|
||||
Args:
|
||||
hostname: Hostname to check (e.g., "uploadhaven.com")
|
||||
|
||||
Returns:
|
||||
Host information dict with support status
|
||||
|
||||
Raises:
|
||||
AllDebridError: If request fails
|
||||
"""
|
||||
try:
|
||||
response = self._request('host', {'name': hostname})
|
||||
|
||||
if response.get('status') == 'success':
|
||||
return response.get('data', {})
|
||||
|
||||
return {}
|
||||
except AllDebridError:
|
||||
raise
|
||||
except Exception as exc:
|
||||
raise AllDebridError(f"Failed to check host: {exc}")
|
||||
|
||||
def get_user_info(self) -> Dict[str, Any]:
|
||||
"""Get current user account information.
|
||||
|
||||
Returns:
|
||||
User information dict
|
||||
|
||||
Raises:
|
||||
AllDebridError: If request fails
|
||||
"""
|
||||
try:
|
||||
response = self._request('user/profile')
|
||||
|
||||
if response.get('status') == 'success':
|
||||
return response.get('data', {})
|
||||
|
||||
return {}
|
||||
except AllDebridError:
|
||||
raise
|
||||
except Exception as exc:
|
||||
raise AllDebridError(f"Failed to get user info: {exc}")
|
||||
|
||||
def get_supported_hosters(self) -> Dict[str, Dict[str, Any]]:
|
||||
"""Get list of all supported hosters from AllDebrid API.
|
||||
|
||||
Returns:
|
||||
Dict mapping domain to host info (status, name, etc)
|
||||
|
||||
Raises:
|
||||
AllDebridError: If request fails
|
||||
"""
|
||||
try:
|
||||
response = self._request('hosts/domains')
|
||||
|
||||
if response.get('status') == 'success':
|
||||
data = response.get('data', {})
|
||||
# The API returns hosts keyed by domain
|
||||
return data if isinstance(data, dict) else {}
|
||||
|
||||
return {}
|
||||
except AllDebridError:
|
||||
raise
|
||||
except Exception as exc:
|
||||
raise AllDebridError(f"Failed to get supported hosters: {exc}")
|
||||
|
||||
def magnet_add(self, magnet_uri: str) -> Dict[str, Any]:
|
||||
"""Submit a magnet link or torrent hash to AllDebrid for processing.
|
||||
|
||||
AllDebrid will download the torrent content and store it in the account.
|
||||
Processing time varies based on torrent size and availability.
|
||||
|
||||
Args:
|
||||
magnet_uri: Magnet URI (magnet:?xt=urn:btih:...) or torrent hash
|
||||
|
||||
Returns:
|
||||
Dict with magnet info:
|
||||
- id: Magnet ID (int) - needed for status checks
|
||||
- name: Torrent name
|
||||
- hash: Torrent hash
|
||||
- size: Total file size (bytes)
|
||||
- ready: Boolean - True if already available
|
||||
|
||||
Raises:
|
||||
AllDebridError: If submit fails (requires premium, invalid magnet, etc)
|
||||
"""
|
||||
if not magnet_uri:
|
||||
raise AllDebridError("Magnet URI is empty")
|
||||
|
||||
try:
|
||||
# API endpoint: POST /v4/magnet/upload
|
||||
# Format: /magnet/upload?apikey=key&magnets[]=magnet:?xt=...
|
||||
response = self._request('magnet/upload', {'magnets[]': magnet_uri})
|
||||
|
||||
if response.get('status') == 'success':
|
||||
data = response.get('data', {})
|
||||
magnets = data.get('magnets', [])
|
||||
|
||||
if magnets and len(magnets) > 0:
|
||||
magnet_info = magnets[0]
|
||||
|
||||
# Check for errors in the magnet response
|
||||
if 'error' in magnet_info:
|
||||
error = magnet_info['error']
|
||||
error_msg = error.get('message', 'Unknown error')
|
||||
raise AllDebridError(f"Magnet error: {error_msg}")
|
||||
|
||||
return magnet_info
|
||||
|
||||
raise AllDebridError("No magnet data in response")
|
||||
|
||||
raise AllDebridError(f"API error: {response.get('error', 'Unknown')}")
|
||||
except AllDebridError:
|
||||
raise
|
||||
except Exception as exc:
|
||||
raise AllDebridError(f"Failed to submit magnet: {exc}")
|
||||
|
||||
def magnet_status(self, magnet_id: int, include_files: bool = False) -> Dict[str, Any]:
|
||||
"""Get status of a magnet currently being processed or stored.
|
||||
|
||||
Status codes:
|
||||
0-3: Processing (in queue, downloading, compressing, uploading)
|
||||
4: Ready (files available for download)
|
||||
5-15: Error (upload failed, not downloaded in 20min, too big, etc)
|
||||
|
||||
Args:
|
||||
magnet_id: Magnet ID from magnet_add()
|
||||
include_files: If True, includes file list in response
|
||||
|
||||
Returns:
|
||||
Dict with status info:
|
||||
- id: Magnet ID
|
||||
- filename: Torrent name
|
||||
- size: Total size (bytes)
|
||||
- status: Human-readable status
|
||||
- statusCode: Numeric code (0-15)
|
||||
- downloaded: Bytes downloaded so far
|
||||
- uploaded: Bytes uploaded so far
|
||||
- seeders: Number of seeders
|
||||
- downloadSpeed: Current speed (bytes/sec)
|
||||
- uploadSpeed: Current speed (bytes/sec)
|
||||
- files: (optional) Array of file objects when include_files=True
|
||||
Each file: {n: name, s: size, l: download_link}
|
||||
|
||||
Raises:
|
||||
AllDebridError: If status check fails
|
||||
"""
|
||||
if not isinstance(magnet_id, int) or magnet_id <= 0:
|
||||
raise AllDebridError(f"Invalid magnet ID: {magnet_id}")
|
||||
|
||||
try:
|
||||
# Use v4.1 endpoint for better response format
|
||||
# Temporarily override base_url for this request
|
||||
old_base = self.base_url
|
||||
self.base_url = "https://api.alldebrid.com/v4.1"
|
||||
|
||||
try:
|
||||
response = self._request('magnet/status', {'id': str(magnet_id)})
|
||||
finally:
|
||||
self.base_url = old_base
|
||||
|
||||
if response.get('status') == 'success':
|
||||
data = response.get('data', {})
|
||||
magnets = data.get('magnets', {})
|
||||
|
||||
# Handle both list and dict responses
|
||||
if isinstance(magnets, list) and len(magnets) > 0:
|
||||
return magnets[0]
|
||||
elif isinstance(magnets, dict) and magnets:
|
||||
return magnets
|
||||
|
||||
raise AllDebridError(f"No magnet found with ID {magnet_id}")
|
||||
|
||||
raise AllDebridError(f"API error: {response.get('error', 'Unknown')}")
|
||||
except AllDebridError:
|
||||
raise
|
||||
except Exception as exc:
|
||||
raise AllDebridError(f"Failed to get magnet status: {exc}")
|
||||
|
||||
def magnet_status_live(self, magnet_id: int, session: Optional[int] = None, counter: int = 0) -> Dict[str, Any]:
|
||||
"""Get live status of a magnet using delta sync mode.
|
||||
|
||||
The live mode endpoint provides real-time progress by only sending
|
||||
deltas (changed fields) instead of full status on each call. This
|
||||
reduces bandwidth and server load compared to regular polling.
|
||||
|
||||
Note: The "live" designation refers to the delta-sync mode where you
|
||||
maintain state locally and apply diffs from the API, not a streaming
|
||||
endpoint. Regular magnet_status() polling is simpler for single magnets.
|
||||
|
||||
Docs: https://docs.alldebrid.com/#get-status-live-mode
|
||||
|
||||
Args:
|
||||
magnet_id: Magnet ID from magnet_add()
|
||||
session: Session ID (use same ID across multiple calls). If None, will query current status
|
||||
counter: Counter value from previous response (starts at 0)
|
||||
|
||||
Returns:
|
||||
Dict with magnet status. May contain only changed fields if counter > 0.
|
||||
For single-magnet tracking, use magnet_status() instead.
|
||||
|
||||
Raises:
|
||||
AllDebridError: If request fails
|
||||
"""
|
||||
if not isinstance(magnet_id, int) or magnet_id <= 0:
|
||||
raise AllDebridError(f"Invalid magnet ID: {magnet_id}")
|
||||
|
||||
try:
|
||||
# For single magnet queries, just use regular endpoint with ID
|
||||
# The "live mode" with session/counter is for multi-magnet dashboards
|
||||
# where bandwidth savings from diffs matter
|
||||
response = self._request('magnet/status', {'id': str(magnet_id)})
|
||||
|
||||
if response.get('status') == 'success':
|
||||
data = response.get('data', {})
|
||||
magnets = data.get('magnets', [])
|
||||
|
||||
# Handle list response
|
||||
if isinstance(magnets, list) and len(magnets) > 0:
|
||||
return magnets[0]
|
||||
|
||||
raise AllDebridError(f"No magnet found with ID {magnet_id}")
|
||||
|
||||
raise AllDebridError(f"API error: {response.get('error', 'Unknown')}")
|
||||
except AllDebridError:
|
||||
raise
|
||||
except Exception as exc:
|
||||
raise AllDebridError(f"Failed to get magnet live status: {exc}")
|
||||
|
||||
def magnet_links(self, magnet_ids: list) -> Dict[str, Any]:
|
||||
"""Get files and download links for one or more magnets.
|
||||
|
||||
Use this after magnet_status shows statusCode == 4 (Ready).
|
||||
Returns the file tree structure with direct download links.
|
||||
|
||||
Args:
|
||||
magnet_ids: List of magnet IDs to get files for
|
||||
|
||||
Returns:
|
||||
Dict mapping magnet_id (as string) -> magnet_info:
|
||||
- id: Magnet ID
|
||||
- files: Array of file/folder objects
|
||||
File: {n: name, s: size, l: direct_download_link}
|
||||
Folder: {n: name, e: [sub_items]}
|
||||
|
||||
Raises:
|
||||
AllDebridError: If request fails
|
||||
"""
|
||||
if not magnet_ids:
|
||||
raise AllDebridError("No magnet IDs provided")
|
||||
|
||||
try:
|
||||
# Build parameter: id[]=123&id[]=456 style
|
||||
params = {}
|
||||
for i, magnet_id in enumerate(magnet_ids):
|
||||
params[f'id[{i}]'] = str(magnet_id)
|
||||
|
||||
response = self._request('magnet/files', params)
|
||||
|
||||
if response.get('status') == 'success':
|
||||
data = response.get('data', {})
|
||||
magnets = data.get('magnets', [])
|
||||
|
||||
# Convert list to dict keyed by ID (as string) for easier access
|
||||
result = {}
|
||||
for magnet_info in magnets:
|
||||
magnet_id = magnet_info.get('id')
|
||||
if magnet_id:
|
||||
result[str(magnet_id)] = magnet_info
|
||||
|
||||
return result
|
||||
|
||||
raise AllDebridError(f"API error: {response.get('error', 'Unknown')}")
|
||||
except AllDebridError:
|
||||
raise
|
||||
except Exception as exc:
|
||||
raise AllDebridError(f"Failed to get magnet files: {exc}")
|
||||
|
||||
def instant_available(self, magnet_hash: str) -> Optional[List[Dict[str, Any]]]:
|
||||
"""Check if magnet is available for instant streaming without downloading.
|
||||
|
||||
AllDebrid's "instant" feature checks if a magnet can be streamed directly
|
||||
without downloading all the data. Returns available video/audio files.
|
||||
|
||||
Args:
|
||||
magnet_hash: Torrent hash (with or without magnet: prefix)
|
||||
|
||||
Returns:
|
||||
List of available files for streaming, or None if not available
|
||||
Each file: {n: name, s: size, e: extension, t: type}
|
||||
Returns empty list if torrent not found or not available
|
||||
|
||||
Raises:
|
||||
AllDebridError: If API request fails
|
||||
"""
|
||||
try:
|
||||
# Parse magnet hash if needed
|
||||
if magnet_hash.startswith('magnet:'):
|
||||
# Extract hash from magnet URI
|
||||
import re
|
||||
match = re.search(r'xt=urn:btih:([a-fA-F0-9]+)', magnet_hash)
|
||||
if not match:
|
||||
return None
|
||||
hash_value = match.group(1)
|
||||
else:
|
||||
hash_value = magnet_hash.strip()
|
||||
|
||||
if not hash_value or len(hash_value) < 32:
|
||||
return None
|
||||
|
||||
response = self._request('magnet/instant', {'magnet': hash_value})
|
||||
|
||||
if response.get('status') == 'success':
|
||||
data = response.get('data', {})
|
||||
# Returns 'files' array if available, or empty
|
||||
return data.get('files', [])
|
||||
|
||||
# Not available is not an error, just return empty list
|
||||
return []
|
||||
|
||||
except AllDebridError:
|
||||
raise
|
||||
except Exception as exc:
|
||||
logger.debug(f"[AllDebrid] instant_available check failed: {exc}")
|
||||
return None
|
||||
|
||||
def magnet_delete(self, magnet_id: int) -> bool:
|
||||
"""Delete a magnet from the AllDebrid account.
|
||||
|
||||
Args:
|
||||
magnet_id: Magnet ID to delete
|
||||
|
||||
Returns:
|
||||
True if deletion was successful
|
||||
|
||||
Raises:
|
||||
AllDebridError: If deletion fails
|
||||
"""
|
||||
if not isinstance(magnet_id, int) or magnet_id <= 0:
|
||||
raise AllDebridError(f"Invalid magnet ID: {magnet_id}")
|
||||
|
||||
try:
|
||||
response = self._request('magnet/delete', {'id': str(magnet_id)})
|
||||
|
||||
if response.get('status') == 'success':
|
||||
return True
|
||||
|
||||
raise AllDebridError(f"API error: {response.get('error', 'Unknown')}")
|
||||
except AllDebridError:
|
||||
raise
|
||||
except Exception as exc:
|
||||
raise AllDebridError(f"Failed to delete magnet: {exc}")
|
||||
|
||||
|
||||
def _get_cached_supported_hosters(api_key: str) -> Set[str]:
|
||||
"""Get cached list of supported hoster domains.
|
||||
|
||||
Uses AllDebrid API to fetch the list once per hour,
|
||||
caching the result to avoid repeated API calls.
|
||||
|
||||
Args:
|
||||
api_key: AllDebrid API key
|
||||
|
||||
Returns:
|
||||
Set of supported domain names (lowercased)
|
||||
"""
|
||||
global _SUPPORTED_HOSTERS_CACHE, _CACHE_TIMESTAMP
|
||||
|
||||
now = time.time()
|
||||
|
||||
# Return cached result if still valid
|
||||
if _SUPPORTED_HOSTERS_CACHE is not None and (now - _CACHE_TIMESTAMP) < _CACHE_DURATION:
|
||||
return set(_SUPPORTED_HOSTERS_CACHE.keys())
|
||||
|
||||
# Fetch fresh list from API
|
||||
try:
|
||||
client = AllDebridClient(api_key)
|
||||
hosters_dict = client.get_supported_hosters()
|
||||
|
||||
if hosters_dict:
|
||||
# API returns: hosts (list), streams (list), redirectors (list)
|
||||
# Combine all into a single set
|
||||
all_domains: Set[str] = set()
|
||||
|
||||
# Add hosts
|
||||
if 'hosts' in hosters_dict and isinstance(hosters_dict['hosts'], list):
|
||||
all_domains.update(hosters_dict['hosts'])
|
||||
|
||||
# Add streams
|
||||
if 'streams' in hosters_dict and isinstance(hosters_dict['streams'], list):
|
||||
all_domains.update(hosters_dict['streams'])
|
||||
|
||||
# Add redirectors
|
||||
if 'redirectors' in hosters_dict and isinstance(hosters_dict['redirectors'], list):
|
||||
all_domains.update(hosters_dict['redirectors'])
|
||||
|
||||
# Cache as dict for consistency
|
||||
_SUPPORTED_HOSTERS_CACHE = {domain: {} for domain in all_domains}
|
||||
_CACHE_TIMESTAMP = now
|
||||
|
||||
if all_domains:
|
||||
debug(f"✓ Cached {len(all_domains)} supported hosters")
|
||||
|
||||
return all_domains
|
||||
except Exception as exc:
|
||||
log(f"⚠ Failed to fetch supported hosters: {exc}", file=sys.stderr)
|
||||
# Return any cached hosters even if expired
|
||||
if _SUPPORTED_HOSTERS_CACHE:
|
||||
return set(_SUPPORTED_HOSTERS_CACHE.keys())
|
||||
|
||||
# Fallback: empty set if no cache available
|
||||
return set()
|
||||
|
||||
|
||||
def is_link_restrictable_hoster(url: str, api_key: str) -> bool:
|
||||
"""Check if a URL is from a hoster that AllDebrid can unlock.
|
||||
|
||||
Intelligently queries the AllDebrid API to detect if the URL is
|
||||
from a supported restricted hoster.
|
||||
|
||||
Args:
|
||||
url: URL to check
|
||||
api_key: AllDebrid API key
|
||||
|
||||
Returns:
|
||||
True if URL is from a supported restrictable hoster
|
||||
"""
|
||||
if not url or not api_key:
|
||||
return False
|
||||
|
||||
try:
|
||||
# Extract domain from URL
|
||||
parsed = urlparse(url)
|
||||
domain = parsed.netloc.lower()
|
||||
|
||||
# Remove www. prefix for comparison
|
||||
if domain.startswith('www.'):
|
||||
domain = domain[4:]
|
||||
|
||||
# Get supported hosters (cached)
|
||||
supported = _get_cached_supported_hosters(api_key)
|
||||
|
||||
if not supported:
|
||||
# API check failed, fall back to manual detection
|
||||
# Check for common restricted hosters
|
||||
common_hosters = {
|
||||
'uploadhaven.com', 'uploaded.to', 'uploaded.net',
|
||||
'datafile.com', 'rapidfile.io', 'nitroflare.com',
|
||||
'1fichier.com', 'mega.nz', 'mediafire.com'
|
||||
}
|
||||
return any(host in url.lower() for host in common_hosters)
|
||||
|
||||
# Check if domain is in supported list
|
||||
# Need to check exact match and with/without www
|
||||
return domain in supported or f"www.{domain}" in supported
|
||||
except Exception as exc:
|
||||
log(f"⚠ Hoster detection failed: {exc}", file=sys.stderr)
|
||||
return False
|
||||
|
||||
|
||||
def convert_link_with_debrid(link: str, api_key: str) -> Optional[str]:
|
||||
"""Convert a restricted link to a direct download URL using AllDebrid.
|
||||
|
||||
Args:
|
||||
link: Restricted link
|
||||
api_key: AllDebrid API key
|
||||
|
||||
Returns:
|
||||
Direct download URL, or original link if already unrestricted
|
||||
"""
|
||||
if not api_key:
|
||||
return None
|
||||
|
||||
try:
|
||||
client = AllDebridClient(api_key)
|
||||
direct_link = client.unlock_link(link)
|
||||
|
||||
if direct_link and direct_link != link:
|
||||
debug(f"✓ Converted link: {link[:60]}... → {direct_link[:60]}...")
|
||||
return direct_link
|
||||
|
||||
return None
|
||||
except AllDebridError as exc:
|
||||
log(f"⚠ Failed to convert link: {exc}", file=sys.stderr)
|
||||
return None
|
||||
except Exception as exc:
|
||||
log(f"⚠ Unexpected error: {exc}", file=sys.stderr)
|
||||
return None
|
||||
|
||||
|
||||
def is_magnet_link(uri: str) -> bool:
|
||||
"""Check if a URI is a magnet link.
|
||||
|
||||
Magnet links start with 'magnet:?xt=urn:btih:' or just 'magnet:'
|
||||
|
||||
Args:
|
||||
uri: URI to check
|
||||
|
||||
Returns:
|
||||
True if URI is a magnet link
|
||||
"""
|
||||
if not uri:
|
||||
return False
|
||||
return uri.lower().startswith('magnet:')
|
||||
|
||||
|
||||
def is_torrent_hash(text: str) -> bool:
|
||||
"""Check if text looks like a torrent hash (40 or 64 hex characters).
|
||||
|
||||
Common formats:
|
||||
- Info hash v1: 40 hex chars (SHA-1)
|
||||
- Info hash v2: 64 hex chars (SHA-256)
|
||||
|
||||
Args:
|
||||
text: Text to check
|
||||
|
||||
Returns:
|
||||
True if text matches torrent hash format
|
||||
"""
|
||||
if not text or not isinstance(text, str):
|
||||
return False
|
||||
|
||||
text = text.strip()
|
||||
|
||||
# Check if it's 40 hex chars (SHA-1) or 64 hex chars (SHA-256)
|
||||
if len(text) not in (40, 64):
|
||||
return False
|
||||
|
||||
try:
|
||||
# Try to parse as hex
|
||||
int(text, 16)
|
||||
return True
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
|
||||
def is_torrent_file(path: str) -> bool:
|
||||
"""Check if a file path is a .torrent file.
|
||||
|
||||
Args:
|
||||
path: File path to check
|
||||
|
||||
Returns:
|
||||
True if file has .torrent extension
|
||||
"""
|
||||
if not path:
|
||||
return False
|
||||
return path.lower().endswith('.torrent')
|
||||
|
||||
|
||||
def parse_magnet_or_hash(uri: str) -> Optional[str]:
|
||||
"""Parse a magnet URI or hash into a format for AllDebrid API.
|
||||
|
||||
AllDebrid's magnet/upload endpoint accepts:
|
||||
- Full magnet URIs: magnet:?xt=urn:btih:...
|
||||
- Info hashes: 40 or 64 hex characters
|
||||
|
||||
Args:
|
||||
uri: Magnet URI or hash
|
||||
|
||||
Returns:
|
||||
Normalized input for AllDebrid API, or None if invalid
|
||||
"""
|
||||
if not uri:
|
||||
return None
|
||||
|
||||
uri = uri.strip()
|
||||
|
||||
# Already a magnet link - just return it
|
||||
if is_magnet_link(uri):
|
||||
return uri
|
||||
|
||||
# Check if it's a valid hash
|
||||
if is_torrent_hash(uri):
|
||||
return uri
|
||||
|
||||
# Not a recognized format
|
||||
return None
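
# Example (illustrative; not part of the committed module): how the helpers
# above classify typical inputs.
#
#     is_magnet_link("magnet:?xt=urn:btih:abcdef...")   # True
#     is_torrent_hash("a" * 40)                         # True (v1 info-hash length, hex)
#     is_torrent_file("/downloads/linux.iso.torrent")   # True
#     parse_magnet_or_hash("not-a-magnet")              # None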
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Cmdlet: unlock_link
|
||||
# ============================================================================
|
||||
|
||||
def unlock_link_cmdlet(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
"""Unlock a restricted link using AllDebrid.
|
||||
|
||||
Converts free hosters and restricted links to direct download URLs.
|
||||
|
||||
Usage:
|
||||
unlock-link <link>
|
||||
unlock-link # Uses URL from pipeline result
|
||||
|
||||
Requires:
|
||||
- AllDebrid API key in config under Debrid.All-debrid
|
||||
|
||||
Args:
|
||||
result: Pipeline result object
|
||||
args: Command arguments
|
||||
config: Configuration dictionary
|
||||
|
||||
Returns:
|
||||
0 on success, 1 on failure
|
||||
"""
|
||||
try:
|
||||
from .link_utils import (
|
||||
extract_link,
|
||||
get_api_key,
|
||||
add_direct_link_to_result,
|
||||
)
|
||||
except ImportError as e:
|
||||
log(f"Required modules unavailable: {e}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Get link from args or result
|
||||
link = extract_link(result, args)
|
||||
|
||||
if not link:
|
||||
log("No valid URL provided", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Get AllDebrid API key from config
|
||||
api_key = get_api_key(config, "AllDebrid", "Debrid.All-debrid")
|
||||
|
||||
if not api_key:
|
||||
log("AllDebrid API key not configured in Debrid.All-debrid", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Try to unlock the link
|
||||
debug(f"Unlocking: {link}")
|
||||
direct_link = convert_link_with_debrid(link, api_key)
|
||||
|
||||
if direct_link:
|
||||
debug(f"✓ Direct link: {direct_link}")
|
||||
|
||||
# Update result with direct link
|
||||
add_direct_link_to_result(result, direct_link, link)
|
||||
|
||||
# Return the updated result via pipeline context
|
||||
# Note: The cmdlet wrapper will handle emitting to pipeline
|
||||
return 0
|
||||
else:
|
||||
log(f"❌ Failed to unlock link or already unrestricted", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Cmdlet Registration
|
||||
# ============================================================================
|
||||
|
||||
def _register_unlock_link():
|
||||
"""Register unlock-link command with cmdlet registry if available."""
|
||||
try:
|
||||
from cmdlets import register
|
||||
|
||||
@register(["unlock-link"])
|
||||
def unlock_link_wrapper(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
"""Wrapper to make unlock_link_cmdlet available as cmdlet."""
|
||||
import pipeline as ctx
|
||||
|
||||
ret_code = unlock_link_cmdlet(result, args, config)
|
||||
|
||||
# If successful, emit the result
|
||||
if ret_code == 0:
|
||||
ctx.emit(result)
|
||||
|
||||
return ret_code
|
||||
|
||||
return unlock_link_wrapper
|
||||
except ImportError:
|
||||
# If cmdlets module not available, just return None
|
||||
return None
|
||||
|
||||
|
||||
# Register when module is imported
|
||||
_unlock_link_registration = _register_unlock_link()

helper/archive_client.py (new file, 567 lines)
@@ -0,0 +1,567 @@
"""Archive.org API client for borrowing and downloading books.
|
||||
|
||||
This module provides low-level functions for interacting with Archive.org:
|
||||
- Authentication (login, credential management)
|
||||
- Borrowing (loan, return_loan)
|
||||
- Book metadata extraction (get_book_infos, get_book_metadata)
|
||||
- Image downloading and deobfuscation
|
||||
- PDF creation with metadata
|
||||
|
||||
Used by unified_book_downloader.py for the borrowing workflow.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from concurrent import futures
|
||||
from typing import Any, Dict, List, Optional, Sequence, Tuple
|
||||
|
||||
import requests
|
||||
|
||||
from helper.logger import log, debug
|
||||
|
||||
try:
|
||||
from Crypto.Cipher import AES # type: ignore
|
||||
from Crypto.Util import Counter # type: ignore
|
||||
except ImportError:
|
||||
AES = None # type: ignore
|
||||
Counter = None # type: ignore
|
||||
|
||||
try:
|
||||
from tqdm import tqdm # type: ignore
|
||||
except ImportError:
|
||||
tqdm = None # type: ignore
|
||||
|
||||
|
||||
def credential_openlibrary(config: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
|
||||
"""Get OpenLibrary/Archive.org email and password from config.
|
||||
|
||||
Supports both formats:
|
||||
- New: {"provider": {"openlibrary": {"email": "...", "password": "..."}}}
|
||||
- Old: {"Archive": {"email": "...", "password": "..."}}
|
||||
{"archive_org_email": "...", "archive_org_password": "..."}
|
||||
|
||||
Returns: (email, password) tuple, each can be None
|
||||
"""
|
||||
if not isinstance(config, dict):
|
||||
return None, None
|
||||
|
||||
# Try new format first
|
||||
provider_config = config.get("provider", {})
|
||||
if isinstance(provider_config, dict):
|
||||
openlibrary_config = provider_config.get("openlibrary", {})
|
||||
if isinstance(openlibrary_config, dict):
|
||||
email = openlibrary_config.get("email")
|
||||
password = openlibrary_config.get("password")
|
||||
if email or password:
|
||||
return email, password
|
||||
|
||||
# Try old nested format
|
||||
archive_config = config.get("Archive")
|
||||
if isinstance(archive_config, dict):
|
||||
email = archive_config.get("email")
|
||||
password = archive_config.get("password")
|
||||
if email or password:
|
||||
return email, password
|
||||
|
||||
# Fall back to old flat format
|
||||
email = config.get("archive_org_email")
|
||||
password = config.get("archive_org_password")
|
||||
return email, password
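
# Example (illustrative; not part of the committed module): the two config
# shapes this helper accepts, per the docstring above. Values are placeholders.
#
#     new_style = {"provider": {"openlibrary": {"email": "user@example.com", "password": "secret"}}}
#     old_style = {"archive_org_email": "user@example.com", "archive_org_password": "secret"}
#     assert credential_openlibrary(new_style) == ("user@example.com", "secret")
#     assert credential_openlibrary(old_style) == ("user@example.com", "secret")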
|
||||
|
||||
|
||||
def display_error(response: requests.Response, message: str) -> None:
|
||||
"""Display error and exit."""
|
||||
log(message, file=sys.stderr)
|
||||
log(response.text, file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def login(email: str, password: str) -> requests.Session:
|
||||
"""Login to archive.org.
|
||||
|
||||
Args:
|
||||
email: Archive.org email
|
||||
password: Archive.org password
|
||||
|
||||
Returns:
|
||||
Authenticated requests.Session
|
||||
|
||||
Raises:
|
||||
SystemExit on login failure
|
||||
"""
|
||||
session = requests.Session()
|
||||
session.get("https://archive.org/account/login", timeout=30)
|
||||
|
||||
data = {"username": email, "password": password}
|
||||
response = session.post("https://archive.org/account/login", data=data, timeout=30)
|
||||
|
||||
if "bad_login" in response.text:
|
||||
log("Invalid credentials!", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
if "Successful login" in response.text:
|
||||
debug("Successful login")
|
||||
return session
|
||||
display_error(response, "[-] Error while login:")
|
||||
sys.exit(1) # Unreachable but satisfies type checker
|
||||
|
||||
|
||||
def loan(session: requests.Session, book_id: str, verbose: bool = True) -> requests.Session:
|
||||
"""Borrow a book from archive.org (14-day loan).
|
||||
|
||||
Args:
|
||||
session: Authenticated requests.Session from login()
|
||||
book_id: Archive.org book identifier (e.g., 'ia_book_id')
|
||||
verbose: Whether to log messages
|
||||
|
||||
Returns:
|
||||
Session with active loan
|
||||
|
||||
Raises:
|
||||
SystemExit on loan failure
|
||||
"""
|
||||
data = {"action": "grant_access", "identifier": book_id}
|
||||
response = session.post("https://archive.org/services/loans/loan/searchInside.php", data=data, timeout=30)
|
||||
data["action"] = "browse_book"
|
||||
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
|
||||
|
||||
if response.status_code == 400:
|
||||
try:
|
||||
if response.json()["error"] == "This book is not available to borrow at this time. Please try again later.":
|
||||
debug("This book doesn't need to be borrowed")
|
||||
return session
|
||||
display_error(response, "Something went wrong when trying to borrow the book.")
|
||||
except Exception:
|
||||
display_error(response, "The book cannot be borrowed")
|
||||
|
||||
data["action"] = "create_token"
|
||||
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
|
||||
|
||||
if "token" in response.text:
|
||||
if verbose:
|
||||
debug("Successful loan")
|
||||
return session
|
||||
display_error(response, "Something went wrong when trying to borrow the book.")
|
||||
sys.exit(1) # Unreachable but satisfies type checker
|
||||
|
||||
|
||||
def return_loan(session: requests.Session, book_id: str) -> None:
|
||||
"""Return a borrowed book.
|
||||
|
||||
Args:
|
||||
session: Authenticated requests.Session with active loan
|
||||
book_id: Archive.org book identifier
|
||||
"""
|
||||
data = {"action": "return_loan", "identifier": book_id}
|
||||
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
|
||||
if response.status_code == 200 and response.json()["success"]:
|
||||
debug("Book returned")
|
||||
else:
|
||||
display_error(response, "Something went wrong when trying to return the book")
|
||||
|
||||
|
||||
def get_book_infos(session: requests.Session, url: str) -> Tuple[str, List[str], Dict[str, Any]]:
|
||||
"""Extract book information and page links from archive.org viewer.
|
||||
|
||||
Args:
|
||||
session: Authenticated requests.Session
|
||||
url: Book URL (e.g., https://archive.org/borrow/book_id or /details/book_id)
|
||||
|
||||
Returns:
|
||||
Tuple of (title, page_links, metadata)
|
||||
|
||||
Raises:
|
||||
RuntimeError: If page data cannot be extracted
|
||||
"""
|
||||
r = session.get(url, timeout=30).text
|
||||
|
||||
# Try to extract the infos URL from the response
|
||||
try:
|
||||
# Look for the "url" field in the response
|
||||
if '"url":"' not in r:
|
||||
raise ValueError("No 'url' field found in response")
|
||||
infos_url = "https:" + r.split('"url":"')[1].split('"')[0].replace("\\u0026", "&")
|
||||
except (IndexError, ValueError) as e:
|
||||
# If URL extraction fails, raise with better error message
|
||||
raise RuntimeError(f"Failed to extract book info URL from response: {e}")
|
||||
|
||||
response = session.get(infos_url, timeout=30)
|
||||
data = response.json()["data"]
|
||||
title = data["brOptions"]["bookTitle"].strip().replace(" ", "_")
|
||||
title = "".join(c for c in title if c not in '<>:"/\\|?*') # Filter forbidden chars
|
||||
title = title[:150] # Trim to avoid long file names
|
||||
metadata = data["metadata"]
|
||||
links = []
|
||||
|
||||
# Safely extract page links from brOptions data
|
||||
try:
|
||||
br_data = data.get("brOptions", {}).get("data", [])
|
||||
for item in br_data:
|
||||
if isinstance(item, list):
|
||||
for page in item:
|
||||
if isinstance(page, dict) and "uri" in page:
|
||||
links.append(page["uri"])
|
||||
elif isinstance(item, dict) and "uri" in item:
|
||||
links.append(item["uri"])
|
||||
except (KeyError, IndexError, TypeError) as e:
|
||||
log(f"Warning: Error parsing page links: {e}", file=sys.stderr)
|
||||
# Continue with whatever links we found
|
||||
|
||||
if len(links) > 1:
|
||||
debug(f"Found {len(links)} pages")
|
||||
return title, links, metadata
|
||||
elif len(links) == 1:
|
||||
debug(f"Found {len(links)} page")
|
||||
return title, links, metadata
|
||||
else:
|
||||
log("Error while getting image links - no pages found", file=sys.stderr)
|
||||
raise RuntimeError("No pages found in book data")
|
||||
|
||||
|
||||
def image_name(pages: int, page: int, directory: str) -> str:
|
||||
"""Generate image filename for page.
|
||||
|
||||
Args:
|
||||
pages: Total number of pages
|
||||
page: Current page number (0-indexed)
|
||||
directory: Directory to save to
|
||||
|
||||
Returns:
|
||||
Full path to image file
|
||||
"""
|
||||
return f"{directory}/{(len(str(pages)) - len(str(page))) * '0'}{page}.jpg"
|
||||
|
||||
|
||||
def deobfuscate_image(image_data: bytes, link: str, obf_header: str) -> bytes:
|
||||
"""Decrypt obfuscated image data using AES-CTR.
|
||||
|
||||
This handles Archive.org's image obfuscation for borrowed books.
|
||||
Based on: https://github.com/justimm
|
||||
|
||||
Args:
|
||||
image_data: Encrypted image bytes
|
||||
link: Image URL (used to derive AES key)
|
||||
obf_header: X-Obfuscate header value (format: "1|BASE64_COUNTER")
|
||||
|
||||
Returns:
|
||||
Decrypted image bytes
|
||||
"""
|
||||
if not AES or not Counter:
|
||||
raise RuntimeError("Crypto library not available")
|
||||
|
||||
try:
|
||||
version, counter_b64 = obf_header.split("|")
|
||||
except Exception as e:
|
||||
raise ValueError("Invalid X-Obfuscate header format") from e
|
||||
|
||||
if version != "1":
|
||||
raise ValueError("Unsupported obfuscation version: " + version)
|
||||
|
||||
# Derive AES key from URL
|
||||
aesKey = re.sub(r"^https?:\/\/.*?\/", "/", link)
|
||||
sha1_digest = hashlib.sha1(aesKey.encode("utf-8")).digest()
|
||||
key = sha1_digest[:16]
|
||||
|
||||
# Decode counter
|
||||
counter_bytes = base64.b64decode(counter_b64)
|
||||
if len(counter_bytes) != 16:
|
||||
raise ValueError(f"Expected counter to be 16 bytes, got {len(counter_bytes)}")
|
||||
|
||||
prefix = counter_bytes[:8]
|
||||
initial_value = int.from_bytes(counter_bytes[8:], byteorder="big")
|
||||
|
||||
# Create AES-CTR cipher
|
||||
ctr = Counter.new(64, prefix=prefix, initial_value=initial_value, little_endian=False) # type: ignore
|
||||
cipher = AES.new(key, AES.MODE_CTR, counter=ctr) # type: ignore
|
||||
|
||||
decrypted_part = cipher.decrypt(image_data[:1024])
|
||||
new_data = decrypted_part + image_data[1024:]
|
||||
return new_data
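
# Example (illustrative sketch; not part of the committed module): how this
# helper is called for a page response carrying the X-Obfuscate header, as
# download_one_image() below does. Requires pycryptodome (AES/Counter).
#
#     resp = session.get(page_url, timeout=30)
#     header = resp.headers.get("X-Obfuscate")
#     data = deobfuscate_image(resp.content, page_url, header) if header else resp.content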
|
||||
|
||||
|
||||
def download_one_image(
|
||||
session: requests.Session,
|
||||
link: str,
|
||||
i: int,
|
||||
directory: str,
|
||||
book_id: str,
|
||||
pages: int,
|
||||
) -> None:
|
||||
"""Download a single book page image.
|
||||
|
||||
Handles obfuscated images and re-borrowing on 403 errors.
|
||||
|
||||
Args:
|
||||
session: Authenticated requests.Session
|
||||
link: Direct image URL
|
||||
i: Page index (0-based)
|
||||
directory: Directory to save to
|
||||
book_id: Archive.org book ID (for re-borrowing on 403)
|
||||
pages: Total number of pages
|
||||
"""
|
||||
headers = {
|
||||
"Referer": "https://archive.org/",
|
||||
"Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
|
||||
"Sec-Fetch-Site": "same-site",
|
||||
"Sec-Fetch-Mode": "no-cors",
|
||||
"Sec-Fetch-Dest": "image",
|
||||
}
|
||||
retry = True
|
||||
response = None
|
||||
while retry:
|
||||
try:
|
||||
response = session.get(link, headers=headers, timeout=30)
|
||||
if response.status_code == 403:
|
||||
session = loan(session, book_id, verbose=False)
|
||||
raise Exception("Borrow again")
|
||||
if response.status_code == 200:
|
||||
retry = False
|
||||
except Exception:
|
||||
time.sleep(1)
|
||||
|
||||
image = image_name(pages, i, directory)
|
||||
|
||||
if response is None:
|
||||
log(f"Failed to download page {i}", file=sys.stderr)
|
||||
return
|
||||
|
||||
obf_header = response.headers.get("X-Obfuscate")
|
||||
image_content = None
|
||||
if obf_header:
|
||||
try:
|
||||
image_content = deobfuscate_image(response.content, link, obf_header)
|
||||
except Exception as e:
|
||||
log(f"Deobfuscation failed: {e}", file=sys.stderr)
|
||||
return
|
||||
else:
|
||||
image_content = response.content
|
||||
|
||||
with open(image, "wb") as f:
|
||||
f.write(image_content)
|
||||
|
||||
|
||||
def download(
|
||||
session: requests.Session,
|
||||
n_threads: int,
|
||||
directory: str,
|
||||
links: List[str],
|
||||
scale: int,
|
||||
book_id: str,
|
||||
) -> List[str]:
|
||||
"""Download all book pages as images.
|
||||
|
||||
Uses thread pool for parallel downloads.
|
||||
|
||||
Args:
|
||||
session: Authenticated requests.Session
|
||||
n_threads: Number of download threads
|
||||
directory: Directory to save images to
|
||||
links: List of image URLs
|
||||
scale: Image resolution (0=highest, 10=lowest)
|
||||
book_id: Archive.org book ID (for re-borrowing)
|
||||
|
||||
Returns:
|
||||
List of downloaded image file paths
|
||||
"""
|
||||
debug("Downloading pages...")
|
||||
links = [f"{link}&rotate=0&scale={scale}" for link in links]
|
||||
pages = len(links)
|
||||
|
||||
tasks = []
|
||||
with futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
|
||||
for i, link in enumerate(links):
|
||||
tasks.append(
|
||||
executor.submit(
|
||||
download_one_image,
|
||||
session=session,
|
||||
link=link,
|
||||
i=i,
|
||||
directory=directory,
|
||||
book_id=book_id,
|
||||
pages=pages,
|
||||
)
|
||||
)
|
||||
if tqdm:
|
||||
for _ in tqdm(futures.as_completed(tasks), total=len(tasks)): # type: ignore
|
||||
pass
|
||||
else:
|
||||
for _ in futures.as_completed(tasks):
|
||||
pass
|
||||
|
||||
images = [image_name(pages, i, directory) for i in range(len(links))]
|
||||
return images
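
# Example (illustrative sketch; not part of the committed module): the
# end-to-end borrow-and-download flow these functions implement. Credentials,
# book ID, and target directory are placeholders; the directory must already
# exist, and scale=3 is an arbitrary mid-range resolution.
#
#     session = login("user@example.com", "secret")
#     book_id = "someIdentifier"
#     session = loan(session, book_id)
#     title, links, metadata = get_book_infos(session, f"https://archive.org/details/{book_id}")
#     images = download(session, n_threads=8, directory="/tmp/" + title,
#                       links=links, scale=3, book_id=book_id)
#     return_loan(session, book_id)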
|
||||
|
||||
|
||||
def check_direct_download(book_id: str) -> Tuple[bool, str]:
|
||||
"""Check if a book can be downloaded directly without borrowing.
|
||||
|
||||
Searches Archive.org metadata for downloadable PDF files.
|
||||
|
||||
Args:
|
||||
book_id: Archive.org book identifier
|
||||
|
||||
Returns:
|
||||
Tuple of (can_download: bool, pdf_url: str)
|
||||
"""
|
||||
try:
|
||||
# First, try to get the metadata to find the actual PDF filename
|
||||
metadata_url = f"https://archive.org/metadata/{book_id}"
|
||||
response = requests.get(metadata_url, timeout=10)
|
||||
response.raise_for_status()
|
||||
metadata = response.json()
|
||||
|
||||
# Find PDF file in files list
|
||||
if "files" in metadata:
|
||||
for file_info in metadata["files"]:
|
||||
filename = file_info.get("name", "")
|
||||
if filename.endswith(".pdf") and file_info.get("source") == "original":
|
||||
# Found the original PDF
|
||||
pdf_filename = filename
|
||||
pdf_url = f"https://archive.org/download/{book_id}/{pdf_filename.replace(' ', '%20')}"
|
||||
|
||||
# Verify it's accessible
|
||||
check_response = requests.head(pdf_url, timeout=5, allow_redirects=True)
|
||||
if check_response.status_code == 200:
|
||||
return True, pdf_url
|
||||
|
||||
return False, ""
|
||||
|
||||
except Exception as e:
|
||||
log(f"Error checking direct download: {e}", file=sys.stderr)
|
||||
return False, ""
|
||||
|
||||
|
||||
def get_openlibrary_by_isbn(isbn: str) -> Dict[str, Any]:
|
||||
"""Fetch book data from OpenLibrary using ISBN.
|
||||
|
||||
Args:
|
||||
isbn: ISBN-10 or ISBN-13 to search for
|
||||
|
||||
Returns:
|
||||
Dictionary with book metadata from OpenLibrary
|
||||
"""
|
||||
try:
|
||||
# Try ISBN API first
|
||||
api_url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn}&jscmd=data&format=json"
|
||||
response = requests.get(api_url, timeout=10)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
if data:
|
||||
# Get first result
|
||||
key = list(data.keys())[0]
|
||||
return data[key]
|
||||
return {}
|
||||
except Exception as e:
|
||||
log(f"Error fetching OpenLibrary data by ISBN: {e}", file=sys.stderr)
|
||||
return {}
|
||||
|
||||
|
||||
def extract_isbn_from_metadata(metadata: Dict[str, Any]) -> str:
|
||||
"""Extract ISBN from archive.org metadata.
|
||||
|
||||
Looks for ISBN in various metadata fields.
|
||||
|
||||
Args:
|
||||
metadata: Archive.org metadata dictionary
|
||||
|
||||
Returns:
|
||||
ISBN string (clean, no hyphens) or empty string if not found
|
||||
"""
|
||||
# Try various common metadata fields
|
||||
isbn_fields = [
|
||||
"isbn", "ISBN", "isbn_13", "isbn_10", "isbns",
|
||||
"isbn-10", "isbn-13", "identifer_isbn"
|
||||
]
|
||||
|
||||
for field in isbn_fields:
|
||||
if field in metadata:
|
||||
isbn_val = metadata[field]
|
||||
if isinstance(isbn_val, list):
|
||||
isbn_val = isbn_val[0] if isbn_val else None
|
||||
if isbn_val and isinstance(isbn_val, str):
|
||||
# Clean ISBN (remove hyphens, spaces)
|
||||
isbn_clean = isbn_val.replace("-", "").replace(" ", "")
|
||||
if len(isbn_clean) in [10, 13]:
|
||||
return isbn_clean
|
||||
|
||||
return ""
|
||||
|
||||
|
||||
def normalize_url(url: str) -> str:
|
||||
"""Convert openlibrary.org URL to archive.org URL.
|
||||
|
||||
Looks up the actual Archive.org ID from OpenLibrary API.
|
||||
|
||||
Args:
|
||||
url: Book URL (archive.org or openlibrary.org format)
|
||||
|
||||
Returns:
|
||||
Normalized archive.org URL
|
||||
"""
|
||||
url = url.strip()
|
||||
|
||||
# Already archive.org format
|
||||
if url.startswith("https://archive.org/details/"):
|
||||
return url
|
||||
|
||||
# Convert openlibrary.org format by querying the OpenLibrary API
|
||||
if "openlibrary.org/books/" in url:
|
||||
try:
|
||||
# Extract the book ID (e.g., OL6796852M)
|
||||
parts = url.split("/books/")
|
||||
if len(parts) > 1:
|
||||
book_id = parts[1].split("/")[0]
|
||||
|
||||
# Query OpenLibrary API to get the book metadata
|
||||
api_url = f"https://openlibrary.org/books/{book_id}.json"
|
||||
response = requests.get(api_url, timeout=10)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
# Look for identifiers including internet_archive or ocaid
|
||||
# First try ocaid (Open Content Alliance ID) - this is most common
|
||||
if "ocaid" in data:
|
||||
ocaid = data["ocaid"]
|
||||
return f"https://archive.org/details/{ocaid}"
|
||||
|
||||
# Check for identifiers object
|
||||
if "identifiers" in data:
|
||||
identifiers = data["identifiers"]
|
||||
|
||||
# Look for internet_archive ID
|
||||
if "internet_archive" in identifiers:
|
||||
ia_ids = identifiers["internet_archive"]
|
||||
if isinstance(ia_ids, list) and ia_ids:
|
||||
ia_id = ia_ids[0]
|
||||
else:
|
||||
ia_id = ia_ids
|
||||
return f"https://archive.org/details/{ia_id}"
|
||||
|
||||
# If no IA identifier found, use the book ID as fallback
|
||||
log(f"No Internet Archive ID found for {book_id}. Attempting with OpenLibrary ID.", file=sys.stderr)
|
||||
return f"https://archive.org/details/{book_id}"
|
||||
|
||||
except requests.RequestException as e:
|
||||
log(f"Could not fetch OpenLibrary metadata: {e}", file=sys.stderr)
|
||||
# Fallback to using the book ID directly
|
||||
parts = url.split("/books/")
|
||||
if len(parts) > 1:
|
||||
book_id = parts[1].split("/")[0]
|
||||
return f"https://archive.org/details/{book_id}"
|
||||
except (KeyError, IndexError) as e:
|
||||
log(f"Error parsing OpenLibrary response: {e}", file=sys.stderr)
|
||||
# Fallback to using the book ID directly
|
||||
parts = url.split("/books/")
|
||||
if len(parts) > 1:
|
||||
book_id = parts[1].split("/")[0]
|
||||
return f"https://archive.org/details/{book_id}"
|
||||
|
||||
# Return original if can't parse
|
||||
return url
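
# Example (illustrative; not part of the committed module): an openlibrary.org
# edition URL is resolved to its archive.org details page. The OL id below is
# a placeholder and the exact result depends on the live OpenLibrary API.
#
#     normalize_url("https://openlibrary.org/books/OL1234567M/Some_Title")
#     # -> "https://archive.org/details/<ocaid-or-OL1234567M>"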

helper/download.py (new file, 730 lines)
@@ -0,0 +1,730 @@
"""Download media files using yt-dlp with support for direct file downloads.
|
||||
|
||||
Lean, focused downloader without event infrastructure overhead.
|
||||
- yt-dlp integration for streaming sites
|
||||
- Direct file download fallback for PDFs, images, documents
|
||||
- Tag extraction via metadata.extract_ytdlp_tags()
|
||||
- Logging via helper.logger.log()
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re # noqa: F401
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterator, List, Optional
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import httpx
|
||||
|
||||
from helper.logger import log, debug
|
||||
from .utils import ensure_directory, sha256_file
|
||||
from .http_client import HTTPClient
|
||||
from models import DownloadError, DownloadOptions, DownloadMediaResult, DebugLogger, ProgressBar
|
||||
|
||||
try:
|
||||
import yt_dlp # type: ignore
|
||||
from yt_dlp.extractor import gen_extractors # type: ignore
|
||||
except Exception as exc:
|
||||
yt_dlp = None # type: ignore
|
||||
YTDLP_IMPORT_ERROR = exc
|
||||
else:
|
||||
YTDLP_IMPORT_ERROR = None
|
||||
|
||||
try:
|
||||
from metadata import extract_ytdlp_tags
|
||||
except ImportError:
|
||||
extract_ytdlp_tags = None
|
||||
|
||||
_EXTRACTOR_CACHE: List[Any] | None = None
|
||||
|
||||
|
||||
def _ensure_yt_dlp_ready() -> None:
|
||||
"""Verify yt-dlp is available, raise if not."""
|
||||
if yt_dlp is not None:
|
||||
return
|
||||
detail = str(YTDLP_IMPORT_ERROR or "yt-dlp is not installed")
|
||||
raise DownloadError(f"yt-dlp module not available: {detail}")
|
||||
|
||||
|
||||
def _progress_callback(status: Dict[str, Any]) -> None:
|
||||
"""Simple progress callback using logger."""
|
||||
event = status.get("status")
|
||||
if event == "downloading":
|
||||
percent = status.get("_percent_str", "?")
|
||||
speed = status.get("_speed_str", "?")
|
||||
debug(f"Downloading {percent} at {speed}")
|
||||
elif event == "finished":
|
||||
debug(f"✓ Download finished: {status.get('filename')}")
|
||||
elif event in ("postprocessing", "processing"):
|
||||
debug(f"Post-processing: {status.get('postprocessor')}")
|
||||
|
||||
|
||||
def is_url_supported_by_ytdlp(url: str) -> bool:
|
||||
"""Check if URL is supported by yt-dlp."""
|
||||
if yt_dlp is None:
|
||||
return False
|
||||
global _EXTRACTOR_CACHE
|
||||
if _EXTRACTOR_CACHE is None:
|
||||
try:
|
||||
_EXTRACTOR_CACHE = [ie for ie in gen_extractors()] # type: ignore[arg-type]
|
||||
except Exception:
|
||||
_EXTRACTOR_CACHE = []
|
||||
for extractor in _EXTRACTOR_CACHE:
|
||||
try:
|
||||
if not extractor.suitable(url):
|
||||
continue
|
||||
except Exception:
|
||||
continue
|
||||
name = getattr(extractor, "IE_NAME", "")
|
||||
if name.lower() == "generic":
|
||||
continue
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def list_formats(url: str, no_playlist: bool = False, playlist_items: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
|
||||
"""Get list of available formats for a URL using yt-dlp.
|
||||
|
||||
Args:
|
||||
url: URL to get formats for
|
||||
no_playlist: If True, ignore playlists and list formats for single video
|
||||
playlist_items: If specified, only list formats for these playlist items (e.g., "1,3,5-8")
|
||||
|
||||
Returns:
|
||||
List of format dictionaries with keys: format_id, format, resolution, fps, vcodec, acodec, filesize, etc.
|
||||
Returns None if yt-dlp is not available or format listing fails.
|
||||
"""
|
||||
_ensure_yt_dlp_ready()
|
||||
|
||||
try:
|
||||
ydl_opts = {
|
||||
"quiet": False,
|
||||
"no_warnings": False,
|
||||
"socket_timeout": 30,
|
||||
}
|
||||
|
||||
# Add no_playlist option if specified
|
||||
if no_playlist:
|
||||
ydl_opts["noplaylist"] = True
|
||||
|
||||
# Add playlist_items filter if specified
|
||||
if playlist_items:
|
||||
ydl_opts["playlist_items"] = playlist_items
|
||||
|
||||
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
||||
debug(f"Fetching format list for: {url}")
|
||||
info = ydl.extract_info(url, download=False)
|
||||
|
||||
formats = info.get("formats", [])
|
||||
if not formats:
|
||||
log("No formats available", file=sys.stderr)
|
||||
return None
|
||||
|
||||
# Parse and extract relevant format info
|
||||
result_formats = []
|
||||
for fmt in formats:
|
||||
format_info = {
|
||||
"format_id": fmt.get("format_id", ""),
|
||||
"format": fmt.get("format", ""),
|
||||
"ext": fmt.get("ext", ""),
|
||||
"resolution": fmt.get("resolution", ""),
|
||||
"width": fmt.get("width"),
|
||||
"height": fmt.get("height"),
|
||||
"fps": fmt.get("fps"),
|
||||
"vcodec": fmt.get("vcodec", "none"),
|
||||
"acodec": fmt.get("acodec", "none"),
|
||||
"filesize": fmt.get("filesize"),
|
||||
"tbr": fmt.get("tbr"), # Total bitrate
|
||||
}
|
||||
result_formats.append(format_info)
|
||||
|
||||
debug(f"Found {len(result_formats)} available formats")
|
||||
return result_formats
|
||||
|
||||
except Exception as e:
|
||||
log(f"✗ Error fetching formats: {e}", file=sys.stderr)
|
||||
return None
|
||||
def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]:
|
||||
"""Build yt-dlp download options."""
|
||||
ensure_directory(opts.output_dir)
|
||||
|
||||
outtmpl = str((opts.output_dir / "%(title)s.%(ext)s").resolve())
|
||||
|
||||
base_options: Dict[str, Any] = {
|
||||
"outtmpl": outtmpl,
|
||||
"quiet": False,
|
||||
"no_warnings": False,
|
||||
"noprogress": False,
|
||||
"socket_timeout": 30,
|
||||
"retries": 10,
|
||||
"fragment_retries": 10,
|
||||
"http_chunk_size": 10_485_760,
|
||||
"restrictfilenames": True,
|
||||
"progress_hooks": [_progress_callback],
|
||||
}
|
||||
|
||||
if opts.cookies_path and opts.cookies_path.is_file():
|
||||
base_options["cookiefile"] = str(opts.cookies_path)
|
||||
|
||||
# Add no-playlist option if specified (for single video from playlist URLs)
|
||||
if opts.no_playlist:
|
||||
base_options["noplaylist"] = True
|
||||
|
||||
# Configure based on mode
|
||||
if opts.mode == "audio":
|
||||
base_options["format"] = opts.ytdl_format or "251/140/bestaudio"
|
||||
base_options["postprocessors"] = [{"key": "FFmpegExtractAudio"}]
|
||||
else: # video
|
||||
base_options["format"] = opts.ytdl_format or "bestvideo+bestaudio/best"
|
||||
base_options["format_sort"] = [
|
||||
"res:4320", "res:2880", "res:2160", "res:1440", "res:1080", "res:720", "res"
|
||||
]
|
||||
|
||||
# Add clip sections if provided
|
||||
if opts.clip_sections:
|
||||
base_options["download_sections"] = opts.clip_sections
|
||||
|
||||
# Add playlist items selection if provided
|
||||
if opts.playlist_items:
|
||||
base_options["playlist_items"] = opts.playlist_items
|
||||
|
||||
debug(f"yt-dlp: mode={opts.mode}, format={base_options.get('format')}")
|
||||
return base_options
|
||||
|
||||
|
||||
def _iter_download_entries(info: Dict[str, Any]) -> Iterator[Dict[str, Any]]:
|
||||
"""Iterate through download entries, handling playlists."""
|
||||
queue: List[Dict[str, Any]] = [info]
|
||||
seen: set[int] = set()
|
||||
while queue:
|
||||
current = queue.pop(0)
|
||||
obj_id = id(current)
|
||||
if obj_id in seen:
|
||||
continue
|
||||
seen.add(obj_id)
|
||||
entries = current.get("entries")
|
||||
if isinstance(entries, list):
|
||||
for entry in entries:
|
||||
if isinstance(entry, dict):
|
||||
queue.append(entry)
|
||||
if current.get("requested_downloads") or not entries:
|
||||
yield current
|
||||
|
||||
|
||||
def _candidate_paths(entry: Dict[str, Any], output_dir: Path) -> Iterator[Path]:
|
||||
"""Get candidate file paths for downloaded media."""
|
||||
requested = entry.get("requested_downloads")
|
||||
if isinstance(requested, list):
|
||||
for item in requested:
|
||||
if isinstance(item, dict):
|
||||
for key in ("filepath", "_filename", "filename"):
|
||||
value = item.get(key)
|
||||
if value:
|
||||
yield Path(value)
|
||||
for key in ("filepath", "_filename", "filename"):
|
||||
value = entry.get(key)
|
||||
if value:
|
||||
yield Path(value)
|
||||
if entry.get("filename"):
|
||||
yield output_dir / entry["filename"]
|
||||
|
||||
|
||||
def _resolve_entry_and_path(info: Dict[str, Any], output_dir: Path) -> tuple[Dict[str, Any], Path]:
|
||||
"""Find downloaded file in yt-dlp metadata."""
|
||||
for entry in _iter_download_entries(info):
|
||||
for candidate in _candidate_paths(entry, output_dir):
|
||||
if candidate.is_file():
|
||||
return entry, candidate
|
||||
if not candidate.is_absolute():
|
||||
resolved = output_dir / candidate
|
||||
if resolved.is_file():
|
||||
return entry, resolved
|
||||
raise FileNotFoundError("yt-dlp did not report a downloaded media file")
|
||||
|
||||
|
||||
def _extract_sha256(info: Dict[str, Any]) -> Optional[str]:
|
||||
"""Extract SHA256 hash from yt-dlp metadata."""
|
||||
for payload in [info] + info.get("entries", []):
|
||||
if not isinstance(payload, dict):
|
||||
continue
|
||||
hashes = payload.get("hashes")
|
||||
if isinstance(hashes, dict):
|
||||
for key in ("sha256", "sha-256", "sha_256"):
|
||||
value = hashes.get(key)
|
||||
if isinstance(value, str) and value.strip():
|
||||
return value.strip().lower()
|
||||
for key in ("sha256", "sha-256", "sha_256"):
|
||||
value = payload.get(key)
|
||||
if isinstance(value, str) and value.strip():
|
||||
return value.strip().lower()
|
||||
return None
|
||||
|
||||
|
||||
def _get_libgen_download_url(libgen_url: str) -> Optional[str]:
|
||||
"""Extract the actual download link from LibGen redirect URL.
|
||||
|
||||
LibGen URLs like https://libgen.gl/file.php?id=123456 redirect to
|
||||
actual mirror URLs. This follows the redirect chain to get the real file.
|
||||
|
||||
Args:
|
||||
libgen_url: LibGen file.php URL
|
||||
|
||||
Returns:
|
||||
Actual download URL or None if extraction fails
|
||||
"""
|
||||
try:
|
||||
import requests
|
||||
from urllib.parse import urlparse
|
||||
|
||||
# Check if this is a LibGen URL
|
||||
parsed = urlparse(libgen_url)
|
||||
if 'libgen' not in parsed.netloc.lower():
|
||||
return None
|
||||
|
||||
if '/file.php' not in parsed.path.lower():
|
||||
return None
|
||||
|
||||
# LibGen redirects to actual mirrors, follow redirects to get final URL
|
||||
session = requests.Session()
|
||||
session.headers.update({
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
|
||||
})
|
||||
|
||||
debug(f"Following LibGen redirect chain for: {libgen_url}")
|
||||
|
||||
# First, get the page and look for direct download link
|
||||
try:
|
||||
response = session.get(libgen_url, timeout=10, allow_redirects=True)
|
||||
final_url = response.url
|
||||
|
||||
# Try to find actual download link in the page
|
||||
try:
|
||||
from bs4 import BeautifulSoup
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
|
||||
# Look for download links - LibGen typically has forms with download buttons
|
||||
# Look for all links and forms that might lead to download
|
||||
for link in soup.find_all('a'):
|
||||
href = link.get('href')
|
||||
if href and isinstance(href, str):
|
||||
# Look for direct file links or get.php redirects
|
||||
if 'get.php' in href.lower() or href.endswith(('.pdf', '.epub', '.djvu', '.mobi')):
|
||||
download_url = href if href.startswith('http') else urljoin(final_url, href)
|
||||
debug(f"Found download link: {download_url}")
|
||||
return download_url
|
||||
except ImportError:
|
||||
pass # BeautifulSoup not available
|
||||
|
||||
# If we followed redirects successfully, return the final URL
|
||||
# This handles cases where libgen redirects to a direct download mirror
|
||||
if final_url != libgen_url:
|
||||
debug(f"LibGen resolved to mirror: {final_url}")
|
||||
return final_url
|
||||
|
||||
except requests.RequestException as e:
|
||||
log(f"Error following LibGen redirects: {e}", file=sys.stderr)
|
||||
# Try head request as fallback
|
||||
try:
|
||||
response = session.head(libgen_url, allow_redirects=True, timeout=10)
|
||||
if response.url != libgen_url:
|
||||
debug(f"LibGen HEAD resolved to: {response.url}")
|
||||
return response.url
|
||||
except:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
log(f"Error resolving LibGen URL: {e}", file=sys.stderr)
|
||||
return None
|
||||
|
||||
|
||||
def _download_direct_file(
|
||||
url: str,
|
||||
output_dir: Path,
|
||||
debug_logger: Optional[DebugLogger] = None,
|
||||
) -> DownloadMediaResult:
|
||||
"""Download a direct file (PDF, image, document, etc.) without yt-dlp."""
|
||||
ensure_directory(output_dir)
|
||||
|
||||
from urllib.parse import unquote, urlparse, parse_qs
|
||||
import re
|
||||
|
||||
# Extract filename from URL
|
||||
parsed_url = urlparse(url)
|
||||
url_path = parsed_url.path
|
||||
|
||||
# Try to get filename from query parameters first (for LibGen and similar services)
|
||||
# e.g., ?filename=Book+Title.pdf or &download=filename.pdf
|
||||
filename = None
|
||||
if parsed_url.query:
|
||||
query_params = parse_qs(parsed_url.query)
|
||||
for param_name in ('filename', 'download', 'file', 'name'):
|
||||
if param_name in query_params and query_params[param_name]:
|
||||
filename = query_params[param_name][0]
|
||||
filename = unquote(filename)
|
||||
break
|
||||
|
||||
# If not found in query params, extract from URL path
|
||||
if not filename or not filename.strip():
|
||||
filename = url_path.split("/")[-1] if url_path else ""
|
||||
filename = unquote(filename)
|
||||
|
||||
# Remove query strings from filename if any
|
||||
if "?" in filename:
|
||||
filename = filename.split("?")[0]
|
||||
|
||||
# Try to get real filename from Content-Disposition header (HEAD request)
|
||||
try:
|
||||
with HTTPClient(timeout=10.0) as client:
|
||||
response = client._request("HEAD", url, follow_redirects=True)
|
||||
content_disposition = response.headers.get("content-disposition", "")
|
||||
if content_disposition:
|
||||
# Extract filename from Content-Disposition header
|
||||
# Format: attachment; filename="filename.pdf" or filename=filename.pdf
|
||||
match = re.search(r'filename\*?=(?:"([^"]*)"|([^;\s]*))', content_disposition)
|
||||
if match:
|
||||
extracted_name = match.group(1) or match.group(2)
|
||||
if extracted_name:
|
||||
filename = unquote(extracted_name)
|
||||
debug(f"Filename from Content-Disposition: {filename}")
|
||||
except Exception as e:
|
||||
log(f"Could not get filename from headers: {e}", file=sys.stderr)
|
||||
|
||||
# Fallback if we still don't have a good filename
|
||||
if not filename or "." not in filename:
|
||||
filename = "downloaded_file.bin"
|
||||
|
||||
file_path = output_dir / filename
|
||||
progress_bar = ProgressBar()
|
||||
|
||||
debug(f"Direct download: {filename}")
|
||||
|
||||
try:
|
||||
start_time = time.time()
|
||||
downloaded_bytes = [0]
|
||||
total_bytes = [0]
|
||||
last_progress_time = [start_time]
|
||||
|
||||
def progress_callback(bytes_downloaded: int, content_length: int) -> None:
|
||||
downloaded_bytes[0] = bytes_downloaded
|
||||
total_bytes[0] = content_length
|
||||
|
||||
now = time.time()
|
||||
if now - last_progress_time[0] >= 0.5 and total_bytes[0] > 0:
|
||||
elapsed = now - start_time
|
||||
percent = (bytes_downloaded / content_length) * 100 if content_length > 0 else 0
|
||||
speed = bytes_downloaded / elapsed if elapsed > 0 else 0
|
||||
eta_seconds = (content_length - bytes_downloaded) / speed if speed > 0 else 0
|
||||
|
||||
speed_str = progress_bar.format_bytes(speed) + "/s"
|
||||
minutes, seconds = divmod(int(eta_seconds), 60)
|
||||
hours, minutes = divmod(minutes, 60)
|
||||
eta_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
|
||||
|
||||
progress_line = progress_bar.format_progress(
|
||||
percent_str=f"{percent:.1f}%",
|
||||
downloaded=bytes_downloaded,
|
||||
total=content_length,
|
||||
speed_str=speed_str,
|
||||
eta_str=eta_str,
|
||||
)
|
||||
debug(progress_line)
|
||||
last_progress_time[0] = now
|
||||
|
||||
with HTTPClient(timeout=30.0) as client:
|
||||
client.download(url, str(file_path), progress_callback=progress_callback)
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
avg_speed_str = progress_bar.format_bytes(downloaded_bytes[0] / elapsed if elapsed > 0 else 0) + "/s"
|
||||
debug(f"✓ Downloaded in {elapsed:.1f}s at {avg_speed_str}")
|
||||
|
||||
# For direct file downloads, create minimal info dict without filename as title
|
||||
# This prevents creating duplicate title: tags when filename gets auto-generated
|
||||
# We'll add title back later only if we couldn't extract meaningful tags
|
||||
info = {
|
||||
"id": filename.rsplit(".", 1)[0],
|
||||
"ext": filename.rsplit(".", 1)[1] if "." in filename else "bin",
|
||||
"webpage_url": url,
|
||||
}
|
||||
|
||||
hash_value = None
|
||||
try:
|
||||
hash_value = sha256_file(file_path)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
tags = []
|
||||
if extract_ytdlp_tags:
|
||||
try:
|
||||
tags = extract_ytdlp_tags(info)
|
||||
except Exception as e:
|
||||
log(f"Error extracting tags: {e}", file=sys.stderr)
|
||||
|
||||
# Only use filename as a title tag if we couldn't extract any meaningful tags
|
||||
# This prevents duplicate title: tags when the filename could be mistaken for metadata
|
||||
if not any(t.startswith('title:') for t in tags):
|
||||
# Re-extract tags with filename as title only if needed
|
||||
info['title'] = filename
|
||||
tags = []
|
||||
if extract_ytdlp_tags:
|
||||
try:
|
||||
tags = extract_ytdlp_tags(info)
|
||||
except Exception as e:
|
||||
log(f"Error extracting tags with filename: {e}", file=sys.stderr)
|
||||
|
||||
if debug_logger is not None:
|
||||
debug_logger.write_record(
|
||||
"direct-file-downloaded",
|
||||
{"url": url, "path": str(file_path), "hash": hash_value},
|
||||
)
|
||||
|
||||
return DownloadMediaResult(
|
||||
path=file_path,
|
||||
info=info,
|
||||
tags=tags,
|
||||
source_url=url,
|
||||
hash_value=hash_value,
|
||||
)
|
||||
|
||||
except (httpx.HTTPError, httpx.RequestError) as exc:
|
||||
log(f"Download error: {exc}", file=sys.stderr)
|
||||
if debug_logger is not None:
|
||||
debug_logger.write_record(
|
||||
"exception",
|
||||
{"phase": "direct-file", "url": url, "error": str(exc)},
|
||||
)
|
||||
raise DownloadError(f"Failed to download {url}: {exc}") from exc
|
||||
except Exception as exc:
|
||||
log(f"Error downloading file: {exc}", file=sys.stderr)
|
||||
if debug_logger is not None:
|
||||
debug_logger.write_record(
|
||||
"exception",
|
||||
{
|
||||
"phase": "direct-file",
|
||||
"url": url,
|
||||
"error": str(exc),
|
||||
"traceback": traceback.format_exc(),
|
||||
},
|
||||
)
|
||||
raise DownloadError(f"Error downloading file: {exc}") from exc
|
||||
|
||||
|
||||
def probe_url(url: str, no_playlist: bool = False) -> Optional[Dict[str, Any]]:
|
||||
"""Probe URL to extract metadata WITHOUT downloading.
|
||||
|
||||
Args:
|
||||
url: URL to probe
|
||||
no_playlist: If True, ignore playlists and probe only the single video
|
||||
|
||||
Returns:
|
||||
Dict with keys: extractor, title, entries (if playlist), duration, etc.
|
||||
Returns None if not supported by yt-dlp.
|
||||
"""
|
||||
if not is_url_supported_by_ytdlp(url):
|
||||
return None
|
||||
|
||||
_ensure_yt_dlp_ready()
|
||||
|
||||
assert yt_dlp is not None
|
||||
try:
|
||||
# Extract info without downloading
|
||||
# Use extract_flat='in_playlist' to get full metadata for playlist items
|
||||
ydl_opts = {
|
||||
"quiet": True, # Suppress all output
|
||||
"no_warnings": True,
|
||||
"socket_timeout": 10,
|
||||
"retries": 3,
|
||||
"skip_download": True, # Don't actually download
|
||||
"extract_flat": "in_playlist", # Get playlist with metadata for each entry
|
||||
"noprogress": True, # No progress bars
|
||||
"quiet": True,
|
||||
}
|
||||
|
||||
# Add no_playlist option if specified
|
||||
if no_playlist:
|
||||
ydl_opts["noplaylist"] = True
|
||||
|
||||
with yt_dlp.YoutubeDL(ydl_opts) as ydl: # type: ignore[arg-type]
|
||||
info = ydl.extract_info(url, download=False)
|
||||
|
||||
if not isinstance(info, dict):
|
||||
return None
|
||||
|
||||
# Extract relevant fields
|
||||
return {
|
||||
"extractor": info.get("extractor", ""),
|
||||
"title": info.get("title", ""),
|
||||
"entries": info.get("entries", []), # Will be populated if playlist
|
||||
"duration": info.get("duration"),
|
||||
"uploader": info.get("uploader"),
|
||||
"description": info.get("description"),
|
||||
"url": url,
|
||||
}
|
||||
except Exception as exc:
|
||||
log(f"Probe failed for {url}: {exc}")
|
||||
return None
|
||||
|
||||
|
||||
def download_media(
|
||||
opts: DownloadOptions,
|
||||
*,
|
||||
debug_logger: Optional[DebugLogger] = None,
|
||||
) -> DownloadMediaResult:
|
||||
"""Download media from URL using yt-dlp or direct HTTP download.
|
||||
|
||||
Args:
|
||||
opts: DownloadOptions with url, mode, output_dir, etc.
|
||||
debug_logger: Optional debug logger for troubleshooting
|
||||
|
||||
Returns:
|
||||
DownloadMediaResult with path, info, tags, hash
|
||||
|
||||
Raises:
|
||||
DownloadError: If download fails
|
||||
"""
|
||||
# Handle LibGen URLs specially
|
||||
# file.php redirects to mirrors, get.php is direct from modern API
|
||||
if 'libgen' in opts.url.lower():
|
||||
if '/get.php' in opts.url.lower():
|
||||
# Modern API get.php links are direct downloads from mirrors (not file redirects)
|
||||
log(f"Detected LibGen get.php URL, downloading directly...")
|
||||
if debug_logger is not None:
|
||||
debug_logger.write_record("libgen-direct", {"url": opts.url})
|
||||
return _download_direct_file(opts.url, opts.output_dir, debug_logger)
|
||||
elif '/file.php' in opts.url.lower():
|
||||
# Old-style file.php redirects to mirrors, we need to resolve
|
||||
log(f"Detected LibGen file.php URL, resolving to actual mirror...")
|
||||
actual_url = _get_libgen_download_url(opts.url)
|
||||
if actual_url and actual_url != opts.url:
|
||||
log(f"Resolved LibGen URL to mirror: {actual_url}")
|
||||
opts.url = actual_url
|
||||
# After resolution, this will typically be an onion link or direct file
|
||||
# Skip yt-dlp for this (it won't support onion/mirrors), go direct
|
||||
if debug_logger is not None:
|
||||
debug_logger.write_record("libgen-resolved", {"original": opts.url, "resolved": actual_url})
|
||||
return _download_direct_file(opts.url, opts.output_dir, debug_logger)
|
||||
else:
|
||||
log(f"Could not resolve LibGen URL, trying direct download anyway", file=sys.stderr)
|
||||
if debug_logger is not None:
|
||||
debug_logger.write_record("libgen-resolve-failed", {"url": opts.url})
|
||||
return _download_direct_file(opts.url, opts.output_dir, debug_logger)
|
||||
|
||||
# Try yt-dlp first if URL is supported
|
||||
if not is_url_supported_by_ytdlp(opts.url):
|
||||
log(f"URL not supported by yt-dlp, trying direct download: {opts.url}")
|
||||
if debug_logger is not None:
|
||||
debug_logger.write_record("direct-file-attempt", {"url": opts.url})
|
||||
return _download_direct_file(opts.url, opts.output_dir, debug_logger)
|
||||
|
||||
_ensure_yt_dlp_ready()
|
||||
|
||||
ytdl_options = _build_ytdlp_options(opts)
|
||||
log(f"Starting yt-dlp download: {opts.url}")
|
||||
if debug_logger is not None:
|
||||
debug_logger.write_record("ytdlp-start", {"url": opts.url})
|
||||
|
||||
assert yt_dlp is not None
|
||||
try:
|
||||
with yt_dlp.YoutubeDL(ytdl_options) as ydl: # type: ignore[arg-type]
|
||||
info = ydl.extract_info(opts.url, download=True)
|
||||
except Exception as exc:
|
||||
log(f"yt-dlp failed: {exc}", file=sys.stderr)
|
||||
if debug_logger is not None:
|
||||
debug_logger.write_record(
|
||||
"exception",
|
||||
{
|
||||
"phase": "yt-dlp",
|
||||
"error": str(exc),
|
||||
"traceback": traceback.format_exc(),
|
||||
},
|
||||
)
|
||||
raise DownloadError("yt-dlp download failed") from exc
|
||||
|
||||
if not isinstance(info, dict):
|
||||
log(f"Unexpected yt-dlp response: {type(info)}", file=sys.stderr)
|
||||
raise DownloadError("Unexpected yt-dlp response type")
|
||||
|
||||
info_dict: Dict[str, Any] = info
|
||||
if debug_logger is not None:
|
||||
debug_logger.write_record(
|
||||
"ytdlp-info",
|
||||
{
|
||||
"keys": sorted(info_dict.keys()),
|
||||
"is_playlist": bool(info_dict.get("entries")),
|
||||
},
|
||||
)
|
||||
|
||||
try:
|
||||
entry, media_path = _resolve_entry_and_path(info_dict, opts.output_dir)
|
||||
except FileNotFoundError as exc:
|
||||
log(f"Error: {exc}", file=sys.stderr)
|
||||
if debug_logger is not None:
|
||||
debug_logger.write_record(
|
||||
"exception",
|
||||
{"phase": "resolve-path", "error": str(exc)},
|
||||
)
|
||||
raise DownloadError(str(exc)) from exc
|
||||
|
||||
if debug_logger is not None:
|
||||
debug_logger.write_record(
|
||||
"resolved-media",
|
||||
{"path": str(media_path), "entry_keys": sorted(entry.keys())},
|
||||
)
|
||||
|
||||
# Extract hash from metadata or compute
|
||||
hash_value = _extract_sha256(entry) or _extract_sha256(info_dict)
|
||||
if not hash_value:
|
||||
try:
|
||||
hash_value = sha256_file(media_path)
|
||||
except OSError as exc:
|
||||
if debug_logger is not None:
|
||||
debug_logger.write_record(
|
||||
"hash-error",
|
||||
{"path": str(media_path), "error": str(exc)},
|
||||
)
|
||||
|
||||
# Extract tags using metadata.py
|
||||
tags = []
|
||||
if extract_ytdlp_tags:
|
||||
try:
|
||||
tags = extract_ytdlp_tags(entry)
|
||||
except Exception as e:
|
||||
log(f"Error extracting tags: {e}", file=sys.stderr)
|
||||
|
||||
source_url = (
|
||||
entry.get("webpage_url")
|
||||
or entry.get("original_url")
|
||||
or entry.get("url")
|
||||
)
|
||||
|
||||
log(f"✓ Downloaded: {media_path.name} ({len(tags)} tags)")
|
||||
if debug_logger is not None:
|
||||
debug_logger.write_record(
|
||||
"downloaded",
|
||||
{
|
||||
"path": str(media_path),
|
||||
"tag_count": len(tags),
|
||||
"source_url": source_url,
|
||||
"sha256": hash_value,
|
||||
},
|
||||
)
|
||||
|
||||
return DownloadMediaResult(
|
||||
path=media_path,
|
||||
info=entry,
|
||||
tags=tags,
|
||||
source_url=source_url,
|
||||
hash_value=hash_value,
|
||||
)
|
||||
|
||||
|
||||
__all__ = [
|
||||
"download_media",
|
||||
"is_url_supported_by_ytdlp",
|
||||
"DownloadError",
|
||||
"DownloadOptions",
|
||||
"DownloadMediaResult",
|
||||
]
|
||||
180
helper/file_server.py
Normal file
180
helper/file_server.py
Normal file
@@ -0,0 +1,180 @@
|
||||
"""Simple HTTP file server for serving files in web mode."""
|
||||
|
||||
import threading
|
||||
import socket
|
||||
import logging
|
||||
from http.server import HTTPServer, SimpleHTTPRequestHandler
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
import mimetypes
|
||||
import urllib.parse
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Global server instance
|
||||
_file_server: Optional[HTTPServer] = None
|
||||
_server_thread: Optional[threading.Thread] = None
|
||||
_server_port: int = 8001
|
||||
|
||||
|
||||
class FileServerHandler(SimpleHTTPRequestHandler):
|
||||
"""HTTP request handler for file serving."""
|
||||
|
||||
def do_GET(self):
|
||||
"""Handle GET requests."""
|
||||
# Parse the path
|
||||
parsed_path = urllib.parse.urlparse(self.path)
|
||||
file_path = urllib.parse.unquote(parsed_path.path)
|
||||
|
||||
# Remove leading slash
|
||||
if file_path.startswith('/'):
|
||||
file_path = file_path[1:]
|
||||
|
||||
# Decode the file path (it's URL encoded)
|
||||
try:
|
||||
full_path = Path(file_path).resolve()
|
||||
|
||||
# Security check: ensure the path is within allowed directories
|
||||
# For now, allow all paths (can be restricted later)
|
||||
|
||||
if full_path.is_file() and full_path.exists():
|
||||
# Serve the file
|
||||
logger.debug(f"Serving file: {full_path}")
|
||||
|
||||
# Determine content type
|
||||
content_type, _ = mimetypes.guess_type(str(full_path))
|
||||
if content_type is None:
|
||||
content_type = 'application/octet-stream'
|
||||
|
||||
try:
|
||||
with open(full_path, 'rb') as f:
|
||||
file_content = f.read()
|
||||
|
||||
self.send_response(200)
|
||||
self.send_header('Content-type', content_type)
|
||||
self.send_header('Content-Length', str(len(file_content)))
|
||||
self.send_header('Content-Disposition', f'attachment; filename="{full_path.name}"')
|
||||
self.end_headers()
|
||||
self.wfile.write(file_content)
|
||||
logger.info(f"Successfully served file: {full_path.name}")
|
||||
return
|
||||
except Exception as e:
|
||||
logger.error(f"Error serving file: {e}")
|
||||
self.send_error(500, "Internal server error")
|
||||
return
|
||||
else:
|
||||
logger.warning(f"File not found: {full_path}")
|
||||
self.send_error(404, "File not found")
|
||||
return
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error handling request: {e}")
|
||||
self.send_error(400, "Bad request")
|
||||
|
||||
def log_message(self, format, *args):
|
||||
"""Override to use our logger instead of stderr."""
|
||||
logger.debug(format % args)
|
||||
|
||||
|
||||
def get_local_ip() -> Optional[str]:
|
||||
"""Get the local IP address that's accessible from other devices."""
|
||||
try:
|
||||
# Connect to a remote server to determine local IP
|
||||
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
|
||||
s.connect(("8.8.8.8", 80))
|
||||
ip = s.getsockname()[0]
|
||||
s.close()
|
||||
return ip
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to determine local IP: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def start_file_server(port: int = 8001) -> Optional[str]:
|
||||
"""Start the HTTP file server.
|
||||
|
||||
Args:
|
||||
port: Port to serve on
|
||||
|
||||
Returns:
|
||||
Server URL if successful, None otherwise
|
||||
"""
|
||||
global _file_server, _server_thread, _server_port
|
||||
|
||||
if _file_server is not None:
|
||||
logger.debug(f"File server already running on port {_server_port}")
|
||||
local_ip = get_local_ip()
|
||||
if local_ip:
|
||||
return f"http://{local_ip}:{_server_port}"
|
||||
return None
|
||||
|
||||
try:
|
||||
_server_port = port
|
||||
|
||||
# Create server
|
||||
server_address = ('', port)
|
||||
_file_server = HTTPServer(server_address, FileServerHandler)
|
||||
|
||||
# Start in daemon thread
|
||||
_server_thread = threading.Thread(target=_file_server.serve_forever, daemon=True)
|
||||
_server_thread.start()
|
||||
|
||||
logger.info(f"File server started on port {port}")
|
||||
|
||||
# Get local IP
|
||||
local_ip = get_local_ip()
|
||||
if local_ip:
|
||||
server_url = f"http://{local_ip}:{port}"
|
||||
logger.info(f"File server accessible at: {server_url}")
|
||||
return server_url
|
||||
else:
|
||||
logger.warning("Could not determine local IP")
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to start file server: {e}")
|
||||
_file_server = None
|
||||
_server_thread = None
|
||||
return None
|
||||
|
||||
|
||||
def stop_file_server():
|
||||
"""Stop the HTTP file server."""
|
||||
global _file_server, _server_thread
|
||||
|
||||
if _file_server is not None:
|
||||
try:
|
||||
_file_server.shutdown()
|
||||
_file_server.server_close()
|
||||
logger.info("File server stopped")
|
||||
except Exception as e:
|
||||
logger.error(f"Error stopping file server: {e}")
|
||||
finally:
|
||||
_file_server = None
|
||||
_server_thread = None
|
||||
|
||||
|
||||
def get_file_url(file_path: Path, server_url: Optional[str] = None) -> Optional[str]:
|
||||
"""Get the HTTP URL for a file.
|
||||
|
||||
Args:
|
||||
file_path: Path to the file
|
||||
server_url: Base server URL (gets determined if None)
|
||||
|
||||
Returns:
|
||||
HTTP URL to the file, or None if server not running
|
||||
"""
|
||||
if not file_path.exists():
|
||||
logger.warning(f"File does not exist: {file_path}")
|
||||
return None
|
||||
|
||||
if server_url is None:
|
||||
local_ip = get_local_ip()
|
||||
if not local_ip:
|
||||
logger.error("Cannot determine local IP for file URL")
|
||||
return None
|
||||
server_url = f"http://{local_ip}:{_server_port}"
|
||||
|
||||
# URL encode the file path
|
||||
encoded_path = urllib.parse.quote(str(file_path.resolve()))
|
||||
return f"{server_url}/{encoded_path}"
|
||||
1039
helper/file_storage.py
Normal file
1039
helper/file_storage.py
Normal file
File diff suppressed because it is too large
Load Diff
579
helper/http_client.py
Normal file
579
helper/http_client.py
Normal file
@@ -0,0 +1,579 @@
|
||||
"""
|
||||
Unified HTTP client for downlow using httpx.
|
||||
|
||||
Provides synchronous and asynchronous HTTP operations with:
|
||||
- Automatic retries on transient failures
|
||||
- Configurable timeouts and headers
|
||||
- Built-in progress tracking for downloads
|
||||
- Request/response logging support
|
||||
"""
|
||||
|
||||
import httpx
|
||||
import asyncio
|
||||
from typing import Optional, Dict, Any, Callable, BinaryIO
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Default configuration
|
||||
DEFAULT_TIMEOUT = 30.0
|
||||
DEFAULT_RETRIES = 3
|
||||
DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
||||
|
||||
|
||||
class HTTPClient:
|
||||
"""Unified HTTP client with sync support."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
timeout: float = DEFAULT_TIMEOUT,
|
||||
retries: int = DEFAULT_RETRIES,
|
||||
user_agent: str = DEFAULT_USER_AGENT,
|
||||
verify_ssl: bool = True,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
):
|
||||
"""
|
||||
Initialize HTTP client.
|
||||
|
||||
Args:
|
||||
timeout: Request timeout in seconds
|
||||
retries: Number of retries on transient failures
|
||||
user_agent: User-Agent header value
|
||||
verify_ssl: Whether to verify SSL certificates
|
||||
headers: Additional headers to include in all requests
|
||||
"""
|
||||
self.timeout = timeout
|
||||
self.retries = retries
|
||||
self.user_agent = user_agent
|
||||
self.verify_ssl = verify_ssl
|
||||
self.base_headers = headers or {}
|
||||
self._client: Optional[httpx.Client] = None
|
||||
|
||||
def __enter__(self):
|
||||
"""Context manager entry."""
|
||||
self._client = httpx.Client(
|
||||
timeout=self.timeout,
|
||||
verify=self.verify_ssl,
|
||||
headers=self._get_headers(),
|
||||
)
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
"""Context manager exit."""
|
||||
if self._client:
|
||||
self._client.close()
|
||||
self._client = None
|
||||
|
||||
def _get_headers(self) -> Dict[str, str]:
|
||||
"""Get request headers with user-agent."""
|
||||
headers = {"User-Agent": self.user_agent}
|
||||
headers.update(self.base_headers)
|
||||
return headers
|
||||
|
||||
def get(
|
||||
self,
|
||||
url: str,
|
||||
params: Optional[Dict[str, Any]] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
allow_redirects: bool = True,
|
||||
) -> httpx.Response:
|
||||
"""
|
||||
Make a GET request.
|
||||
|
||||
Args:
|
||||
url: Request URL
|
||||
params: Query parameters
|
||||
headers: Additional headers
|
||||
allow_redirects: Follow redirects
|
||||
|
||||
Returns:
|
||||
httpx.Response object
|
||||
"""
|
||||
return self._request(
|
||||
"GET",
|
||||
url,
|
||||
params=params,
|
||||
headers=headers,
|
||||
follow_redirects=allow_redirects,
|
||||
)
|
||||
|
||||
def post(
|
||||
self,
|
||||
url: str,
|
||||
data: Optional[Any] = None,
|
||||
json: Optional[Dict] = None,
|
||||
files: Optional[Dict] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
) -> httpx.Response:
|
||||
"""
|
||||
Make a POST request.
|
||||
|
||||
Args:
|
||||
url: Request URL
|
||||
data: Form data
|
||||
json: JSON data
|
||||
files: Files to upload
|
||||
headers: Additional headers
|
||||
|
||||
Returns:
|
||||
httpx.Response object
|
||||
"""
|
||||
return self._request(
|
||||
"POST",
|
||||
url,
|
||||
data=data,
|
||||
json=json,
|
||||
files=files,
|
||||
headers=headers,
|
||||
)
|
||||
|
||||
def put(
|
||||
self,
|
||||
url: str,
|
||||
data: Optional[Any] = None,
|
||||
json: Optional[Dict] = None,
|
||||
content: Optional[Any] = None,
|
||||
files: Optional[Dict] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
) -> httpx.Response:
|
||||
"""
|
||||
Make a PUT request.
|
||||
|
||||
Args:
|
||||
url: Request URL
|
||||
data: Form data
|
||||
json: JSON data
|
||||
content: Raw content
|
||||
files: Files to upload
|
||||
headers: Additional headers
|
||||
|
||||
Returns:
|
||||
httpx.Response object
|
||||
"""
|
||||
return self._request(
|
||||
"PUT",
|
||||
url,
|
||||
data=data,
|
||||
json=json,
|
||||
content=content,
|
||||
files=files,
|
||||
headers=headers,
|
||||
)
|
||||
|
||||
def delete(
|
||||
self,
|
||||
url: str,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
) -> httpx.Response:
|
||||
"""
|
||||
Make a DELETE request.
|
||||
|
||||
Args:
|
||||
url: Request URL
|
||||
headers: Additional headers
|
||||
|
||||
Returns:
|
||||
httpx.Response object
|
||||
"""
|
||||
return self._request(
|
||||
"DELETE",
|
||||
url,
|
||||
headers=headers,
|
||||
)
|
||||
|
||||
def request(
|
||||
self,
|
||||
method: str,
|
||||
url: str,
|
||||
**kwargs
|
||||
) -> httpx.Response:
|
||||
"""
|
||||
Make a generic HTTP request.
|
||||
|
||||
Args:
|
||||
method: HTTP method
|
||||
url: Request URL
|
||||
**kwargs: Additional arguments
|
||||
|
||||
Returns:
|
||||
httpx.Response object
|
||||
"""
|
||||
return self._request(method, url, **kwargs)
|
||||
|
||||
def download(
|
||||
self,
|
||||
url: str,
|
||||
file_path: str,
|
||||
chunk_size: int = 8192,
|
||||
progress_callback: Optional[Callable[[int, int], None]] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
) -> Path:
|
||||
"""
|
||||
Download a file from URL with optional progress tracking.
|
||||
|
||||
Args:
|
||||
url: File URL
|
||||
file_path: Local file path to save to
|
||||
chunk_size: Download chunk size
|
||||
progress_callback: Callback(bytes_downloaded, total_bytes)
|
||||
headers: Additional headers
|
||||
|
||||
Returns:
|
||||
Path object of downloaded file
|
||||
"""
|
||||
path = Path(file_path)
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with self._request_stream("GET", url, headers=headers, follow_redirects=True) as response:
|
||||
response.raise_for_status()
|
||||
total_bytes = int(response.headers.get("content-length", 0))
|
||||
bytes_downloaded = 0
|
||||
|
||||
with open(path, "wb") as f:
|
||||
for chunk in response.iter_bytes(chunk_size):
|
||||
if chunk:
|
||||
f.write(chunk)
|
||||
bytes_downloaded += len(chunk)
|
||||
if progress_callback:
|
||||
progress_callback(bytes_downloaded, total_bytes)
|
||||
|
||||
return path
|
||||
|
||||
def _request(
|
||||
self,
|
||||
method: str,
|
||||
url: str,
|
||||
**kwargs
|
||||
) -> httpx.Response:
|
||||
"""
|
||||
Make an HTTP request with automatic retries.
|
||||
|
||||
Args:
|
||||
method: HTTP method
|
||||
url: Request URL
|
||||
**kwargs: Additional arguments for httpx.Client.request()
|
||||
|
||||
Returns:
|
||||
httpx.Response object
|
||||
"""
|
||||
if not self._client:
|
||||
raise RuntimeError("HTTPClient must be used with context manager (with statement)")
|
||||
|
||||
# Merge headers
|
||||
if "headers" in kwargs and kwargs["headers"]:
|
||||
headers = self._get_headers()
|
||||
headers.update(kwargs["headers"])
|
||||
kwargs["headers"] = headers
|
||||
else:
|
||||
kwargs["headers"] = self._get_headers()
|
||||
|
||||
last_exception = None
|
||||
|
||||
for attempt in range(self.retries):
|
||||
try:
|
||||
response = self._client.request(method, url, **kwargs)
|
||||
response.raise_for_status()
|
||||
return response
|
||||
except httpx.TimeoutException as e:
|
||||
last_exception = e
|
||||
logger.warning(f"Timeout on attempt {attempt + 1}/{self.retries}: {url}")
|
||||
if attempt < self.retries - 1:
|
||||
continue
|
||||
except httpx.HTTPStatusError as e:
|
||||
# Don't retry on 4xx errors
|
||||
if 400 <= e.response.status_code < 500:
|
||||
try:
|
||||
response_text = e.response.text[:500]
|
||||
except:
|
||||
response_text = "<unable to read response>"
|
||||
logger.error(f"HTTP {e.response.status_code} from {url}: {response_text}")
|
||||
raise
|
||||
last_exception = e
|
||||
try:
|
||||
response_text = e.response.text[:200]
|
||||
except:
|
||||
response_text = "<unable to read response>"
|
||||
logger.warning(f"HTTP {e.response.status_code} on attempt {attempt + 1}/{self.retries}: {url} - {response_text}")
|
||||
if attempt < self.retries - 1:
|
||||
continue
|
||||
except (httpx.RequestError, httpx.ConnectError) as e:
|
||||
last_exception = e
|
||||
logger.warning(f"Connection error on attempt {attempt + 1}/{self.retries}: {url} - {e}")
|
||||
if attempt < self.retries - 1:
|
||||
continue
|
||||
|
||||
if last_exception:
|
||||
logger.error(f"Request failed after {self.retries} attempts: {url} - {last_exception}")
|
||||
raise last_exception
|
||||
|
||||
raise RuntimeError("Request failed after retries")
|
||||
|
||||
def _request_stream(self, method: str, url: str, **kwargs):
|
||||
"""Make a streaming request."""
|
||||
if not self._client:
|
||||
raise RuntimeError("HTTPClient must be used with context manager (with statement)")
|
||||
|
||||
# Merge headers
|
||||
if "headers" in kwargs and kwargs["headers"]:
|
||||
headers = self._get_headers()
|
||||
headers.update(kwargs["headers"])
|
||||
kwargs["headers"] = headers
|
||||
else:
|
||||
kwargs["headers"] = self._get_headers()
|
||||
|
||||
return self._client.stream(method, url, **kwargs)
|
||||
|
||||
|
||||
class AsyncHTTPClient:
|
||||
"""Unified async HTTP client with asyncio support."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
timeout: float = DEFAULT_TIMEOUT,
|
||||
retries: int = DEFAULT_RETRIES,
|
||||
user_agent: str = DEFAULT_USER_AGENT,
|
||||
verify_ssl: bool = True,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
):
|
||||
"""
|
||||
Initialize async HTTP client.
|
||||
|
||||
Args:
|
||||
timeout: Request timeout in seconds
|
||||
retries: Number of retries on transient failures
|
||||
user_agent: User-Agent header value
|
||||
verify_ssl: Whether to verify SSL certificates
|
||||
headers: Additional headers to include in all requests
|
||||
"""
|
||||
self.timeout = timeout
|
||||
self.retries = retries
|
||||
self.user_agent = user_agent
|
||||
self.verify_ssl = verify_ssl
|
||||
self.base_headers = headers or {}
|
||||
self._client: Optional[httpx.AsyncClient] = None
|
||||
|
||||
async def __aenter__(self):
|
||||
"""Async context manager entry."""
|
||||
self._client = httpx.AsyncClient(
|
||||
timeout=self.timeout,
|
||||
verify=self.verify_ssl,
|
||||
headers=self._get_headers(),
|
||||
)
|
||||
return self
|
||||
|
||||
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
||||
"""Async context manager exit."""
|
||||
if self._client:
|
||||
await self._client.aclose()
|
||||
self._client = None
|
||||
|
||||
def _get_headers(self) -> Dict[str, str]:
|
||||
"""Get request headers with user-agent."""
|
||||
headers = {"User-Agent": self.user_agent}
|
||||
headers.update(self.base_headers)
|
||||
return headers
|
||||
|
||||
async def get(
|
||||
self,
|
||||
url: str,
|
||||
params: Optional[Dict[str, Any]] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
allow_redirects: bool = True,
|
||||
) -> httpx.Response:
|
||||
"""
|
||||
Make an async GET request.
|
||||
|
||||
Args:
|
||||
url: Request URL
|
||||
params: Query parameters
|
||||
headers: Additional headers
|
||||
allow_redirects: Follow redirects
|
||||
|
||||
Returns:
|
||||
httpx.Response object
|
||||
"""
|
||||
return await self._request(
|
||||
"GET",
|
||||
url,
|
||||
params=params,
|
||||
headers=headers,
|
||||
follow_redirects=allow_redirects,
|
||||
)
|
||||
|
||||
async def post(
|
||||
self,
|
||||
url: str,
|
||||
data: Optional[Any] = None,
|
||||
json: Optional[Dict] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
) -> httpx.Response:
|
||||
"""
|
||||
Make an async POST request.
|
||||
|
||||
Args:
|
||||
url: Request URL
|
||||
data: Form data
|
||||
json: JSON data
|
||||
headers: Additional headers
|
||||
|
||||
Returns:
|
||||
httpx.Response object
|
||||
"""
|
||||
return await self._request(
|
||||
"POST",
|
||||
url,
|
||||
data=data,
|
||||
json=json,
|
||||
headers=headers,
|
||||
)
|
||||
|
||||
async def download(
|
||||
self,
|
||||
url: str,
|
||||
file_path: str,
|
||||
chunk_size: int = 8192,
|
||||
progress_callback: Optional[Callable[[int, int], None]] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
) -> Path:
|
||||
"""
|
||||
Download a file from URL asynchronously with optional progress tracking.
|
||||
|
||||
Args:
|
||||
url: File URL
|
||||
file_path: Local file path to save to
|
||||
chunk_size: Download chunk size
|
||||
progress_callback: Callback(bytes_downloaded, total_bytes)
|
||||
headers: Additional headers
|
||||
|
||||
Returns:
|
||||
Path object of downloaded file
|
||||
"""
|
||||
path = Path(file_path)
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
async with self._request_stream("GET", url, headers=headers) as response:
|
||||
response.raise_for_status()
|
||||
total_bytes = int(response.headers.get("content-length", 0))
|
||||
bytes_downloaded = 0
|
||||
|
||||
with open(path, "wb") as f:
|
||||
async for chunk in response.aiter_bytes(chunk_size):
|
||||
if chunk:
|
||||
f.write(chunk)
|
||||
bytes_downloaded += len(chunk)
|
||||
if progress_callback:
|
||||
progress_callback(bytes_downloaded, total_bytes)
|
||||
|
||||
return path
|
||||
|
||||
async def _request(
|
||||
self,
|
||||
method: str,
|
||||
url: str,
|
||||
**kwargs
|
||||
) -> httpx.Response:
|
||||
"""
|
||||
Make an async HTTP request with automatic retries.
|
||||
|
||||
Args:
|
||||
method: HTTP method
|
||||
url: Request URL
|
||||
**kwargs: Additional arguments for httpx.AsyncClient.request()
|
||||
|
||||
Returns:
|
||||
httpx.Response object
|
||||
"""
|
||||
if not self._client:
|
||||
raise RuntimeError("AsyncHTTPClient must be used with async context manager")
|
||||
|
||||
# Merge headers
|
||||
if "headers" in kwargs and kwargs["headers"]:
|
||||
headers = self._get_headers()
|
||||
headers.update(kwargs["headers"])
|
||||
kwargs["headers"] = headers
|
||||
else:
|
||||
kwargs["headers"] = self._get_headers()
|
||||
|
||||
last_exception = None
|
||||
|
||||
for attempt in range(self.retries):
|
||||
try:
|
||||
response = await self._client.request(method, url, **kwargs)
|
||||
response.raise_for_status()
|
||||
return response
|
||||
except httpx.TimeoutException as e:
|
||||
last_exception = e
|
||||
logger.warning(f"Timeout on attempt {attempt + 1}/{self.retries}: {url}")
|
||||
if attempt < self.retries - 1:
|
||||
await asyncio.sleep(0.5) # Brief delay before retry
|
||||
continue
|
||||
except httpx.HTTPStatusError as e:
|
||||
# Don't retry on 4xx errors
|
||||
if 400 <= e.response.status_code < 500:
|
||||
try:
|
||||
response_text = e.response.text[:500]
|
||||
except:
|
||||
response_text = "<unable to read response>"
|
||||
logger.error(f"HTTP {e.response.status_code} from {url}: {response_text}")
|
||||
raise
|
||||
last_exception = e
|
||||
try:
|
||||
response_text = e.response.text[:200]
|
||||
except:
|
||||
response_text = "<unable to read response>"
|
||||
logger.warning(f"HTTP {e.response.status_code} on attempt {attempt + 1}/{self.retries}: {url} - {response_text}")
|
||||
if attempt < self.retries - 1:
|
||||
await asyncio.sleep(0.5)
|
||||
continue
|
||||
except (httpx.RequestError, httpx.ConnectError) as e:
|
||||
last_exception = e
|
||||
logger.warning(f"Connection error on attempt {attempt + 1}/{self.retries}: {url} - {e}")
|
||||
if attempt < self.retries - 1:
|
||||
await asyncio.sleep(0.5)
|
||||
continue
|
||||
|
||||
if last_exception:
|
||||
logger.error(f"Request failed after {self.retries} attempts: {url} - {last_exception}")
|
||||
raise last_exception
|
||||
|
||||
raise RuntimeError("Request failed after retries")
|
||||
|
||||
def _request_stream(self, method: str, url: str, **kwargs):
|
||||
"""Make a streaming request."""
|
||||
if not self._client:
|
||||
raise RuntimeError("AsyncHTTPClient must be used with async context manager")
|
||||
|
||||
# Merge headers
|
||||
if "headers" in kwargs and kwargs["headers"]:
|
||||
headers = self._get_headers()
|
||||
headers.update(kwargs["headers"])
|
||||
kwargs["headers"] = headers
|
||||
else:
|
||||
kwargs["headers"] = self._get_headers()
|
||||
|
||||
return self._client.stream(method, url, **kwargs)
|
||||
|
||||
|
||||
# Convenience function for quick sync requests
|
||||
def get(url: str, **kwargs) -> httpx.Response:
|
||||
"""Quick GET request without context manager."""
|
||||
with HTTPClient() as client:
|
||||
return client.get(url, **kwargs)
|
||||
|
||||
|
||||
def post(url: str, **kwargs) -> httpx.Response:
|
||||
"""Quick POST request without context manager."""
|
||||
with HTTPClient() as client:
|
||||
return client.post(url, **kwargs)
|
||||
|
||||
|
||||
def download(
|
||||
url: str,
|
||||
file_path: str,
|
||||
progress_callback: Optional[Callable[[int, int], None]] = None,
|
||||
**kwargs
|
||||
) -> Path:
|
||||
"""Quick file download without context manager."""
|
||||
with HTTPClient() as client:
|
||||
return client.download(url, file_path, progress_callback=progress_callback, **kwargs)
|
||||
1553
helper/hydrus.py
Normal file
1553
helper/hydrus.py
Normal file
File diff suppressed because it is too large
Load Diff
377
helper/libgen_service.py
Normal file
377
helper/libgen_service.py
Normal file
@@ -0,0 +1,377 @@
|
||||
"""Shared Library Genesis search and download helpers."""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, Iterable, List, Optional
|
||||
import logging
|
||||
import requests
|
||||
from urllib.parse import quote, urljoin
|
||||
|
||||
from libgen import search_sync, LibgenError
|
||||
|
||||
LogFn = Optional[Callable[[str], None]]
|
||||
ErrorFn = Optional[Callable[[str], None]]
|
||||
|
||||
DEFAULT_TIMEOUT = 10.0
|
||||
DEFAULT_LIMIT = 50
|
||||
|
||||
logging.getLogger(__name__).setLevel(logging.WARNING)
|
||||
|
||||
|
||||
def _call(logger: LogFn, message: str) -> None:
|
||||
if logger:
|
||||
logger(message)
|
||||
|
||||
|
||||
def search_libgen_no_ads(query: str, session: Optional[requests.Session] = None) -> List[Dict[str, Any]]:
|
||||
"""Search Libgen without triggering ads.php requests."""
|
||||
try:
|
||||
from bs4 import BeautifulSoup
|
||||
except ImportError: # pragma: no cover
|
||||
logging.warning("BeautifulSoup not available; falling back to standard search")
|
||||
return []
|
||||
|
||||
mirrors = [
|
||||
"https://libgen.gl",
|
||||
"https://libgen.vg",
|
||||
"https://libgen.la",
|
||||
"https://libgen.bz",
|
||||
"https://libgen.gs",
|
||||
]
|
||||
|
||||
session = session or requests.Session()
|
||||
session.headers.setdefault(
|
||||
"User-Agent",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
|
||||
)
|
||||
|
||||
for mirror in mirrors:
|
||||
try:
|
||||
search_url = f"{mirror}/index.php?req={quote(query)}&res=100&covers=on&filesuns=all"
|
||||
response = session.get(search_url, timeout=DEFAULT_TIMEOUT)
|
||||
if response.status_code != 200:
|
||||
continue
|
||||
|
||||
soup = BeautifulSoup(response.content, "html.parser")
|
||||
table = soup.find("table", {"class": "catalog"})
|
||||
if table is None:
|
||||
for candidate in soup.find_all("table"):
|
||||
rows = candidate.find_all("tr")
|
||||
if len(rows) > 2:
|
||||
table = candidate
|
||||
break
|
||||
if table is None:
|
||||
logging.debug("[libgen_no_ads] No results table on %s", mirror)
|
||||
continue
|
||||
|
||||
rows = table.find_all("tr")[1:]
|
||||
results: List[Dict[str, Any]] = []
|
||||
for row in rows:
|
||||
try:
|
||||
cells = row.find_all("td")
|
||||
if len(cells) < 9:
|
||||
continue
|
||||
|
||||
size_cell = cells[7]
|
||||
file_link = size_cell.find("a")
|
||||
mirror_link = ""
|
||||
if file_link:
|
||||
href = str(file_link.get("href", ""))
|
||||
if href.startswith("/"):
|
||||
mirror_link = mirror + href
|
||||
elif href:
|
||||
mirror_link = urljoin(mirror, href)
|
||||
|
||||
if not mirror_link:
|
||||
title_link = cells[1].find("a") if len(cells) > 1 else None
|
||||
if title_link:
|
||||
href = str(title_link.get("href", ""))
|
||||
if href.startswith("/"):
|
||||
mirror_link = mirror + href
|
||||
elif href:
|
||||
mirror_link = urljoin(mirror, href)
|
||||
|
||||
if not mirror_link:
|
||||
continue
|
||||
|
||||
results.append(
|
||||
{
|
||||
"id": "",
|
||||
"mirror": mirror_link,
|
||||
"cover": "",
|
||||
"title": cells[1].get_text(strip=True) if len(cells) > 1 else "Unknown",
|
||||
"authors": [cells[2].get_text(strip=True)]
|
||||
if len(cells) > 2
|
||||
else ["Unknown"],
|
||||
"publisher": cells[3].get_text(strip=True) if len(cells) > 3 else "",
|
||||
"year": cells[4].get_text(strip=True) if len(cells) > 4 else "",
|
||||
"pages": cells[6].get_text(strip=True) if len(cells) > 6 else "",
|
||||
"language": cells[5].get_text(strip=True) if len(cells) > 5 else "",
|
||||
"size": cells[7].get_text(strip=True) if len(cells) > 7 else "",
|
||||
"extension": cells[8].get_text(strip=True) if len(cells) > 8 else "",
|
||||
"isbn": "",
|
||||
}
|
||||
)
|
||||
except Exception as exc: # pragma: no cover - defensive
|
||||
logging.debug("[libgen_no_ads] Error parsing row: %s", exc)
|
||||
continue
|
||||
|
||||
if results:
|
||||
logging.info("[libgen_no_ads] %d results from %s", len(results), mirror)
|
||||
return results
|
||||
except Exception as exc: # pragma: no cover - mirror issues
|
||||
logging.debug("[libgen_no_ads] Mirror %s failed: %s", mirror, exc)
|
||||
continue
|
||||
|
||||
return []
|
||||
|
||||
|
||||
def format_book_info(book: Any) -> Dict[str, Any]:
|
||||
"""Format Libgen search result into a consistent dictionary."""
|
||||
filesize_bytes = 0
|
||||
size_str = getattr(book, "size", "") or ""
|
||||
if size_str:
|
||||
parts = size_str.strip().split()
|
||||
try:
|
||||
value = float(parts[0])
|
||||
unit = parts[1].upper() if len(parts) > 1 else "B"
|
||||
if unit in {"MB", "M"}:
|
||||
filesize_bytes = int(value * 1024 * 1024)
|
||||
elif unit in {"GB", "G"}:
|
||||
filesize_bytes = int(value * 1024 * 1024 * 1024)
|
||||
elif unit in {"KB", "K"}:
|
||||
filesize_bytes = int(value * 1024)
|
||||
else:
|
||||
filesize_bytes = int(value)
|
||||
except (ValueError, IndexError): # pragma: no cover - defensive
|
||||
filesize_bytes = 0
|
||||
|
||||
title = getattr(book, "title", "") or ""
|
||||
isbn = getattr(book, "isbn", "") or ""
|
||||
if not isbn and title:
|
||||
import re
|
||||
|
||||
match = re.search(
|
||||
r"((?:[\d]{10,13}(?:\s*[;,]\s*[\d]{10,13})+)|(?:[\d]{10,13})(?:\s*[;,]?\s*[\d\-]{0,50})?)\s*(?:\b|$)",
|
||||
title,
|
||||
)
|
||||
if match:
|
||||
potential_isbn = match.group(0).strip()
|
||||
if re.search(r"\d{10,13}", potential_isbn):
|
||||
isbn = potential_isbn
|
||||
title = re.sub(r"\s+[a-z]\s*$", "", title[: match.start()].strip(), flags=re.IGNORECASE)
|
||||
|
||||
authors_value = getattr(book, "authors", None)
|
||||
if isinstance(authors_value, Iterable) and not isinstance(authors_value, str):
|
||||
authors_str = ", ".join(str(author) for author in authors_value)
|
||||
else:
|
||||
authors_str = str(authors_value or "Unknown")
|
||||
|
||||
download_links = getattr(book, "download_links", None)
|
||||
mirror_url = None
|
||||
if download_links and getattr(download_links, "get_link", None):
|
||||
mirror_url = download_links.get_link
|
||||
|
||||
return {
|
||||
"title": title or "Unknown",
|
||||
"author": authors_str,
|
||||
"publisher": getattr(book, "publisher", "") or "",
|
||||
"year": getattr(book, "year", "") or "",
|
||||
"pages": getattr(book, "pages", "") or "",
|
||||
"language": getattr(book, "language", "") or "",
|
||||
"filesize": filesize_bytes,
|
||||
"filesize_str": size_str or "Unknown",
|
||||
"extension": getattr(book, "extension", "") or "",
|
||||
"isbn": isbn,
|
||||
"mirror_url": mirror_url,
|
||||
}
|
||||
|
||||
|
||||
def search_libgen(
|
||||
query: str,
|
||||
limit: int = DEFAULT_LIMIT,
|
||||
*,
|
||||
log_info: LogFn = None,
|
||||
log_error: ErrorFn = None,
|
||||
session: Optional[requests.Session] = None,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Search Libgen returning formatted dictionaries with multiple mirrors.
|
||||
|
||||
Uses HTML scraper (search_libgen_no_ads) to find books quickly.
|
||||
Returns mirror URLs and book IDs that can be used to generate alternative mirrors.
|
||||
"""
|
||||
try:
|
||||
_call(log_info, f"[search] Searching Libgen for: {query}")
|
||||
session = session or requests.Session()
|
||||
|
||||
# Use HTML scraper - more reliable and doesn't hang on mirror resolution
|
||||
        _call(log_info, "[search] Using HTML scraper (search_libgen_no_ads)...")
        results: List[Any] = search_libgen_no_ads(query, session=session)

        if not results:
            _call(log_info, "[search] No results from HTML scraper")
            return []

        formatted: List[Dict[str, Any]] = []
        mirrors_list = [
            "https://libgen.gl",
            "https://libgen.vg",
            "https://libgen.la",
            "https://libgen.bz",
            "https://libgen.gs",
        ]

        for book in results[:limit]:
            if isinstance(book, dict):
                # Result from search_libgen_no_ads (HTML scraper)
                authors = book.get("authors", ["Unknown"])
                if isinstance(authors, list):
                    author_value = ", ".join(str(a) for a in authors)
                else:
                    author_value = str(authors)

                # Extract book ID from mirror URL if available
                mirror = book.get("mirror", "")
                book_id = ""
                if mirror and "/file.php?id=" in mirror:
                    try:
                        book_id = mirror.split("/file.php?id=")[1].split("&")[0]
                    except (IndexError, ValueError):
                        pass

                # Build list of alternative mirrors based on book ID
                mirrors_dict = {}
                if book_id:
                    for mirror_base in mirrors_list:
                        mirrors_dict[mirror_base] = f"{mirror_base}/file.php?id={book_id}"
                elif mirror:
                    # Fallback: use the mirror we found
                    mirrors_dict["primary"] = mirror

                formatted.append(
                    {
                        "title": book.get("title", "Unknown"),
                        "author": author_value,
                        "publisher": book.get("publisher", ""),
                        "year": book.get("year", ""),
                        "pages": book.get("pages", ""),
                        "language": book.get("language", ""),
                        "filesize": 0,
                        "filesize_str": book.get("size", "Unknown"),
                        "extension": book.get("extension", ""),
                        "isbn": book.get("isbn", ""),
                        "mirror_url": mirror,  # Primary mirror
                        "mirrors": mirrors_dict,  # Alternative mirrors
                        "book_id": book_id,
                    }
                )
            else:
                # Fallback: try to format as book object
                try:
                    formatted.append(format_book_info(book))
                except Exception:
                    pass

        _call(log_info, f"[search] Found {len(formatted)} result(s)")
        return formatted
    except LibgenError as exc:
        _call(log_error, f"[search] Libgen error: {exc}")
        return []
    except Exception as exc:  # pragma: no cover - defensive
        _call(log_error, f"[search] Error: {exc}")
        return []


def download_from_mirror(
    mirror_url: str,
    output_path: str | Path,
    *,
    log_info: LogFn = None,
    log_error: ErrorFn = None,
    session: Optional[requests.Session] = None,
) -> bool:
    """Download a Libgen file and write it to disk.

    Handles Libgen redirects and ensures a proper file download by:
    - Following all redirects (default behaviour)
    - Setting a User-Agent header (required by some mirrors)
    - Validating that we received binary content, not HTML
    - Attempting an alternative download method if HTML is returned
    """
    session = session or requests.Session()
    try:
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        _call(log_info, f"[download] Downloading from mirror: {mirror_url}")

        # Ensure session has proper headers for Libgen
        if 'User-Agent' not in session.headers:
            session.headers['User-Agent'] = (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            )

        # Download with redirects enabled (default) and referer
        session.headers['Referer'] = 'https://libgen.gs/'
        response = session.get(mirror_url, stream=True, timeout=30, allow_redirects=True)
        response.raise_for_status()

        # Check if we got HTML instead of a file (common Libgen issue)
        content_type = response.headers.get('content-type', '').lower()
        if 'text/html' in content_type:
            _call(log_error, "[download] Server returned HTML. Trying alternative method...")

            # Try to extract a usable download link from the returned page
            try:
                # Parse the HTML to extract MD5 or file ID
                from bs4 import BeautifulSoup
                soup = BeautifulSoup(response.text, 'html.parser')

                # Look for a download link in the HTML.
                # Common patterns: md5 hash in a form, or a direct link in anchor tags.
                download_link = None

                # Try to find forms that might contain download functionality
                forms = soup.find_all('form')
                for form in forms:
                    action = form.get('action', '')
                    if 'download' in action.lower() or 'get' in action.lower():
                        download_link = action
                        break

                if not download_link:
                    _call(log_error, "[download] Could not extract alternative download link from HTML")
                    return False

                _call(log_info, f"[download] Using alternative download method: {download_link[:100]}")
                # Try downloading from the alternative link
                response2 = session.get(download_link, stream=True, timeout=30, allow_redirects=True)
                response2.raise_for_status()
                response = response2  # Use the new response

            except Exception as alt_error:
                _call(log_error, f"[download] Alternative method failed: {alt_error}")
                return False

        total_size = int(response.headers.get("content-length", 0))
        downloaded = 0

        with open(output_path, "wb") as handle:
            for chunk in response.iter_content(chunk_size=8192):
                if not chunk:
                    continue
                handle.write(chunk)
                downloaded += len(chunk)
                if total_size > 0:
                    percent = downloaded / total_size * 100
                    _call(
                        log_info,
                        f"[download] {percent:.1f}% - {downloaded // (1024*1024)}MB / {total_size // (1024*1024)}MB",
                    )

        _call(log_info, f"[download] Downloaded successfully to: {output_path}")
        return True
    except Exception as exc:  # pragma: no cover - defensive
        _call(log_error, f"[download] Error: {exc}")
        return False
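For context, a minimal sketch of how these two helpers are meant to compose: try the primary mirror first, then the alternates built from the book id. It assumes the search entry point earlier in this module is search_libgen(query, limit=...) as imported elsewhere in this commit; the query string and output directory are illustrative only.

from pathlib import Path

from helper.libgen_service import download_from_mirror, search_libgen

for book in search_libgen("Morals and Dogma", limit=5):
    # mirror_url is the scraped link; mirrors holds the per-domain alternates keyed by base URL
    candidates = [book["mirror_url"], *book["mirrors"].values()]
    target = Path("downloads") / f"{book['book_id'] or 'book'}.{book['extension'] or 'pdf'}"
    if any(download_from_mirror(url, target) for url in candidates if url):
        break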
1395
helper/local_library.py
Normal file
1395
helper/local_library.py
Normal file
File diff suppressed because it is too large
70
helper/logger.py
Normal file
70
helper/logger.py
Normal file
@@ -0,0 +1,70 @@
"""Unified logging utility for automatic file and function name tracking."""

import sys
import inspect
from pathlib import Path

_DEBUG_ENABLED = False


def set_debug(enabled: bool) -> None:
    """Enable or disable debug logging."""
    global _DEBUG_ENABLED
    _DEBUG_ENABLED = enabled


def debug(*args, **kwargs) -> None:
    """Print debug message if debug logging is enabled.

    Automatically prepends [filename.function_name] to all output.
    """
    if not _DEBUG_ENABLED:
        return

    # Set default to stderr for debug messages
    if 'file' not in kwargs:
        kwargs['file'] = sys.stderr

    # Prepend DEBUG label
    args = ("DEBUG:", *args)

    # Use the same logic as log()
    log(*args, **kwargs)


def log(*args, **kwargs) -> None:
    """Print with automatic file.function prefix.

    Automatically prepends [filename.function_name] to all output.
    Defaults to stdout if not specified.

    Example:
        log("Upload started")  # Output: [add_file.run] Upload started
    """
    # Get the calling frame
    frame = inspect.currentframe()
    if frame is None:
        print(*args, **kwargs)
        return

    caller_frame = frame.f_back
    if caller_frame is None:
        print(*args, **kwargs)
        return

    try:
        # Get file name without extension
        file_name = Path(caller_frame.f_code.co_filename).stem

        # Get function name
        func_name = caller_frame.f_code.co_name

        # Set default to stdout if not specified
        if 'file' not in kwargs:
            kwargs['file'] = sys.stdout

        # Build prefix
        prefix = f"[{file_name}.{func_name}]"

        # Print with prefix
        print(prefix, *args, **kwargs)
    finally:
        del frame
        del caller_frame
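A short usage sketch (the calling module and function names are hypothetical). The prefix comes from the immediate caller's frame, so log() reports the calling module and function, while debug() delegates to log() and therefore shows up with a [logger.debug] prefix.

from helper.logger import set_debug, log, debug

set_debug(True)  # debug() is a no-op until this is called

def run_upload() -> None:
    log("upload started")       # e.g. "[add_file.run_upload] upload started" when defined in add_file.py
    debug("raw response", 200)  # e.g. "[logger.debug] DEBUG: raw response 200", written to stderr

run_upload()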
951
helper/mpv_file.py
Normal file
951
helper/mpv_file.py
Normal file
@@ -0,0 +1,951 @@
|
||||
"""MPV file metadata aggregation helpers."""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterable, List, Optional, Sequence
|
||||
from urllib.parse import parse_qs, urlparse, unquote
|
||||
|
||||
from config import get_hydrus_url
|
||||
from helper.utils import sha256_file, unique_preserve_order
|
||||
from helper.hydrus import HydrusClient, HydrusRequestError
|
||||
|
||||
import metadata
|
||||
|
||||
|
||||
class MPVFileError(RuntimeError):
|
||||
"""Raised when we cannot construct an MPV file snapshot."""
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class DebridMagnet:
|
||||
"""Represents a magnet result from AllDebrid search.
|
||||
|
||||
This class matches the structure expected by the TUI (like Hydrus results)
|
||||
with title, target, media_kind attributes for compatibility.
|
||||
"""
|
||||
magnet_id: str
|
||||
title: str
|
||||
size: int
|
||||
status_code: int
|
||||
status_text: str
|
||||
progress: float
|
||||
downloaded: int
|
||||
seeders: int
|
||||
dl_speed: int
|
||||
tag_summary: Optional[str] = None
|
||||
metadata: Optional[Dict[str, Any]] = None # Complete magnet file metadata from AllDebrid API
|
||||
|
||||
@property
|
||||
def target(self) -> str:
|
||||
"""Return the target URI for this magnet (used by TUI for access operations)."""
|
||||
return f"alldebrid://{self.magnet_id}"
|
||||
|
||||
@property
|
||||
def media_kind(self) -> str:
|
||||
"""Return media kind for display."""
|
||||
return "magnet"
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
"""Convert to dictionary for metadata display."""
|
||||
return {
|
||||
"magnet_id": self.magnet_id,
|
||||
"title": self.title,
|
||||
"size": self.size,
|
||||
"status_code": self.status_code,
|
||||
"status_text": self.status_text,
|
||||
"progress": f"{self.progress:.1f}%",
|
||||
"downloaded": self.downloaded,
|
||||
"seeders": self.seeders,
|
||||
"dl_speed": self.dl_speed,
|
||||
}
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class HydrusSettings:
|
||||
base_url: Optional[str]
|
||||
access_key: Optional[str]
|
||||
timeout: float
|
||||
prefer_service_name: Optional[str]
|
||||
include_relationships: bool
|
||||
|
||||
def as_metadata_options(self) -> Dict[str, Any]:
|
||||
options: Dict[str, Any] = {
|
||||
"timeout": self.timeout,
|
||||
"include_relationships": self.include_relationships,
|
||||
}
|
||||
if self.prefer_service_name:
|
||||
options["prefer_service_name"] = self.prefer_service_name
|
||||
return options
|
||||
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class MPVfile:
|
||||
path: Optional[str] = None
|
||||
filename: Optional[str] = None
|
||||
type: str = "unknown"
|
||||
hash: Optional[str] = None
|
||||
local_path: Optional[str] = None
|
||||
mpv_metadata: Dict[str, Any] = field(default_factory=dict)
|
||||
metadata: Dict[str, Any] = field(default_factory=dict)
|
||||
remote_metadata: Optional[Dict[str, Any]] = None
|
||||
relationships: Optional[Dict[str, Any]] = None
|
||||
relationship_metadata: Dict[str, Any] = field(default_factory=dict)
|
||||
tags: List[str] = field(default_factory=list)
|
||||
original_tags: Dict[str, str] = field(default_factory=dict)
|
||||
known_urls: List[str] = field(default_factory=list)
|
||||
title: Optional[str] = None
|
||||
source_url: Optional[str] = None
|
||||
clip_time: Optional[str] = None
|
||||
duration: Optional[float] = None
|
||||
filesize_mb: Optional[float] = None
|
||||
is_video: bool = False
|
||||
is_audio: bool = False
|
||||
is_deleted: Optional[bool] = None
|
||||
is_local: Optional[bool] = None
|
||||
has_current_file_service: Optional[bool] = None
|
||||
tag_service_key: Optional[str] = None
|
||||
swap_recommended: bool = False
|
||||
warnings: List[str] = field(default_factory=list)
|
||||
# New relationship fields for menu
|
||||
king: Optional[str] = None
|
||||
alts: List[str] = field(default_factory=list)
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
payload: Dict[str, Any] = {
|
||||
"path": self.path,
|
||||
"filename": self.filename,
|
||||
"type": self.type,
|
||||
"hash": self.hash,
|
||||
"local_path": self.local_path,
|
||||
"mpv_metadata": self.mpv_metadata,
|
||||
"metadata": self.metadata,
|
||||
"remote_metadata": self.remote_metadata,
|
||||
"relationships": self.relationships,
|
||||
"relationship_metadata": self.relationship_metadata,
|
||||
"tags": self.tags,
|
||||
"original_tags": self.original_tags,
|
||||
"known_urls": self.known_urls,
|
||||
"title": self.title,
|
||||
"source_url": self.source_url,
|
||||
"clip_time": self.clip_time,
|
||||
"duration": self.duration,
|
||||
"filesize_mb": self.filesize_mb,
|
||||
"is_video": self.is_video,
|
||||
"is_audio": self.is_audio,
|
||||
"is_deleted": self.is_deleted,
|
||||
"is_local": self.is_local,
|
||||
"has_current_file_service": self.has_current_file_service,
|
||||
"tag_service_key": self.tag_service_key,
|
||||
"swap_recommended": self.swap_recommended,
|
||||
"warnings": self.warnings,
|
||||
# relationship summary fields for easier Lua consumption
|
||||
"king": self.king,
|
||||
"alts": self.alts,
|
||||
}
|
||||
# Remove empty optional values for terser payloads.
|
||||
for key in list(payload.keys()):
|
||||
value = payload[key]
|
||||
if value in (None, [], {}, ""):
|
||||
del payload[key]
|
||||
return payload
|
||||
|
||||
|
||||
def _normalise_string_list(values: Optional[Iterable[Any]]) -> List[str]:
|
||||
if not values:
|
||||
return []
|
||||
seen: set[str] = set()
|
||||
result: List[str] = []
|
||||
for value in values:
|
||||
if value is None:
|
||||
continue
|
||||
text = str(value).strip()
|
||||
if not text or text in seen:
|
||||
continue
|
||||
seen.add(text)
|
||||
result.append(text)
|
||||
return result
|
||||
|
||||
|
||||
def _looks_like_hash(value: Optional[str]) -> bool:
|
||||
if not value:
|
||||
return False
|
||||
candidate = value.strip().lower()
|
||||
return len(candidate) == 64 and all(ch in "0123456789abcdef" for ch in candidate)
|
||||
|
||||
|
||||
class MPVFileBuilder:
|
||||
def __init__(self, payload: Dict[str, Any], config: Dict[str, Any]):
|
||||
self.payload = payload or {}
|
||||
self.config = config or {}
|
||||
self.state = MPVfile()
|
||||
self.hydrus_settings = self._resolve_hydrus_settings()
|
||||
self.remote_options = self._resolve_remote_options()
|
||||
self.include_relationships = bool(self.payload.get("include_relationships", True))
|
||||
self.last_url = self._normalise_url(self.payload.get("last_url"))
|
||||
self._initialise_identity()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# public API
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def build(self) -> Dict[str, Any]:
|
||||
if self.state.type == "hydrus":
|
||||
self._populate_hydrus_by_hash()
|
||||
elif self.state.type == "local":
|
||||
self._populate_local()
|
||||
elif self.state.type == "remote":
|
||||
self._populate_remote()
|
||||
else:
|
||||
# Attempt best effort resolution even for unknown types.
|
||||
self._populate_local(best_effort=True)
|
||||
self._finalise()
|
||||
result = self.state.to_dict()
|
||||
# Append King and Alts info to mpv_metadata for info menu
|
||||
king = self.state.king
|
||||
alts = self.state.alts
|
||||
if king:
|
||||
result.setdefault("mpv_metadata", {})["King"] = king
|
||||
if alts:
|
||||
result.setdefault("mpv_metadata", {})["Alts"] = ", ".join(alts)
|
||||
return result
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# configuration helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _resolve_hydrus_settings(self) -> HydrusSettings:
|
||||
overrides = self.payload.get("hydrus")
|
||||
overrides = overrides if isinstance(overrides, dict) else {}
|
||||
base_url = overrides.get("url") or overrides.get("base_url")
|
||||
access_key = overrides.get("access_key")
|
||||
timeout_raw = overrides.get("timeout") or overrides.get("hydrus_timeout")
|
||||
prefer_service = overrides.get("prefer_service_name")
|
||||
include_relationships = overrides.get("include_relationships")
|
||||
if base_url is None:
|
||||
base_url = get_hydrus_url(self.config)
|
||||
if access_key is None:
|
||||
raw_key = self.config.get("HydrusNetwork_Access_Key")
|
||||
access_key = str(raw_key) if raw_key is not None else None
|
||||
if timeout_raw is None:
|
||||
timeout_raw = self.config.get("HydrusNetwork_Request_Timeout")
|
||||
try:
|
||||
timeout = float(timeout_raw) if timeout_raw is not None else 60.0
|
||||
except (TypeError, ValueError):
|
||||
timeout = 60.0
|
||||
if prefer_service is None:
|
||||
prefer_service = self.config.get("Hydrus_Tag_Service")
|
||||
if isinstance(prefer_service, str):
|
||||
prefer_service = prefer_service.strip() or None
|
||||
if include_relationships is None:
|
||||
include_relationships = self.payload.get("include_relationships")
|
||||
include_relationships = bool(True if include_relationships is None else include_relationships)
|
||||
base_url = base_url.strip() if isinstance(base_url, str) else None
|
||||
access_key = access_key.strip() if isinstance(access_key, str) else None
|
||||
return HydrusSettings(
|
||||
base_url=base_url or None,
|
||||
access_key=access_key or None,
|
||||
timeout=timeout,
|
||||
prefer_service_name=prefer_service,
|
||||
include_relationships=include_relationships,
|
||||
)
|
||||
|
||||
def _resolve_remote_options(self) -> Dict[str, Any]:
|
||||
remote_payload = self.payload.get("remote")
|
||||
remote_payload = remote_payload if isinstance(remote_payload, dict) else {}
|
||||
options = remote_payload.get("options")
|
||||
options = options if isinstance(options, dict) else {}
|
||||
ytdlp_args = options.get("ytdlp_args")
|
||||
if not ytdlp_args:
|
||||
options["ytdlp_args"] = ["--no-playlist", "--skip-download", "--no-warnings"]
|
||||
existing_timeout = options.get("timeout")
|
||||
if existing_timeout is None:
|
||||
options["timeout"] = min(90.0, max(10.0, float(self.payload.get("remote_timeout") or 45.0)))
|
||||
return options
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# initialisation
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _initialise_identity(self) -> None:
|
||||
s = self.state
|
||||
p = self.payload
|
||||
|
||||
def _str_or_none(v):
|
||||
return str(v) if v is not None and v != "" else None
|
||||
|
||||
def _copy_dict_if_dict(v):
|
||||
return dict(v) if isinstance(v, dict) else {}
|
||||
|
||||
# path and filename
|
||||
s.path = _str_or_none(p.get("path"))
|
||||
s.filename = _str_or_none(p.get("filename"))
|
||||
|
||||
# mpv metadata
|
||||
s.mpv_metadata = _copy_dict_if_dict(p.get("mpv_metadata"))
|
||||
|
||||
# tags (support both "tags" and legacy "existing_tags")
|
||||
existing_tags = p.get("tags") or p.get("existing_tags")
|
||||
s.tags = _normalise_string_list(existing_tags)
|
||||
if s.tags:
|
||||
s.original_tags = {tag: tag for tag in s.tags}
|
||||
|
||||
# known URLs + last_url
|
||||
s.known_urls = _normalise_string_list(p.get("known_urls"))
|
||||
if self.last_url and self.last_url not in s.known_urls:
|
||||
s.known_urls.append(self.last_url)
|
||||
|
||||
# source URL (explicit or fallback to last_url)
|
||||
explicit_source = p.get("source_url")
|
||||
s.source_url = self._normalise_url(explicit_source) or self.last_url
|
||||
|
||||
# hash (validate looks-like-hash)
|
||||
hash_candidate = p.get("hash")
|
||||
if isinstance(hash_candidate, str):
|
||||
candidate = hash_candidate.strip().lower()
|
||||
if _looks_like_hash(candidate):
|
||||
s.hash = candidate
|
||||
|
||||
# local_path (non-empty string)
|
||||
local_path_override = p.get("local_path")
|
||||
if isinstance(local_path_override, str):
|
||||
lp = local_path_override.strip()
|
||||
if lp:
|
||||
s.local_path = lp
|
||||
|
||||
# derive remaining fields from path/filename/type
|
||||
self._derive_filename_from_path()
|
||||
self._determine_type()
|
||||
|
||||
|
||||
def _derive_filename_from_path(self) -> None:
|
||||
if self.state.filename or not self.state.path:
|
||||
return
|
||||
parsed = urlparse(self.state.path)
|
||||
if parsed.scheme in ("http", "https", "ytdl") and parsed.path:
|
||||
candidate = Path(parsed.path).name
|
||||
if candidate:
|
||||
self.state.filename = candidate
|
||||
elif parsed.scheme == "file":
|
||||
decoded = self._decode_file_url(self.state.path)
|
||||
if decoded:
|
||||
self.state.filename = Path(decoded).name
|
||||
else:
|
||||
try:
|
||||
self.state.filename = Path(self.state.path).name
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _determine_type(self) -> None:
|
||||
s = self.state
|
||||
p = self.payload
|
||||
|
||||
def _set_local_from_path(pth: str | None):
|
||||
if not pth:
|
||||
return
|
||||
# Prefer resolved local path when available
|
||||
resolved = self._resolve_local_path(pth)
|
||||
s.local_path = resolved if resolved else pth
|
||||
s.type = "local"
|
||||
|
||||
# 1) Respect explicit type when valid
|
||||
explicit = p.get("type")
|
||||
if isinstance(explicit, str):
|
||||
lowered = explicit.strip().lower()
|
||||
if lowered in {"local", "hydrus", "remote"}:
|
||||
s.type = lowered
|
||||
if lowered == "local":
|
||||
s.local_path = self._resolve_local_path(s.path)
|
||||
return
|
||||
|
||||
# 2) Work from path
|
||||
path = s.path or ""
|
||||
if not path:
|
||||
s.type = "unknown"
|
||||
return
|
||||
|
||||
# 3) Hydrus-specific quick checks
|
||||
if self._looks_like_hydrus_url(path):
|
||||
s.type = "hydrus"
|
||||
return
|
||||
|
||||
parsed = urlparse(path)
|
||||
scheme = (parsed.scheme or "").lower()
|
||||
|
||||
# 4) scheme-based handling
|
||||
if scheme == "hydrus":
|
||||
s.type = "hydrus"
|
||||
return
|
||||
|
||||
if scheme in {"http", "https", "rtmp", "rtsp", "magnet", "ytdl"}:
|
||||
s.type = "hydrus" if self._looks_like_hydrus_url(path) else "remote"
|
||||
return
|
||||
|
||||
if scheme == "file":
|
||||
decoded = self._decode_file_url(path)
|
||||
if decoded:
|
||||
s.local_path = decoded
|
||||
s.type = "local"
|
||||
return
|
||||
|
||||
# 5) Windows/UNC absolute paths
|
||||
if re.match(r"^[A-Za-z]:[\\/]", path) or path.startswith(("\\\\", "//")):
|
||||
s.type = "local"
|
||||
s.local_path = path
|
||||
return
|
||||
|
||||
# 6) Fallback: if it looks like a URL with a scheme separator treat as remote/hydrus
|
||||
if "://" in path:
|
||||
s.type = "hydrus" if self._looks_like_hydrus_url(path) else "remote"
|
||||
return
|
||||
|
||||
# 7) Otherwise treat as a local path
|
||||
_set_local_from_path(path)
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# population helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _populate_local(self, best_effort: bool = False) -> None:
|
||||
local_path = self.state.local_path or self._resolve_local_path(self.state.path)
|
||||
if local_path:
|
||||
self.state.local_path = local_path
|
||||
self._load_sidecar_tags(local_path)
|
||||
if not self.state.hash:
|
||||
self._compute_local_hash(local_path)
|
||||
# If Hydrus is configured and we have a hash, enrich from Hydrus; otherwise keep local tags only
|
||||
if self.state.hash and self.hydrus_settings.base_url and self.hydrus_settings.access_key:
|
||||
self._populate_hydrus_by_hash()
|
||||
elif best_effort and self.hydrus_settings.base_url and self.state.source_url and self.hydrus_settings.access_key:
|
||||
self._populate_hydrus_by_url(self.state.source_url)
|
||||
|
||||
# (helpers for resolving local path and loading sidecars already exist below)
|
||||
|
||||
def _populate_remote(self) -> None:
|
||||
source_url = self.state.source_url or self.last_url or self.state.path
|
||||
source_url = self._normalise_url(source_url)
|
||||
if source_url:
|
||||
self.state.source_url = source_url
|
||||
remote_payload = {
|
||||
"source_url": self.state.source_url,
|
||||
"existing_tags": self.state.tags,
|
||||
"metadata": self.payload.get("remote_metadata"),
|
||||
"mpv_metadata": self.state.mpv_metadata,
|
||||
"options": self.remote_options,
|
||||
}
|
||||
try:
|
||||
remote_result = metadata.resolve_remote_metadata(remote_payload)
|
||||
except Exception as exc: # pragma: no cover - surfaced to the caller
|
||||
self.state.warnings.append(str(exc))
|
||||
remote_result = None
|
||||
if remote_result:
|
||||
tags = remote_result.get("tags") or []
|
||||
self._merge_tags(tags)
|
||||
self.state.remote_metadata = remote_result.get("metadata")
|
||||
self.state.title = remote_result.get("title") or self.state.title
|
||||
self.state.duration = remote_result.get("duration") or self.state.duration
|
||||
self.state.source_url = remote_result.get("source_url") or self.state.source_url
|
||||
warnings = remote_result.get("warnings") or []
|
||||
if warnings:
|
||||
self.state.warnings.extend(warnings)
|
||||
if self.hydrus_settings.base_url and self.state.source_url:
|
||||
self._populate_hydrus_by_url(self.state.source_url)
|
||||
|
||||
def _populate_hydrus_by_hash(self) -> None:
|
||||
hash_hex = self.state.hash or self._extract_hash_from_path(self.state.path)
|
||||
if hash_hex and not _looks_like_hash(hash_hex):
|
||||
hash_hex = None
|
||||
if not hash_hex:
|
||||
return
|
||||
self.state.hash = hash_hex
|
||||
if not self.hydrus_settings.base_url:
|
||||
return
|
||||
payload: Dict[str, Any] = {
|
||||
"api_url": self.hydrus_settings.base_url,
|
||||
"access_key": self.hydrus_settings.access_key or "",
|
||||
"options": self.hydrus_settings.as_metadata_options(),
|
||||
"hash": hash_hex,
|
||||
}
|
||||
try:
|
||||
result = metadata.fetch_hydrus_metadata(payload)
|
||||
except Exception as exc: # pragma: no cover - surfaced to caller
|
||||
self.state.warnings.append(str(exc))
|
||||
return
|
||||
self._apply_hydrus_result(result)
|
||||
# Enrich relationships using the dedicated Hydrus endpoint (robust GET)
|
||||
if self.include_relationships and self.state.hash and self.hydrus_settings.base_url:
|
||||
self._enrich_relationships_from_api(self.state.hash)
|
||||
|
||||
def _populate_hydrus_by_url(self, url: str) -> None:
|
||||
if not self.hydrus_settings.base_url:
|
||||
return
|
||||
payload: Dict[str, Any] = {
|
||||
"api_url": self.hydrus_settings.base_url,
|
||||
"access_key": self.hydrus_settings.access_key or "",
|
||||
"options": self.hydrus_settings.as_metadata_options(),
|
||||
"url": url,
|
||||
}
|
||||
try:
|
||||
result = metadata.fetch_hydrus_metadata_by_url(payload)
|
||||
except Exception as exc: # pragma: no cover - surfaced to caller
|
||||
self.state.warnings.append(str(exc))
|
||||
return
|
||||
if result.get("error") == "not_found":
|
||||
self.state.warnings.extend(result.get("warnings") or [])
|
||||
return
|
||||
self._apply_hydrus_result(result)
|
||||
self.state.type = "hydrus"
|
||||
matched_url = result.get("matched_url") or result.get("url")
|
||||
if matched_url and matched_url not in self.state.known_urls:
|
||||
self.state.known_urls.append(matched_url)
|
||||
# Enrich relationships once we know the hash
|
||||
if self.include_relationships and self.state.hash and self.hydrus_settings.base_url:
|
||||
self._enrich_relationships_from_api(self.state.hash)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# state modification helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
def _apply_hydrus_result(self, result: Dict[str, Any]) -> None:
|
||||
metadata_payload = result.get("metadata")
|
||||
if isinstance(metadata_payload, dict):
|
||||
# Process mime into type for Lua
|
||||
mime = metadata_payload.get("mime")
|
||||
if isinstance(mime, str):
|
||||
if mime.startswith("video/"):
|
||||
metadata_payload["type"] = "video"
|
||||
elif mime.startswith("audio/"):
|
||||
metadata_payload["type"] = "audio"
|
||||
elif mime.startswith("image/"):
|
||||
metadata_payload["type"] = "image"
|
||||
else:
|
||||
metadata_payload["type"] = "other"
|
||||
self.state.metadata = metadata_payload
|
||||
# Do NOT overwrite MPVfile.type with metadata.type
|
||||
self._merge_known_urls(metadata_payload.get("known_urls") or metadata_payload.get("known_urls_set"))
|
||||
source_url = metadata_payload.get("original_url") or metadata_payload.get("source_url")
|
||||
if source_url and not self.state.source_url:
|
||||
self.state.source_url = self._normalise_url(source_url)
|
||||
# If file_relationships are embedded in metadata, capture as relationships when missing
|
||||
if self.state.relationships is None:
|
||||
embedded = metadata_payload.get("file_relationships")
|
||||
if isinstance(embedded, dict) and embedded:
|
||||
self.state.relationships = embedded
|
||||
tags = result.get("tags") or []
|
||||
self._merge_tags(tags)
|
||||
hash_value = result.get("hash") or result.get("matched_hash")
|
||||
if isinstance(hash_value, str) and _looks_like_hash(hash_value):
|
||||
self.state.hash = hash_value.lower()
|
||||
self.state.tag_service_key = result.get("tag_service_key") or self.state.tag_service_key
|
||||
self.state.duration = result.get("duration") or self.state.duration
|
||||
self.state.filesize_mb = result.get("filesize_mb") or self.state.filesize_mb
|
||||
self.state.is_video = bool(result.get("is_video") or self.state.is_video)
|
||||
self.state.is_audio = bool(result.get("is_audio") or self.state.is_audio)
|
||||
if result.get("is_deleted") is not None:
|
||||
self.state.is_deleted = bool(result.get("is_deleted"))
|
||||
if result.get("is_local") is not None:
|
||||
self.state.is_local = bool(result.get("is_local"))
|
||||
if result.get("has_current_file_service") is not None:
|
||||
self.state.has_current_file_service = bool(result.get("has_current_file_service"))
|
||||
# Consolidate relationships from explicit result or embedded metadata
|
||||
relationships_obj: Optional[Dict[str, Any]] = None
|
||||
if isinstance(result.get("relationships"), dict):
|
||||
relationships_obj = result["relationships"]
|
||||
self.state.relationships = relationships_obj
|
||||
elif isinstance(self.state.relationships, dict):
|
||||
relationships_obj = self.state.relationships
|
||||
|
||||
# Helper to flatten any hashes from the relationships object
|
||||
def _collect_hashes(obj: Any, acc: set[str]) -> None:
|
||||
if obj is None:
|
||||
return
|
||||
if isinstance(obj, dict):
|
||||
for v in obj.values():
|
||||
_collect_hashes(v, acc)
|
||||
elif isinstance(obj, (list, tuple, set)):
|
||||
for v in obj:
|
||||
_collect_hashes(v, acc)
|
||||
elif isinstance(obj, str) and _looks_like_hash(obj):
|
||||
acc.add(obj.lower())
|
||||
|
||||
# Derive king and alts robustly from available data
|
||||
king: Optional[str] = None
|
||||
alts: list[str] = []
|
||||
|
||||
# 1) Try direct king fields on relationships object
|
||||
rels = relationships_obj or {}
|
||||
if isinstance(rels, dict):
|
||||
# Common variants
|
||||
for key in ("king", "king_hash", "duplicate_king", "best", "best_hash"):
|
||||
val = rels.get(key)
|
||||
if isinstance(val, str) and _looks_like_hash(val):
|
||||
king = val.lower()
|
||||
break
|
||||
if isinstance(val, list):
|
||||
for h in val:
|
||||
if isinstance(h, str) and _looks_like_hash(h):
|
||||
king = h.lower()
|
||||
break
|
||||
if king:
|
||||
break
|
||||
# 2) Extract alternates from known fields: numeric "3" (clips), or textual synonyms
|
||||
for alt_key in ("3", "alternates", "alts", "clips"):
|
||||
val = rels.get(alt_key)
|
||||
if isinstance(val, list):
|
||||
for h in val:
|
||||
if isinstance(h, str) and _looks_like_hash(h):
|
||||
h_low = h.lower()
|
||||
if not king or h_low != king:
|
||||
alts.append(h_low)
|
||||
# some APIs might nest
|
||||
elif isinstance(val, dict):
|
||||
tmp: set[str] = set()
|
||||
_collect_hashes(val, tmp)
|
||||
for h in sorted(tmp):
|
||||
if not king or h != king:
|
||||
alts.append(h)
|
||||
|
||||
# 3) Use relationship_metadata keys as additional alternates and king hint
|
||||
rel_meta = result.get("relationship_metadata")
|
||||
if isinstance(rel_meta, dict):
|
||||
# prefer king candidate with no clip_time if not set
|
||||
if not king:
|
||||
for h, meta in rel_meta.items():
|
||||
if isinstance(h, str) and _looks_like_hash(h) and isinstance(meta, dict):
|
||||
if not meta.get("clip_time"):
|
||||
king = h.lower()
|
||||
break
|
||||
for h in rel_meta.keys():
|
||||
if isinstance(h, str) and _looks_like_hash(h):
|
||||
h_low = h.lower()
|
||||
if not king or h_low != king:
|
||||
alts.append(h_low)
|
||||
|
||||
# 4) As a last resort, flatten all relationship hashes
|
||||
if not alts and relationships_obj:
|
||||
tmp: set[str] = set()
|
||||
_collect_hashes(relationships_obj, tmp)
|
||||
for h in sorted(tmp):
|
||||
if not king or h != king:
|
||||
alts.append(h)
|
||||
|
||||
# 5) Include current file when appropriate
|
||||
if self.state.hash and (not king or self.state.hash != king) and self.state.hash not in alts:
|
||||
alts.append(self.state.hash)
|
||||
|
||||
# 6) Sort alternates by clip start time when available
|
||||
rel_meta_all = result.get("relationship_metadata") if isinstance(result.get("relationship_metadata"), dict) else {}
|
||||
def _clip_start_for(h: str) -> float:
|
||||
meta = rel_meta_all.get(h) if isinstance(rel_meta_all, dict) else None
|
||||
clip = meta.get("clip_time") if isinstance(meta, dict) else None
|
||||
if isinstance(clip, str):
|
||||
m = re.match(r"^(\d+)-(\d+)$", clip)
|
||||
if m:
|
||||
try:
|
||||
return float(m.group(1))
|
||||
except Exception:
|
||||
return float("inf")
|
||||
return float("inf")
|
||||
|
||||
if alts:
|
||||
# de-duplicate while preserving earliest clip time ordering
|
||||
seen: set[str] = set()
|
||||
alts = [h for h in sorted(alts, key=_clip_start_for) if (h not in seen and not seen.add(h))]
|
||||
|
||||
self.state.king = king
|
||||
self.state.alts = alts
|
||||
if isinstance(result.get("relationship_metadata"), dict):
|
||||
self.state.relationship_metadata = result["relationship_metadata"]
|
||||
self.state.title = result.get("title") or self.state.title
|
||||
self.state.clip_time = result.get("clip_time") or self.state.clip_time
|
||||
if result.get("swap_recommended"):
|
||||
self.state.swap_recommended = True
|
||||
warnings = result.get("warnings") or []
|
||||
if warnings:
|
||||
self.state.warnings.extend(warnings)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# relationships enrichment (Hydrus endpoint + alt metadata)
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _enrich_relationships_from_api(self, file_hash: str) -> None:
|
||||
"""Fetch relationships for the given hash and enrich state's king/alts and alt metadata.
|
||||
|
||||
- Uses GET /manage_file_relationships/get_file_relationships?hash=...
|
||||
- If alts exist, batch-fetch their metadata via GET /get_files/file_metadata?hashes=[...]
|
||||
- Extracts title, duration, size, tags (cleaned: title: kept with namespace, others stripped)
|
||||
"""
|
||||
base_url = self.hydrus_settings.base_url or ""
|
||||
access_key = self.hydrus_settings.access_key or ""
|
||||
if not base_url:
|
||||
return
|
||||
try:
|
||||
client = HydrusClient(base_url, access_key, timeout=self.hydrus_settings.timeout)
|
||||
except Exception as exc: # pragma: no cover - construction should rarely fail
|
||||
self.state.warnings.append(f"Hydrus client init failed: {exc}")
|
||||
return
|
||||
try:
|
||||
rel_resp = client.get_file_relationships(file_hash)
|
||||
except HydrusRequestError as hre: # pragma: no cover - surfaced but non-fatal
|
||||
self.state.warnings.append(f"relationships api: {hre}")
|
||||
return
|
||||
except Exception as exc: # pragma: no cover
|
||||
self.state.warnings.append(f"relationships api: {exc}")
|
||||
return
|
||||
|
||||
rel_map = rel_resp.get("file_relationships") or {}
|
||||
rel_obj = None
|
||||
if isinstance(rel_map, dict):
|
||||
rel_obj = rel_map.get(file_hash) or next((v for v in rel_map.values() if isinstance(v, dict)), None)
|
||||
if isinstance(rel_obj, dict):
|
||||
# Preserve the full relationships object
|
||||
self.state.relationships = rel_obj
|
||||
# Update king and alts from canonical fields
|
||||
king = rel_obj.get("king")
|
||||
alts = rel_obj.get("3") or []
|
||||
if isinstance(king, str) and _looks_like_hash(king):
|
||||
self.state.king = king.lower()
|
||||
if isinstance(alts, list):
|
||||
self.state.alts = [h.lower() for h in alts if isinstance(h, str) and _looks_like_hash(h)]
|
||||
|
||||
# Fetch alt metadata if we have alts
|
||||
if not self.state.alts:
|
||||
return
|
||||
try:
|
||||
meta_resp = client.fetch_file_metadata(
|
||||
hashes=self.state.alts,
|
||||
include_service_keys_to_tags=True,
|
||||
include_duration=True,
|
||||
include_size=True,
|
||||
include_file_urls=False,
|
||||
include_mime=False,
|
||||
)
|
||||
except HydrusRequestError as hre: # pragma: no cover
|
||||
self.state.warnings.append(f"metadata api: {hre}")
|
||||
return
|
||||
except Exception as exc: # pragma: no cover
|
||||
self.state.warnings.append(f"metadata api: {exc}")
|
||||
return
|
||||
|
||||
if not isinstance(meta_resp, dict):
|
||||
return
|
||||
entries = meta_resp.get("metadata") or []
|
||||
if not isinstance(entries, list):
|
||||
return
|
||||
|
||||
def _extract_tags(meta: Dict[str, Any]) -> list[str]:
|
||||
tags: list[str] = []
|
||||
tag_root = meta.get("tags") or meta.get("service_keys_to_statuses_to_tags") or {}
|
||||
if isinstance(tag_root, dict):
|
||||
for service_dict in tag_root.values():
|
||||
if not isinstance(service_dict, dict):
|
||||
continue
|
||||
# Prefer storage_tags but fall back to any list values under known keys
|
||||
storage = service_dict.get("storage_tags")
|
||||
if isinstance(storage, dict):
|
||||
for vals in storage.values():
|
||||
if isinstance(vals, list):
|
||||
tags.extend([str(t) for t in vals if isinstance(t, str)])
|
||||
else:
|
||||
# fall back: inspect lists directly under service_dict
|
||||
for vals in service_dict.values():
|
||||
if isinstance(vals, list):
|
||||
tags.extend([str(t) for t in vals if isinstance(t, str)])
|
||||
return tags
|
||||
|
||||
def _clean_tags_and_title(all_tags: list[str]) -> tuple[Optional[str], list[str]]:
|
||||
title_val: Optional[str] = None
|
||||
cleaned: list[str] = []
|
||||
for tag in all_tags:
|
||||
if not isinstance(tag, str):
|
||||
continue
|
||||
if tag.startswith("title:"):
|
||||
if title_val is None:
|
||||
title_val = tag.split(":", 1)[1]
|
||||
cleaned.append(tag) # keep namespaced title
|
||||
else:
|
||||
if ":" in tag:
|
||||
cleaned.append(tag.split(":", 1)[1])
|
||||
else:
|
||||
cleaned.append(tag)
|
||||
return title_val, cleaned
|
||||
|
||||
for meta in entries:
|
||||
if not isinstance(meta, dict):
|
||||
continue
|
||||
h = meta.get("hash")
|
||||
if not (isinstance(h, str) and _looks_like_hash(h)):
|
||||
continue
|
||||
tags_all = _extract_tags(meta)
|
||||
title_val, tags_clean = _clean_tags_and_title(tags_all)
|
||||
alt_info = {
|
||||
"title": title_val,
|
||||
"duration": meta.get("duration"),
|
||||
"size": meta.get("size"),
|
||||
"tags": tags_clean,
|
||||
}
|
||||
self.state.relationship_metadata[h.lower()] = alt_info
|
||||
|
||||
def _merge_tags(self, tags: Sequence[Any]) -> None:
|
||||
incoming = _normalise_string_list(tags)
|
||||
if not incoming:
|
||||
return
|
||||
combined = list(self.state.tags or []) + incoming
|
||||
self.state.tags = unique_preserve_order(combined)
|
||||
for tag in incoming:
|
||||
if tag not in self.state.original_tags:
|
||||
self.state.original_tags[tag] = tag
|
||||
|
||||
def _merge_known_urls(self, urls: Optional[Iterable[Any]]) -> None:
|
||||
if not urls:
|
||||
return
|
||||
combined = list(self.state.known_urls or []) + _normalise_string_list(urls)
|
||||
self.state.known_urls = unique_preserve_order(combined)
|
||||
|
||||
def _load_sidecar_tags(self, local_path: str) -> None:
|
||||
try:
|
||||
media_path = Path(local_path)
|
||||
except Exception:
|
||||
return
|
||||
if not media_path.exists():
|
||||
return
|
||||
candidates = [media_path.with_suffix(".tags"), media_path.with_suffix(".tags.txt")]
|
||||
for candidate in candidates:
|
||||
if candidate.exists():
|
||||
hash_value, tags, known = self._read_sidecar(candidate)
|
||||
if hash_value and not self.state.hash and _looks_like_hash(hash_value):
|
||||
self.state.hash = hash_value.lower()
|
||||
self._merge_tags(tags)
|
||||
self._merge_known_urls(known)
|
||||
break
|
||||
|
||||
def _read_sidecar(self, sidecar_path: Path) -> tuple[Optional[str], List[str], List[str]]:
|
||||
try:
|
||||
raw = sidecar_path.read_text(encoding="utf-8", errors="ignore")
|
||||
except OSError:
|
||||
return None, [], []
|
||||
hash_value: Optional[str] = None
|
||||
tags: List[str] = []
|
||||
known_urls: List[str] = []
|
||||
for line in raw.splitlines():
|
||||
trimmed = line.strip()
|
||||
if not trimmed:
|
||||
continue
|
||||
lowered = trimmed.lower()
|
||||
if lowered.startswith("hash:"):
|
||||
candidate = trimmed.split(":", 1)[1].strip() if ":" in trimmed else ""
|
||||
if candidate:
|
||||
hash_value = candidate
|
||||
elif lowered.startswith("known_url:") or lowered.startswith("url:"):
|
||||
candidate = trimmed.split(":", 1)[1].strip() if ":" in trimmed else ""
|
||||
if candidate:
|
||||
known_urls.append(candidate)
|
||||
else:
|
||||
tags.append(trimmed)
|
||||
return hash_value, tags, known_urls
|
||||
|
||||
def _compute_local_hash(self, local_path: str) -> None:
|
||||
try:
|
||||
digest = sha256_file(Path(local_path))
|
||||
except OSError as exc:
|
||||
self.state.warnings.append(f"sha256 failed: {exc}")
|
||||
return
|
||||
self.state.hash = digest.lower()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# finalisation helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _finalise(self) -> None:
|
||||
if self.state.tags:
|
||||
self.state.tags = unique_preserve_order(self.state.tags)
|
||||
if self.state.known_urls:
|
||||
self.state.known_urls = unique_preserve_order(self.state.known_urls)
|
||||
# Ensure metadata.type is always present for Lua, but do NOT overwrite MPVfile.type
|
||||
if not self.state.title:
|
||||
if self.state.metadata.get("title"):
|
||||
self.state.title = str(self.state.metadata["title"]).strip()
|
||||
elif self.state.filename:
|
||||
self.state.title = self.state.filename
|
||||
if self.state.hash and not _looks_like_hash(self.state.hash):
|
||||
self.state.hash = None
|
||||
if self.state.relationship_metadata is None:
|
||||
self.state.relationship_metadata = {}
|
||||
if self.state.relationships is not None and not isinstance(self.state.relationships, dict):
|
||||
self.state.relationships = None
|
||||
if self.state.original_tags is None:
|
||||
self.state.original_tags = {}
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# util helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@staticmethod
|
||||
def _normalise_url(value: Any) -> Optional[str]:
|
||||
if value is None:
|
||||
return None
|
||||
text = str(value).strip()
|
||||
if not text:
|
||||
return None
|
||||
return text
|
||||
|
||||
@staticmethod
|
||||
def _resolve_local_path(path: Optional[str]) -> Optional[str]:
|
||||
if not path:
|
||||
return None
|
||||
parsed = urlparse(path)
|
||||
if parsed.scheme == "file":
|
||||
decoded = MPVFileBuilder._decode_file_url(path)
|
||||
return decoded
|
||||
return path
|
||||
|
||||
@staticmethod
|
||||
def _decode_file_url(value: str) -> Optional[str]:
|
||||
parsed = urlparse(value)
|
||||
if parsed.scheme != "file":
|
||||
return None
|
||||
netloc = parsed.netloc or ""
|
||||
path = unquote(parsed.path or "")
|
||||
if netloc:
|
||||
path = f"//{netloc}{path}"
|
||||
if os.name == "nt" and path.startswith("/") and re.match(r"/[A-Za-z]:", path):
|
||||
path = path[1:]
|
||||
path = path.replace("/", os.sep)
|
||||
return path
|
||||
|
||||
def _looks_like_hydrus_url(self, url: str) -> bool:
|
||||
if not url:
|
||||
return False
|
||||
if url.startswith("hydrus://"):
|
||||
return True
|
||||
if "Hydrus-Client-API-Access-Key=" in url:
|
||||
return True
|
||||
base = self.hydrus_settings.base_url
|
||||
if base and url.startswith(base) and "/get_files/" in url:
|
||||
return True
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def _extract_hash_from_path(path: Optional[str]) -> Optional[str]:
|
||||
if not path:
|
||||
return None
|
||||
parsed = urlparse(path)
|
||||
query = parse_qs(parsed.query)
|
||||
if "hash" in query and query["hash"]:
|
||||
candidate = query["hash"][0].strip()
|
||||
if candidate:
|
||||
return candidate.lower()
|
||||
match = re.search(r"hash=([0-9a-fA-F]{64})", path)
|
||||
if match:
|
||||
return match.group(1).lower()
|
||||
return None
|
||||
|
||||
|
||||
def build_mpv_file_state(payload: Dict[str, Any], config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
||||
builder = MPVFileBuilder(payload or {}, config or {})
|
||||
return builder.build()
|
||||
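A minimal sketch of driving the builder through build_mpv_file_state. The payload keys follow _initialise_identity and the config keys follow _resolve_hydrus_settings, but the concrete values below are placeholders; without a configured Hydrus URL the Hydrus enrichment steps are simply skipped.

from helper.mpv_file import build_mpv_file_state

payload = {
    "path": "C:/media/clip.mp4",
    "mpv_metadata": {"media-title": "clip"},
    "tags": ["creator:someone"],
    "include_relationships": True,
}
config = {
    "HydrusNetwork_Access_Key": "0123abcd",  # placeholder, not a real key
    "HydrusNetwork_Request_Timeout": 60,
}

state = build_mpv_file_state(payload, config)
# Empty optional fields are dropped by MPVfile.to_dict(), so use .get() for anything optional.
print(state["type"], state.get("hash"), state.get("tags"))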
143
helper/progress.py
Normal file
143
helper/progress.py
Normal file
@@ -0,0 +1,143 @@
#!/usr/bin/env python3
"""Text-based progress bar utilities for consistent display across all downloads."""

import sys

from helper.logger import log, debug


def format_progress_bar(current: int, total: int, width: int = 40, label: str = "") -> str:
    """Create a text-based progress bar.

    Args:
        current: Current progress (bytes/items)
        total: Total to complete (bytes/items)
        width: Width of the bar in characters (default 40)
        label: Optional label prefix

    Returns:
        Formatted progress bar string

    Examples:
        format_progress_bar(50, 100)
        # Returns: "[████████████████░░░░░░░░░░░░░░░░░░░░] 50.0%"

        format_progress_bar(256*1024*1024, 1024*1024*1024, label="download.zip")
        # Returns: "download.zip: [████████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 25.0%"
    """
    if total <= 0:
        percentage = 0
        filled = 0
    else:
        percentage = (current / total) * 100
        filled = int((current / total) * width)

    # Create bar: filled blocks + empty blocks
    bar = "█" * filled + "░" * (width - filled)

    # Format percentage
    pct_str = f"{percentage:.1f}%"

    # Build result
    if label:
        result = f"{label}: [{bar}] {pct_str}"
    else:
        result = f"[{bar}] {pct_str}"

    return result


def format_size(bytes_val: float) -> str:
    """Format bytes to human-readable size.

    Examples:
        format_size(1024) -> "1.00 KB"
        format_size(1024*1024) -> "1.00 MB"
        format_size(1024*1024*1024) -> "1.00 GB"
    """
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if bytes_val < 1024:
            return f"{bytes_val:.2f} {unit}"
        bytes_val /= 1024
    return f"{bytes_val:.2f} PB"


def format_download_status(filename: str, current: int, total: int, speed: float = 0) -> str:
    """Format download status with progress bar and details.

    Args:
        filename: Name of file being downloaded
        current: Current bytes downloaded
        total: Total file size
        speed: Download speed in bytes/sec

    Returns:
        Formatted status line

    Examples:
        format_download_status("movie.mkv", 512*1024*1024, 2*1024*1024*1024, 10*1024*1024)
        # Returns: "movie.mkv: [████████████░░░░░░░░░░░░░░░░░░░░░░░░░░] 25.0% (512.00 MB / 2.00 GB @ 10.00 MB/s)"
    """
    # Pass the filename through as the bar label so the output matches the documented example.
    bar = format_progress_bar(current, total, width=30, label=filename)
    size_current = format_size(current)
    size_total = format_size(total)

    if speed > 0:
        speed_str = f" @ {format_size(speed)}/s"
    else:
        speed_str = ""

    return f"{bar} ({size_current} / {size_total}{speed_str})"


def print_progress(filename: str, current: int, total: int, speed: float = 0, end: str = "\r") -> None:
    """Print download progress to stderr (doesn't interfere with piped output).

    Args:
        filename: File being downloaded
        current: Current bytes
        total: Total bytes
        speed: Speed in bytes/sec
        end: Line ending (default "\r" for overwriting, use "\n" for final)
    """
    status = format_download_status(filename, current, total, speed)
    debug(status, end=end, flush=True)


def print_final_progress(filename: str, total: int, elapsed: float) -> None:
    """Print final progress line (100%) with time elapsed.

    Args:
        filename: File that was downloaded
        total: Total size
        elapsed: Time elapsed in seconds
    """
    bar = format_progress_bar(total, total, width=30)
    size_str = format_size(total)

    # Format elapsed time
    if elapsed < 60:
        time_str = f"{elapsed:.1f}s"
    elif elapsed < 3600:
        minutes = elapsed / 60
        time_str = f"{minutes:.1f}m"
    else:
        hours = elapsed / 3600
        time_str = f"{hours:.2f}h"

    debug(f"{bar} ({size_str}) - {time_str}")


if __name__ == "__main__":
    # Demo
    import time

    log("Progress Bar Demo:", file=sys.stderr)

    # Demo 1: Simple progress
    for i in range(101):
        print_progress("demo.bin", i * 10 * 1024 * 1024, 1024 * 1024 * 1024)
        time.sleep(0.02)

    print_final_progress("demo.bin", 1024 * 1024 * 1024, 2.0)
    log()
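One way to wire these helpers into a copy loop, computing the speed from elapsed wall time; note that print_progress routes through debug(), so nothing is shown unless debug logging is enabled. This is only a sketch: the chunk size and paths are arbitrary.

import time
from pathlib import Path

from helper.logger import set_debug
from helper.progress import print_progress, print_final_progress

def copy_with_progress(src: Path, dst: Path) -> None:
    set_debug(True)  # progress lines go through debug(), which is off by default
    total = src.stat().st_size
    start = time.monotonic()
    done = 0
    with open(src, "rb") as fin, open(dst, "wb") as fout:
        while chunk := fin.read(1024 * 1024):
            fout.write(chunk)
            done += len(chunk)
            elapsed = max(time.monotonic() - start, 1e-6)
            print_progress(dst.name, done, total, speed=done / elapsed)
    print_final_progress(dst.name, total, time.monotonic() - start)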
159
helper/query_parser.py
Normal file
159
helper/query_parser.py
Normal file
@@ -0,0 +1,159 @@
"""Dynamic query parser for filtering and field extraction.

Supports query syntax like:
- isbn:0557677203
- author:"Albert Pike"
- title:"Morals and Dogma"
- year:2010
- isbn:0557677203 author:"Albert Pike"
- Mixed with free text: "Morals" isbn:0557677203

This allows flexible query strings that can be parsed by any search provider
to extract specific fields for filtering and searching.
"""

from typing import Dict, List, Tuple, Optional, Any
import re


def parse_query(query: str) -> Dict[str, Any]:
    """Parse a query string into field:value pairs and free text.

    Args:
        query: Query string like 'isbn:0557677203 author:"Albert Pike" Morals'

    Returns:
        Dictionary with:
        - 'fields': Dict[field_name, field_value] for structured fields
        - 'text': str with remaining free text
        - 'raw': str original query
    """
    result = {
        'fields': {},
        'text': '',
        'raw': query,
    }

    if not query or not query.strip():
        return result

    query = query.strip()
    remaining_parts = []

    # Pattern to match: field:value or field:"quoted value"
    # Matches: word: followed by either quoted string or unquoted word
    pattern = r'(\w+):(?:"([^"]*)"|(\S+))'

    pos = 0
    for match in re.finditer(pattern, query):
        # Add any text before this match
        if match.start() > pos:
            before_text = query[pos:match.start()].strip()
            if before_text:
                remaining_parts.append(before_text)

        field_name = match.group(1).lower()
        field_value = match.group(2) if match.group(2) is not None else match.group(3)

        result['fields'][field_name] = field_value
        pos = match.end()

    # Add any remaining text after last match
    if pos < len(query):
        remaining_text = query[pos:].strip()
        if remaining_text:
            remaining_parts.append(remaining_text)

    result['text'] = ' '.join(remaining_parts)

    return result


def get_field(parsed_query: Dict[str, Any], field_name: str, default: Optional[str] = None) -> Optional[str]:
    """Get a field value from parsed query, with optional default.

    Args:
        parsed_query: Result from parse_query()
        field_name: Field name to look up (case-insensitive)
        default: Default value if field not found

    Returns:
        Field value or default
    """
    return parsed_query.get('fields', {}).get(field_name.lower(), default)


def has_field(parsed_query: Dict[str, Any], field_name: str) -> bool:
    """Check if a field exists in parsed query.

    Args:
        parsed_query: Result from parse_query()
        field_name: Field name to check (case-insensitive)

    Returns:
        True if field exists
    """
    return field_name.lower() in parsed_query.get('fields', {})


def get_free_text(parsed_query: Dict[str, Any]) -> str:
    """Get the free text portion of a parsed query.

    Args:
        parsed_query: Result from parse_query()

    Returns:
        Free text or empty string
    """
    return parsed_query.get('text', '')


def build_query_for_provider(
    parsed_query: Dict[str, Any],
    provider: str,
    extraction_map: Optional[Dict[str, str]] = None
) -> Tuple[str, Dict[str, str]]:
    """Build a search query and filters dict for a specific provider.

    Different providers have different search syntax. This function
    extracts the appropriate fields for each provider.

    Args:
        parsed_query: Result from parse_query()
        provider: Provider name ('libgen', 'openlibrary', 'soulseek')
        extraction_map: Optional mapping of field names to provider-specific names
            e.g. {'isbn': 'isbn', 'author': 'author', 'title': 'title'}

    Returns:
        Tuple of (search_query: str, extracted_fields: Dict[field, value])
    """
    extraction_map = extraction_map or {}
    extracted = {}
    free_text = get_free_text(parsed_query)

    # Extract fields based on map
    for field_name, provider_key in extraction_map.items():
        if has_field(parsed_query, field_name):
            extracted[provider_key] = get_field(parsed_query, field_name)

    # If provider-specific extraction is needed, providers can implement it.
    # For now, return the free text as the query.
    return free_text, extracted


if __name__ == '__main__':
    # Test cases
    test_queries = [
        'isbn:0557677203',
        'isbn:0557677203 author:"Albert Pike"',
        'Morals and Dogma isbn:0557677203',
        'title:"Morals and Dogma" author:"Albert Pike" year:2010',
        'search term without fields',
        'author:"John Smith" title:"A Book"',
    ]

    for query in test_queries:
        print(f"\nQuery: {query}")
        parsed = parse_query(query)
        print(f"  Fields: {parsed['fields']}")
        print(f"  Text: {parsed['text']}")
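The __main__ block above only exercises parse_query; here is a sketch of how a provider might use build_query_for_provider with its own extraction map (the mapping shown is illustrative, not a fixed API contract).

from helper.query_parser import parse_query, build_query_for_provider

parsed = parse_query('title:"Morals and Dogma" author:"Albert Pike" first edition')
query, fields = build_query_for_provider(
    parsed,
    provider="libgen",
    extraction_map={"title": "title", "author": "author", "isbn": "isbn"},
)
# query  == 'first edition'
# fields == {'title': 'Morals and Dogma', 'author': 'Albert Pike'}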
1777
helper/search_provider.py
Normal file
1777
helper/search_provider.py
Normal file
File diff suppressed because it is too large
155
helper/tasks.py
Normal file
155
helper/tasks.py
Normal file
@@ -0,0 +1,155 @@
|
||||
"""Background task handling and IPC helpers for mpv integration."""
|
||||
from __future__ import annotations
|
||||
import errno
|
||||
import json
|
||||
import os
|
||||
import socket
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
from helper.logger import log
|
||||
import threading
|
||||
import time
|
||||
from typing import IO, Iterable
|
||||
def connect_ipc(path: str, timeout: float = 5.0) -> IO[bytes] | None:
|
||||
"""Connect to the mpv IPC server located at *path*."""
|
||||
deadline = time.time() + timeout
|
||||
if not path:
|
||||
return None
|
||||
if os.name == 'nt':
|
||||
# mpv exposes a named pipe on Windows. Keep retrying until it is ready.
|
||||
while True:
|
||||
try:
|
||||
return open(path, 'r+b', buffering=0)
|
||||
except FileNotFoundError:
|
||||
if time.time() > deadline:
|
||||
return None
|
||||
time.sleep(0.05)
|
||||
except OSError as exc: # Pipe busy
|
||||
if exc.errno not in (errno.ENOENT, errno.EPIPE, errno.EBUSY):
|
||||
raise
|
||||
if time.time() > deadline:
|
||||
return None
|
||||
time.sleep(0.05)
|
||||
else:
|
||||
sock = socket.socket(socket.AF_UNIX)
|
||||
while True:
|
||||
try:
|
||||
sock.connect(path)
|
||||
return sock.makefile('r+b', buffering=0)
|
||||
except FileNotFoundError:
|
||||
if time.time() > deadline:
|
||||
return None
|
||||
time.sleep(0.05)
|
||||
except OSError as exc:
|
||||
if exc.errno not in (errno.ENOENT, errno.ECONNREFUSED):
|
||||
raise
|
||||
if time.time() > deadline:
|
||||
return None
|
||||
time.sleep(0.05)
|
||||
def ipc_sender(ipc: IO[bytes] | None):
|
||||
"""Create a helper function for sending script messages via IPC."""
|
||||
if ipc is None:
|
||||
def _noop(_event: str, _payload: dict) -> None:
|
||||
return None
|
||||
return _noop
|
||||
lock = threading.Lock()
|
||||
def _send(event: str, payload: dict) -> None:
|
||||
message = json.dumps({'command': ['script-message', event, json.dumps(payload)]}, ensure_ascii=False)
|
||||
encoded = message.encode('utf-8') + b'\n'
|
||||
with lock:
|
||||
try:
|
||||
ipc.write(encoded)
|
||||
ipc.flush()
|
||||
except OSError:
|
||||
pass
|
||||
return _send
|
||||
def iter_stream(stream: Iterable[str]) -> Iterable[str]:
|
||||
for raw in stream:
|
||||
yield raw.rstrip('\r\n')
|
||||
def _run_task(args, parser) -> int:
|
||||
if not args.command:
|
||||
parser.error('run-task requires a command to execute (use "--" before the command).')
|
||||
env = os.environ.copy()
|
||||
for entry in args.env:
|
||||
key, sep, value = entry.partition('=')
|
||||
if not sep:
|
||||
parser.error(f'Invalid environment variable definition: {entry!r}')
|
||||
env[key] = value
|
||||
command = list(args.command)
|
||||
if command and command[0] == '--':
|
||||
command.pop(0)
|
||||
notifier = ipc_sender(connect_ipc(args.ipc, timeout=args.ipc_timeout))
|
||||
if not command:
|
||||
notifier('downlow-task-event', {
|
||||
'id': args.task_id,
|
||||
'event': 'error',
|
||||
'message': 'No command provided after separator',
|
||||
})
|
||||
log('[downlow.py] No command provided for run-task', file=sys.stderr)
|
||||
return 1
|
||||
if command and isinstance(command[0], str) and sys.executable:
|
||||
first = command[0].lower()
|
||||
if first in {'python', 'python3', 'py', 'python.exe', 'python3.exe', 'py.exe'}:
|
||||
command[0] = sys.executable
|
||||
if os.environ.get('DOWNLOW_DEBUG'):
|
||||
log(f"Launching command: {command}", file=sys.stderr)
|
||||
notifier('downlow-task-event', {
|
||||
'id': args.task_id,
|
||||
'event': 'start',
|
||||
'command': command,
|
||||
'cwd': args.cwd or os.getcwd(),
|
||||
})
|
||||
try:
|
||||
process = subprocess.Popen(
|
||||
command,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
cwd=args.cwd or None,
|
||||
env=env,
|
||||
text=True,
|
||||
bufsize=1,
|
||||
universal_newlines=True,
|
||||
)
|
||||
except FileNotFoundError as exc:
|
||||
notifier('downlow-task-event', {
|
||||
'id': args.task_id,
|
||||
'event': 'error',
|
||||
'message': f'Executable not found: {exc.filename}',
|
||||
})
|
||||
log(f"{exc}", file=sys.stderr)
|
||||
return 1
|
||||
stdout_lines: list[str] = []
|
||||
stderr_lines: list[str] = []
|
||||
def pump(stream: IO[str], label: str, sink: list[str]) -> None:
|
||||
for line in iter_stream(stream):
|
||||
sink.append(line)
|
||||
notifier('downlow-task-event', {
|
||||
'id': args.task_id,
|
||||
'event': label,
|
||||
'line': line,
|
||||
})
|
||||
threads = []
|
||||
if process.stdout:
|
||||
t_out = threading.Thread(target=pump, args=(process.stdout, 'stdout', stdout_lines), daemon=True)
|
||||
t_out.start()
|
||||
threads.append(t_out)
|
||||
if process.stderr:
|
||||
t_err = threading.Thread(target=pump, args=(process.stderr, 'stderr', stderr_lines), daemon=True)
|
||||
t_err.start()
|
||||
threads.append(t_err)
|
||||
return_code = process.wait()
|
||||
for t in threads:
|
||||
t.join(timeout=0.1)
|
||||
notifier('downlow-task-event', {
|
||||
'id': args.task_id,
|
||||
'event': 'exit',
|
||||
'returncode': return_code,
|
||||
'success': return_code == 0,
|
||||
})
|
||||
# Also mirror aggregated output to stdout/stderr for compatibility when IPC is unavailable.
|
||||
if stdout_lines:
|
||||
log('\n'.join(stdout_lines))
|
||||
if stderr_lines:
|
||||
log('\n'.join(stderr_lines), file=sys.stderr)
|
||||
return return_code
|
||||
706
helper/unified_book_downloader.py
Normal file
@@ -0,0 +1,706 @@
|
||||
"""Unified book downloader - handles Archive.org borrowing and Libgen fallback.
|
||||
|
||||
This module provides a single interface for downloading books from multiple sources:
|
||||
1. Try Archive.org direct download (if available)
|
||||
2. Try Archive.org borrowing (if user has credentials)
|
||||
3. Fallback to Libgen search by ISBN
|
||||
4. Attempt Libgen download
|
||||
|
||||
All sources integrated with proper metadata scraping and error handling.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import asyncio
|
||||
import requests
|
||||
from typing import Optional, Dict, Any, Tuple, List, Callable, cast
|
||||
from pathlib import Path
|
||||
|
||||
from helper.logger import debug
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class UnifiedBookDownloader:
|
||||
"""Unified interface for downloading books from multiple sources."""
|
||||
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None, output_dir: Optional[str] = None):
|
||||
"""Initialize the unified book downloader.
|
||||
|
||||
Args:
|
||||
config: Configuration dict with credentials
|
||||
output_dir: Default output directory
|
||||
"""
|
||||
self.config = config or {}
|
||||
self.output_dir = output_dir
|
||||
self.session = requests.Session()
|
||||
|
||||
# Import download functions from their modules
|
||||
self._init_downloaders()
|
||||
|
||||
def _init_downloaders(self) -> None:
|
||||
"""Initialize downloader functions from their modules."""
|
||||
try:
|
||||
from helper.archive_client import (
|
||||
check_direct_download,
|
||||
get_openlibrary_by_isbn,
|
||||
loan
|
||||
)
|
||||
self.check_direct_download = check_direct_download
|
||||
self.get_openlibrary_by_isbn = get_openlibrary_by_isbn
|
||||
self.loan_func = loan
|
||||
logger.debug("[UnifiedBookDownloader] Loaded archive.org downloaders from archive_client")
|
||||
except Exception as e:
|
||||
logger.warning(f"[UnifiedBookDownloader] Failed to load archive.org functions: {e}")
|
||||
self.check_direct_download = None
|
||||
self.get_openlibrary_by_isbn = None
|
||||
self.loan_func = None
|
||||
|
||||
try:
|
||||
from helper.libgen_service import (
|
||||
DEFAULT_LIMIT as _LIBGEN_DEFAULT_LIMIT,
|
||||
download_from_mirror as _libgen_download,
|
||||
search_libgen as _libgen_search,
|
||||
)
|
||||
|
||||
def _log_info(message: str) -> None:
|
||||
debug(f"[UnifiedBookDownloader] {message}")
|
||||
|
||||
def _log_error(message: str) -> None:
|
||||
logger.error(f"[UnifiedBookDownloader] {message}")
|
||||
|
||||
self.search_libgen = lambda query, limit=_LIBGEN_DEFAULT_LIMIT: _libgen_search(
|
||||
query,
|
||||
limit=limit,
|
||||
log_info=_log_info,
|
||||
log_error=_log_error,
|
||||
)
|
||||
self.download_from_mirror = lambda mirror_url, output_path: _libgen_download(
|
||||
mirror_url,
|
||||
output_path,
|
||||
log_info=_log_info,
|
||||
log_error=_log_error,
|
||||
)
|
||||
logger.debug("[UnifiedBookDownloader] Loaded Libgen helpers")
|
||||
except Exception as e:
|
||||
logger.warning(f"[UnifiedBookDownloader] Failed to load Libgen helpers: {e}")
|
||||
self.search_libgen = None
|
||||
self.download_from_mirror = None
|
||||
|
||||
def get_download_options(self, book_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Get all available download options for a book.
|
||||
|
||||
Checks in priority order:
|
||||
1. Archive.org direct download (public domain)
|
||||
2. Archive.org borrowing (if credentials available and book is borrowable)
|
||||
3. Libgen fallback (by ISBN)
|
||||
|
||||
Args:
|
||||
book_data: Book metadata dict with at least 'openlibrary_id' or 'isbn'
|
||||
|
||||
Returns:
|
||||
Dict with available download methods and metadata
|
||||
"""
|
||||
options = {
|
||||
'book_title': book_data.get('title', 'Unknown'),
|
||||
'book_author': book_data.get('author', 'Unknown'),
|
||||
'isbn': book_data.get('isbn', ''),
|
||||
'openlibrary_id': book_data.get('openlibrary_id', ''),
|
||||
'methods': [], # Will be sorted by priority
|
||||
'metadata': {}
|
||||
}
|
||||
|
||||
# Extract book ID from openlibrary_id (e.g., OL8513721M -> 8513721, OL8513721W -> 8513721)
|
||||
ol_id = book_data.get('openlibrary_id', '')
|
||||
book_id = None
|
||||
|
||||
if ol_id.startswith('OL') and len(ol_id) > 2:
|
||||
# Remove 'OL' prefix (keep everything after it including the suffix letter)
|
||||
# The book_id is all digits after 'OL'
|
||||
book_id = ''.join(c for c in ol_id[2:] if c.isdigit())
|
||||
|
||||
# PRIORITY 1: Check direct download (fastest, no auth needed)
|
||||
if self.check_direct_download:
|
||||
try:
|
||||
can_download, pdf_url = self.check_direct_download(book_id)
|
||||
if can_download:
|
||||
options['methods'].append({
|
||||
'type': 'archive.org_direct',
|
||||
'label': 'Archive.org Direct Download',
|
||||
'requires_auth': False,
|
||||
'pdf_url': pdf_url,
|
||||
'book_id': book_id,
|
||||
'priority': 1 # Highest priority
|
||||
})
|
||||
logger.info(f"[UnifiedBookDownloader] Direct download available for {book_id}")
|
||||
except Exception as e:
|
||||
logger.debug(f"[UnifiedBookDownloader] Direct download check failed: {e}")
|
||||
|
||||
# PRIORITY 2: Check borrowing option (requires auth, 14-day loan)
|
||||
# First verify the book is actually lendable via OpenLibrary API
|
||||
if self._has_archive_credentials():
|
||||
is_lendable, status = self._check_book_lendable_status(ol_id)
|
||||
|
||||
if is_lendable:
|
||||
options['methods'].append({
|
||||
'type': 'archive.org_borrow',
|
||||
'label': 'Archive.org Borrow',
|
||||
'requires_auth': True,
|
||||
'book_id': book_id,
|
||||
'priority': 2 # Second priority
|
||||
})
|
||||
logger.info(f"[UnifiedBookDownloader] Borrow option available for {book_id} (status: {status})")
|
||||
else:
|
||||
logger.debug(f"[UnifiedBookDownloader] Borrow not available for {book_id} (status: {status})")
|
||||
|
||||
# PRIORITY 3: Check Libgen fallback (by ISBN, no auth needed, most reliable)
|
||||
isbn = book_data.get('isbn', '')
|
||||
title = book_data.get('title', '')
|
||||
author = book_data.get('author', '')
|
||||
|
||||
if self.search_libgen:
|
||||
# Can use Libgen if we have ISBN OR title (or both)
|
||||
if isbn or title:
|
||||
options['methods'].append({
|
||||
'type': 'libgen',
|
||||
'label': 'Libgen Search & Download',
|
||||
'requires_auth': False,
|
||||
'isbn': isbn,
|
||||
'title': title,
|
||||
'author': author,
|
||||
'priority': 3 # Third priority (fallback)
|
||||
})
|
||||
logger.info(f"[UnifiedBookDownloader] Libgen fallback available (ISBN: {isbn if isbn else 'N/A'}, Title: {title})")
|
||||
|
||||
# Sort by priority (higher priority first)
|
||||
options['methods'].sort(key=lambda x: x.get('priority', 999))
|
||||
|
||||
return options
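A sketch of how the returned options are typically inspected; the book metadata below is invented for illustration:

downloader = UnifiedBookDownloader(config={}, output_dir='/tmp/books')
options = downloader.get_download_options({
    'title': 'Example Title',
    'author': 'Example Author',
    'isbn': '9780000000000',
    'openlibrary_id': 'OL8513721M',
})
for m in options['methods']:  # already sorted by ascending priority value
    print(m['priority'], m['type'], m['label'])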
|
||||
|
||||
def _has_archive_credentials(self) -> bool:
|
||||
"""Check if Archive.org credentials are available."""
|
||||
try:
|
||||
from helper.archive_client import credential_openlibrary
|
||||
email, password = credential_openlibrary(self.config)
|
||||
return bool(email and password)
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _check_book_lendable_status(self, ol_id: str) -> Tuple[bool, Optional[str]]:
|
||||
"""Check if a book is lendable via OpenLibrary API.
|
||||
|
||||
Queries: https://openlibrary.org/api/volumes/brief/json/OLID:{ol_id}
|
||||
Note: Only works with Edition IDs (OL...M), not Work IDs (OL...W)
|
||||
|
||||
Args:
|
||||
ol_id: OpenLibrary ID (e.g., OL8513721M for Edition or OL4801915W for Work)
|
||||
|
||||
Returns:
|
||||
Tuple of (is_lendable: bool, status_reason: Optional[str])
|
||||
"""
|
||||
try:
|
||||
if not ol_id.startswith('OL'):
|
||||
return False, "Invalid OpenLibrary ID format"
|
||||
|
||||
# If this is a Work ID (ends with W), we can't query Volumes API
|
||||
# Work IDs are abstract umbrella records, not specific editions
|
||||
if ol_id.endswith('W'):
|
||||
logger.debug(f"[UnifiedBookDownloader] Work ID {ol_id} - skipping Volumes API (not lendable)")
|
||||
return False, "Work ID not supported by Volumes API (not a specific edition)"
|
||||
|
||||
# If it ends with M, it's an Edition ID - proceed with query
|
||||
if not ol_id.endswith('M'):
|
||||
logger.debug(f"[UnifiedBookDownloader] Unknown ID type {ol_id} (not M or W)")
|
||||
return False, "Invalid OpenLibrary ID type"
|
||||
|
||||
url = f"https://openlibrary.org/api/volumes/brief/json/OLID:{ol_id}"
|
||||
response = self.session.get(url, timeout=10)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
# Empty response means no records found
|
||||
if not data:
|
||||
logger.debug(f"[UnifiedBookDownloader] Empty response for {ol_id}")
|
||||
return False, "No availability data found"
|
||||
|
||||
# The response is wrapped in OLID key
|
||||
olid_key = f"OLID:{ol_id}"
|
||||
if olid_key not in data:
|
||||
logger.debug(f"[UnifiedBookDownloader] OLID key not found in response")
|
||||
return False, "No availability data found"
|
||||
|
||||
olid_data = data[olid_key]
|
||||
|
||||
# Check items array for lendable status
|
||||
if olid_data.get('items'):
|
||||
items = olid_data['items']
|
||||
|
||||
# Check the first item for lending status
|
||||
first_item = items[0]
|
||||
|
||||
# Handle both dict and string representations (PowerShell converts to string)
|
||||
if isinstance(first_item, dict):
|
||||
status = first_item.get('status', '')
|
||||
else:
|
||||
# String representation - check if 'lendable' is in it
|
||||
status = str(first_item).lower()
|
||||
|
||||
is_lendable = 'lendable' in str(status).lower()
|
||||
|
||||
if is_lendable:
|
||||
logger.info(f"[UnifiedBookDownloader] Book {ol_id} is lendable")
|
||||
return True, "LENDABLE"
|
||||
else:
status_str = status if isinstance(status, str) and status else 'NOT_LENDABLE'
logger.debug(f"[UnifiedBookDownloader] Book {ol_id} is not lendable (status: {status_str})")
return False, status_str
|
||||
else:
|
||||
# No items array or empty
|
||||
logger.debug(f"[UnifiedBookDownloader] No items found for {ol_id}")
|
||||
return False, "Not available for lending"
|
||||
|
||||
except requests.exceptions.Timeout:
|
||||
logger.warning(f"[UnifiedBookDownloader] OpenLibrary API timeout for {ol_id}")
|
||||
return False, "API timeout"
|
||||
except Exception as e:
|
||||
logger.debug(f"[UnifiedBookDownloader] Failed to check lendable status for {ol_id}: {e}")
|
||||
return False, f"API error: {e}"
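For reference, the checks above imply a brief Volumes API response shaped roughly like the following (reconstructed from this method, not quoted from OpenLibrary documentation):

# {
#   "OLID:OL8513721M": {
#       "items": [
#           {"status": "lendable", ...}   # any other status string is treated as not lendable
#       ],
#       ...
#   }
# }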
|
||||
|
||||
|
||||
async def download_book(self, method: Dict[str, Any], output_dir: Optional[str] = None) -> Tuple[bool, str]:
|
||||
"""Download a book using the specified method.
|
||||
|
||||
Args:
|
||||
method: Download method dict from get_download_options()
|
||||
output_dir: Directory to save the book
|
||||
|
||||
Returns:
|
||||
Tuple of (success: bool, message: str)
|
||||
"""
|
||||
output_dir = output_dir or self.output_dir or str(Path.home() / "Downloads")
|
||||
method_type = method.get('type', '')
|
||||
|
||||
logger.info(f"[UnifiedBookDownloader] Starting download with method: {method_type}")
|
||||
|
||||
try:
|
||||
if method_type == 'archive.org_direct':
|
||||
return await self._download_archive_direct(method, output_dir)
|
||||
|
||||
elif method_type == 'archive.org_borrow':
|
||||
return await self._download_archive_borrow(method, output_dir)
|
||||
|
||||
elif method_type == 'libgen':
|
||||
return await self._download_libgen(method, output_dir)
|
||||
|
||||
else:
|
||||
return False, f"Unknown download method: {method_type}"
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[UnifiedBookDownloader] Download error: {e}", exc_info=True)
|
||||
return False, f"Download failed: {str(e)}"
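A hypothetical end-to-end call combining option discovery with the dispatcher above (paths and metadata are placeholders):

async def _demo() -> None:
    downloader = UnifiedBookDownloader(config={}, output_dir='/tmp/books')
    options = downloader.get_download_options({'openlibrary_id': 'OL8513721M', 'title': 'Example'})
    if options['methods']:
        ok, message = await downloader.download_book(options['methods'][0])
        print(ok, message)
    downloader.close()

# asyncio.run(_demo())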
|
||||
|
||||
async def _download_archive_direct(self, method: Dict[str, Any], output_dir: str) -> Tuple[bool, str]:
|
||||
"""Download directly from Archive.org."""
|
||||
try:
|
||||
pdf_url = method.get('pdf_url', '')
|
||||
book_id = method.get('book_id', '')
|
||||
|
||||
if not pdf_url:
|
||||
return False, "No PDF URL available"
|
||||
|
||||
# Determine output filename
|
||||
filename = f"{book_id}.pdf"
|
||||
output_path = Path(output_dir) / filename
|
||||
|
||||
logger.info(f"[UnifiedBookDownloader] Downloading PDF from: {pdf_url}")
|
||||
|
||||
# Download in a thread to avoid blocking
|
||||
loop = asyncio.get_event_loop()
|
||||
success = await loop.run_in_executor(
|
||||
None,
|
||||
self._download_file,
|
||||
pdf_url,
|
||||
str(output_path)
|
||||
)
|
||||
|
||||
if success:
|
||||
logger.info(f"[UnifiedBookDownloader] Successfully downloaded to: {output_path}")
|
||||
return True, f"Downloaded to: {output_path}"
|
||||
else:
|
||||
return False, "Failed to download PDF"
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[UnifiedBookDownloader] Archive direct download error: {e}")
|
||||
return False, f"Archive download failed: {str(e)}"
|
||||
|
||||
async def _download_archive_borrow(self, method: Dict[str, Any], output_dir: str) -> Tuple[bool, str]:
|
||||
"""Download via Archive.org borrowing (requires credentials).
|
||||
|
||||
Process (follows archive_client.py pattern):
|
||||
1. Login to Archive.org with credentials
|
||||
2. Call loan endpoint to borrow the book (14-day loan)
|
||||
3. Get book info (page links, metadata)
|
||||
4. Download all pages as images
|
||||
5. Merge images into PDF
|
||||
|
||||
The loan function from archive_client.py handles:
|
||||
- Checking if book needs borrowing (status 400 = "doesn't need to be borrowed")
|
||||
- Creating borrow token for access
|
||||
- Handling borrow failures
|
||||
|
||||
get_book_infos() extracts page links from the borrowed book viewer
|
||||
download() downloads all pages using thread pool
|
||||
img2pdf merges pages into searchable PDF
|
||||
"""
|
||||
try:
|
||||
from helper.archive_client import credential_openlibrary
|
||||
|
||||
book_id = method.get('book_id', '')
|
||||
|
||||
# Get credentials
|
||||
email, password = credential_openlibrary(self.config)
|
||||
if not email or not password:
|
||||
return False, "Archive.org credentials not configured"
|
||||
|
||||
logger.info(f"[UnifiedBookDownloader] Logging into Archive.org...")
|
||||
|
||||
# Login and borrow (in thread, following download_book.py pattern)
|
||||
loop = asyncio.get_event_loop()
|
||||
borrow_result = await loop.run_in_executor(
|
||||
None,
|
||||
self._archive_borrow_and_download,
|
||||
email,
|
||||
password,
|
||||
book_id,
|
||||
output_dir
|
||||
)
|
||||
|
||||
if borrow_result and isinstance(borrow_result, tuple):
|
||||
success, filepath = borrow_result
|
||||
if success:
|
||||
logger.info(f"[UnifiedBookDownloader] Borrow succeeded: {filepath}")
|
||||
return True, filepath
|
||||
else:
|
||||
logger.warning(f"[UnifiedBookDownloader] Borrow failed: {filepath}")
|
||||
return False, filepath
|
||||
else:
|
||||
return False, "Failed to borrow book from Archive.org"
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[UnifiedBookDownloader] Archive borrow error: {e}")
|
||||
return False, f"Archive borrow failed: {str(e)}"
|
||||
|
||||
async def _download_libgen(self, method: Dict[str, Any], output_dir: str) -> Tuple[bool, str]:
|
||||
"""Download via Libgen search and download with mirror fallback."""
|
||||
try:
|
||||
isbn = method.get('isbn', '')
|
||||
title = method.get('title', '')
|
||||
|
||||
if not isbn and not title:
|
||||
return False, "Need ISBN or title for Libgen search"
|
||||
|
||||
if not self.search_libgen:
|
||||
return False, "Libgen searcher not available"
|
||||
|
||||
# Define wrapper functions to safely call the methods
|
||||
search_func = self.search_libgen
|
||||
if search_func is None:
|
||||
return False, "Search function not available"
|
||||
|
||||
preloaded_results = method.get('results')
|
||||
loop = asyncio.get_event_loop()
|
||||
|
||||
if preloaded_results:
|
||||
results = list(preloaded_results)
|
||||
if not results:
|
||||
results = await loop.run_in_executor(None, lambda: search_func(isbn or title, 10))
|
||||
else:
|
||||
results = await loop.run_in_executor(None, lambda: search_func(isbn or title, 10))
|
||||
|
||||
if not results:
|
||||
logger.warning(f"[UnifiedBookDownloader] No Libgen results for: {isbn or title}")
|
||||
return False, f"No Libgen results found for: {isbn or title}"
|
||||
|
||||
logger.info(f"[UnifiedBookDownloader] Found {len(results)} Libgen results")
|
||||
|
||||
# Determine output filename (use first result for naming)
|
||||
first_result = results[0]
|
||||
filename = f"{first_result.get('title', 'book')}"
|
||||
filename = "".join(c for c in filename if c.isalnum() or c in (' ', '.', '-'))[:100]
|
||||
|
||||
# Try each result's mirror until one succeeds
|
||||
for idx, result in enumerate(results, 1):
|
||||
mirror_url = result.get('mirror_url', '')
|
||||
|
||||
if not mirror_url:
|
||||
logger.debug(f"[UnifiedBookDownloader] Result {idx}: No mirror URL")
|
||||
continue
|
||||
|
||||
# Use extension from this result if available
|
||||
extension = result.get('extension', 'pdf')
|
||||
if extension and not extension.startswith('.'):
|
||||
extension = f".{extension}"
|
||||
elif not extension:
|
||||
extension = '.pdf'
|
||||
|
||||
output_path = Path(output_dir) / (filename + extension)
|
||||
|
||||
logger.info(f"[UnifiedBookDownloader] Trying mirror {idx}/{len(results)}: {mirror_url}")
|
||||
|
||||
download_func = self.download_from_mirror
|
||||
if download_func is None:
|
||||
return False, "Download function not available"
|
||||
|
||||
download_callable = cast(Callable[[str, str], bool], download_func)
|
||||
|
||||
def download_wrapper():
|
||||
return download_callable(mirror_url, str(output_path))
|
||||
|
||||
# Download (in thread)
|
||||
try:
|
||||
success = await loop.run_in_executor(None, download_wrapper)
|
||||
|
||||
if success:
|
||||
# Validate downloaded file is not HTML (common Libgen issue)
|
||||
if output_path.exists():
|
||||
try:
|
||||
with open(output_path, 'rb') as f:
|
||||
file_start = f.read(1024).decode('utf-8', errors='ignore').lower()
|
||||
if '<!doctype' in file_start or '<html' in file_start:
|
||||
logger.warning(f"[UnifiedBookDownloader] Mirror {idx} returned HTML instead of file, trying next mirror...")
|
||||
output_path.unlink() # Delete the HTML file
|
||||
continue
|
||||
except Exception as e:
|
||||
logger.debug(f"[UnifiedBookDownloader] Could not validate file content: {e}")
|
||||
|
||||
logger.info(f"[UnifiedBookDownloader] Successfully downloaded from mirror {idx} to: {output_path}")
|
||||
return True, str(output_path)
|
||||
else:
|
||||
logger.warning(f"[UnifiedBookDownloader] Mirror {idx} download failed, trying next...")
|
||||
except Exception as e:
|
||||
logger.warning(f"[UnifiedBookDownloader] Mirror {idx} error: {e}, trying next...")
|
||||
continue
|
||||
|
||||
return False, f"All {len(results)} mirrors failed"
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[UnifiedBookDownloader] Libgen download error: {e}")
|
||||
return False, f"Libgen download failed: {str(e)}"
|
||||
|
||||
async def download_libgen_selection(
|
||||
self,
|
||||
selected: Dict[str, Any],
|
||||
remaining: Optional[List[Dict[str, Any]]] = None,
|
||||
output_dir: Optional[str] = None,
|
||||
) -> Tuple[bool, str]:
|
||||
"""Download a specific Libgen result with optional fallbacks."""
|
||||
|
||||
if not isinstance(selected, dict):
|
||||
return False, "Selected result must be a dictionary"
|
||||
|
||||
ordered_results: List[Dict[str, Any]] = [selected]
|
||||
if remaining:
|
||||
for item in remaining:
|
||||
if isinstance(item, dict) and item is not selected:
|
||||
ordered_results.append(item)
|
||||
|
||||
method: Dict[str, Any] = {
|
||||
'type': 'libgen',
|
||||
'isbn': selected.get('isbn', '') or '',
|
||||
'title': selected.get('title', '') or '',
|
||||
'author': selected.get('author', '') or '',
|
||||
'results': ordered_results,
|
||||
}
|
||||
|
||||
return await self.download_book(method, output_dir)
|
||||
|
||||
def download_libgen_selection_sync(
|
||||
self,
|
||||
selected: Dict[str, Any],
|
||||
remaining: Optional[List[Dict[str, Any]]] = None,
|
||||
output_dir: Optional[str] = None,
|
||||
) -> Tuple[bool, str]:
|
||||
"""Synchronous helper for downloading a Libgen selection."""
|
||||
|
||||
async def _run() -> Tuple[bool, str]:
|
||||
return await self.download_libgen_selection(selected, remaining, output_dir)
|
||||
|
||||
loop = asyncio.new_event_loop()
|
||||
try:
|
||||
asyncio.set_event_loop(loop)
|
||||
return loop.run_until_complete(_run())
|
||||
finally:
|
||||
loop.close()
|
||||
asyncio.set_event_loop(None)
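For synchronous callers, a single selected result can be passed straight through; the keys shown are the ones this class reads from Libgen results, with placeholder values:

downloader = UnifiedBookDownloader()
selected = {
    'title': 'Example Title',
    'isbn': '9780000000000',
    'author': 'Example Author',
    'mirror_url': 'http://libgen.example/get',  # placeholder mirror
    'extension': 'pdf',
}
ok, path_or_error = downloader.download_libgen_selection_sync(selected, remaining=[], output_dir='/tmp/books')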
|
||||
|
||||
def _download_file(self, url: str, output_path: str) -> bool:
|
||||
"""Download a file from URL."""
|
||||
try:
|
||||
response = requests.get(url, stream=True, timeout=30)
|
||||
response.raise_for_status()
|
||||
|
||||
with open(output_path, 'wb') as f:
|
||||
for chunk in response.iter_content(chunk_size=8192):
|
||||
if chunk:
|
||||
f.write(chunk)
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"[UnifiedBookDownloader] File download error: {e}")
|
||||
return False
|
||||
|
||||
def _archive_borrow_and_download(self, email: str, password: str, book_id: str, output_dir: str) -> Tuple[bool, str]:
|
||||
"""Borrow a book from Archive.org and download pages as PDF.
|
||||
|
||||
This follows the exact process from archive_client.py:
|
||||
1. Login with credentials
|
||||
2. Call loan() to create 14-day borrow
|
||||
3. Get book info (extract page URLs)
|
||||
4. Download all pages as images
|
||||
5. Merge images into searchable PDF
|
||||
|
||||
Returns tuple of (success: bool, filepath/message: str)
|
||||
"""
|
||||
try:
|
||||
from helper.archive_client import login, loan, get_book_infos, download
|
||||
import tempfile
|
||||
import shutil
|
||||
|
||||
logger.info(f"[UnifiedBookDownloader] Logging into Archive.org as {email}")
|
||||
session = login(email, password)
|
||||
|
||||
logger.info(f"[UnifiedBookDownloader] Attempting to borrow book: {book_id}")
|
||||
# Call loan to create the 14-day borrow
|
||||
session = loan(session, book_id, verbose=True)
|
||||
|
||||
# If we get here, borrowing succeeded
|
||||
logger.info(f"[UnifiedBookDownloader] Successfully borrowed book: {book_id}")
|
||||
|
||||
# Now get the book info (page URLs and metadata)
|
||||
logger.info(f"[UnifiedBookDownloader] Extracting book page information...")
|
||||
# Try both URL formats: with /borrow and without
|
||||
book_urls = [
|
||||
f"https://archive.org/borrow/{book_id}", # Try borrow page first (for borrowed books)
|
||||
f"https://archive.org/details/{book_id}" # Fallback to details page
|
||||
]
|
||||
|
||||
title = None
|
||||
links = None
|
||||
metadata = None
|
||||
last_error = None
|
||||
|
||||
for book_url in book_urls:
|
||||
try:
|
||||
logger.debug(f"[UnifiedBookDownloader] Trying to get book info from: {book_url}")
|
||||
response = session.get(book_url, timeout=10)
|
||||
|
||||
# Log response status
|
||||
if response.status_code != 200:
|
||||
logger.debug(f"[UnifiedBookDownloader] URL returned {response.status_code}: {book_url}")
|
||||
# Continue to try next URL
|
||||
continue
|
||||
|
||||
# Try to parse the response
|
||||
title, links, metadata = get_book_infos(session, book_url)
|
||||
logger.info(f"[UnifiedBookDownloader] Successfully got info from: {book_url}")
|
||||
logger.info(f"[UnifiedBookDownloader] Found {len(links)} pages to download")
|
||||
break
|
||||
except Exception as e:
|
||||
logger.debug(f"[UnifiedBookDownloader] Failed with {book_url}: {e}")
|
||||
last_error = e
|
||||
continue
|
||||
|
||||
if links is None:
|
||||
logger.error(f"[UnifiedBookDownloader] Failed to get book info from all URLs: {last_error}")
|
||||
# Borrow extraction failed - return False
|
||||
return False, "Could not extract borrowed book pages"
|
||||
|
||||
# Create temporary directory for images
|
||||
temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=output_dir)
|
||||
logger.info(f"[UnifiedBookDownloader] Downloading {len(links)} pages to temporary directory...")
|
||||
|
||||
try:
|
||||
# Download all pages (uses thread pool)
|
||||
images = download(
|
||||
session=session,
|
||||
n_threads=10,
|
||||
directory=temp_dir,
|
||||
links=links,
|
||||
scale=3, # Default resolution
|
||||
book_id=book_id
|
||||
)
|
||||
|
||||
logger.info(f"[UnifiedBookDownloader] Downloaded {len(images)} pages")
|
||||
|
||||
# Try to merge pages into PDF
|
||||
try:
|
||||
import img2pdf
|
||||
logger.info(f"[UnifiedBookDownloader] Merging pages into PDF...")
|
||||
|
||||
# Prepare PDF metadata
|
||||
pdfmeta = {}
|
||||
if metadata:
|
||||
if "title" in metadata:
|
||||
pdfmeta["title"] = metadata["title"]
|
||||
if "creator" in metadata:
|
||||
pdfmeta["author"] = metadata["creator"]
|
||||
pdfmeta["keywords"] = [f"https://archive.org/details/{book_id}"]
|
||||
pdfmeta["creationdate"] = None # Avoid timezone issues
|
||||
|
||||
# Convert images to PDF
|
||||
pdf_content = img2pdf.convert(images, **pdfmeta) if images else None
|
||||
if not pdf_content:
|
||||
logger.error(f"[UnifiedBookDownloader] PDF conversion failed")
|
||||
return False, "Failed to convert pages to PDF"
|
||||
|
||||
# Save the PDF
|
||||
pdf_filename = f"{title}.pdf" if title else "book.pdf"
|
||||
pdf_path = Path(output_dir) / pdf_filename
|
||||
|
||||
# Handle duplicate filenames
|
||||
i = 1
|
||||
while pdf_path.exists():
|
||||
pdf_path = Path(output_dir) / f"{title or 'book'}({i}).pdf"
|
||||
i += 1
|
||||
|
||||
with open(pdf_path, 'wb') as f:
|
||||
f.write(pdf_content)
|
||||
|
||||
logger.info(f"[UnifiedBookDownloader] Successfully created PDF: {pdf_path}")
|
||||
|
||||
return True, str(pdf_path)
|
||||
|
||||
except ImportError:
|
||||
logger.warning(f"[UnifiedBookDownloader] img2pdf not available, saving as JPG collection instead")
|
||||
|
||||
# Create JPG collection directory
|
||||
if not title:
|
||||
title = f"book_{book_id}"
|
||||
jpg_dir = Path(output_dir) / title
|
||||
i = 1
|
||||
while jpg_dir.exists():
|
||||
jpg_dir = Path(output_dir) / f"{title}({i})"
|
||||
i += 1
|
||||
|
||||
# Move temporary directory to final location
|
||||
shutil.move(temp_dir, str(jpg_dir))
|
||||
temp_dir = None # Mark as already moved
|
||||
|
||||
logger.info(f"[UnifiedBookDownloader] Saved as JPG collection: {jpg_dir}")
|
||||
return True, str(jpg_dir)
|
||||
|
||||
finally:
|
||||
# Clean up temporary directory if it still exists
|
||||
if temp_dir and Path(temp_dir).exists():
|
||||
shutil.rmtree(temp_dir)
|
||||
|
||||
except SystemExit:
|
||||
# loan() function calls sys.exit on failure - catch it
|
||||
logger.error(f"[UnifiedBookDownloader] Borrow process exited (book may not be borrowable)")
|
||||
return False, "Book could not be borrowed (may not be available for borrowing)"
|
||||
except Exception as e:
|
||||
logger.error(f"[UnifiedBookDownloader] Archive borrow error: {e}")
|
||||
return False, f"Borrow failed: {str(e)}"
|
||||
|
||||
def close(self) -> None:
|
||||
"""Close the session."""
|
||||
self.session.close()
|
||||
492
helper/utils.py
Normal file
@@ -0,0 +1,492 @@
|
||||
"""General-purpose helpers used across the downlow CLI."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import hashlib
|
||||
import ffmpeg
|
||||
import base64
|
||||
import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable
|
||||
from datetime import datetime
|
||||
from dataclasses import dataclass, field
|
||||
from fnmatch import fnmatch
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import helper.utils_constant
|
||||
|
||||
try:
|
||||
import cbor2
|
||||
except ImportError:
|
||||
cbor2 = None # type: ignore
|
||||
|
||||
CHUNK_SIZE = 1024 * 1024 # 1 MiB
|
||||
_format_logger = logging.getLogger(__name__)
|
||||
def ensure_directory(path: Path) -> None:
|
||||
"""Ensure *path* exists as a directory."""
|
||||
try:
|
||||
path.mkdir(parents=True, exist_ok=True)
|
||||
except OSError as exc: # pragma: no cover - surfaced to caller
|
||||
raise RuntimeError(f"Failed to create directory {path}: {exc}") from exc
|
||||
def unique_path(path: Path) -> Path:
|
||||
"""Return a unique path by appending " (n)" if needed."""
|
||||
if not path.exists():
|
||||
return path
|
||||
stem = path.stem
|
||||
suffix = path.suffix
|
||||
parent = path.parent
|
||||
counter = 1
|
||||
while True:
|
||||
candidate = parent / f"{stem} ({counter}){suffix}"
|
||||
if not candidate.exists():
|
||||
return candidate
|
||||
counter += 1
|
||||
|
||||
def sanitize_metadata_value(value: Any) -> str | None:
|
||||
if value is None:
|
||||
return None
|
||||
if not isinstance(value, str):
|
||||
value = str(value)
|
||||
value = value.replace('\x00', ' ').replace('\r', ' ').replace('\n', ' ').strip()
|
||||
if not value:
|
||||
return None
|
||||
return value
|
||||
def unique_preserve_order(values: Iterable[str]) -> list[str]:
|
||||
seen: set[str] = set()
|
||||
ordered: list[str] = []
|
||||
for value in values:
|
||||
if value not in seen:
|
||||
seen.add(value)
|
||||
ordered.append(value)
|
||||
return ordered
|
||||
def sha256_file(file_path: Path) -> str:
|
||||
"""Return the SHA-256 hex digest of *path*."""
|
||||
hasher = hashlib.sha256()
|
||||
with file_path.open('rb') as handle:
|
||||
for chunk in iter(lambda: handle.read(CHUNK_SIZE), b''):
|
||||
hasher.update(chunk)
|
||||
return hasher.hexdigest()
|
||||
|
||||
|
||||
def create_metadata_sidecar(file_path: Path, metadata: dict) -> None:
|
||||
"""Create a .metadata sidecar file with JSON metadata.
|
||||
|
||||
The metadata dict should contain title. If not present, it will be derived from
|
||||
the filename. This ensures the .metadata file can be matched during batch import.
|
||||
|
||||
Args:
|
||||
file_path: Path to the exported file
|
||||
metadata: Dictionary of metadata to save
|
||||
"""
|
||||
if not metadata:
|
||||
return
|
||||
file_name = file_path.stem
|
||||
file_ext = file_path.suffix.lower()
|
||||
# Ensure metadata has a title field that matches the filename (without extension)
|
||||
# This allows the sidecar to be matched and imported properly during batch import
|
||||
if 'title' not in metadata or not metadata.get('title'):
|
||||
metadata['title'] = file_name
|
||||
metadata['hash'] = sha256_file(file_path)
|
||||
metadata['size'] = Path(file_path).stat().st_size
|
||||
format_found = False
|
||||
for mime_type, ext_map in helper.utils_constant.mime_maps.items():
|
||||
for key, info in ext_map.items():
|
||||
if info.get("ext") == file_ext:
|
||||
metadata['type'] = mime_type
|
||||
format_found = True
|
||||
break
|
||||
if format_found:
|
||||
break
|
||||
else:
|
||||
metadata['type'] = 'unknown'
|
||||
try:
metadata.update(ffprobe(str(file_path)))
except Exception:
# Non-media files (documents, archives) cannot be probed by ffmpeg; keep the basic metadata only
pass
|
||||
|
||||
|
||||
metadata_path = file_path.with_suffix(file_path.suffix + '.metadata')
|
||||
try:
|
||||
with open(metadata_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(metadata, f, ensure_ascii=False, indent=2)
|
||||
except OSError as exc:
|
||||
raise RuntimeError(f"Failed to write metadata sidecar {metadata_path}: {exc}") from exc
|
||||
|
||||
def create_tags_sidecar(file_path: Path, tags: set) -> None:
|
||||
"""Create a .tags sidecar file with tags (one per line).
|
||||
|
||||
Args:
|
||||
file_path: Path to the exported file
|
||||
tags: Set of tag strings
|
||||
"""
|
||||
if not tags:
|
||||
return
|
||||
|
||||
tags_path = file_path.with_suffix(file_path.suffix + '.tags')
|
||||
try:
|
||||
with open(tags_path, 'w', encoding='utf-8') as f:
|
||||
for tag in sorted(tags):
|
||||
f.write(f"{tag}\n")
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Failed to create tags sidecar {tags_path}: {e}") from e
|
||||
|
||||
|
||||
def ffprobe(file_path: str) -> dict:
|
||||
probe = ffmpeg.probe(file_path)
|
||||
metadata = {}
|
||||
|
||||
# Format-level info
|
||||
fmt = probe.get("format", {})
|
||||
metadata["duration"] = float(fmt.get("duration", 0)) if "duration" in fmt else None
|
||||
metadata["size"] = int(fmt.get("size", 0)) if "size" in fmt else None
|
||||
metadata["format_name"] = fmt.get("format_name", None)
|
||||
|
||||
# Stream-level info
|
||||
for stream in probe.get("streams", []):
|
||||
codec_type = stream.get("codec_type")
|
||||
if codec_type == "audio":
|
||||
metadata["audio_codec"] = stream.get("codec_name")
|
||||
metadata["bitrate"] = int(stream.get("bit_rate", 0)) if "bit_rate" in stream else None
|
||||
metadata["samplerate"] = int(stream.get("sample_rate", 0)) if "sample_rate" in stream else None
|
||||
metadata["channels"] = int(stream.get("channels", 0)) if "channels" in stream else None
|
||||
elif codec_type == "video":
|
||||
metadata["video_codec"] = stream.get("codec_name")
|
||||
metadata["width"] = int(stream.get("width", 0)) if "width" in stream else None
|
||||
metadata["height"] = int(stream.get("height", 0)) if "height" in stream else None
|
||||
elif codec_type == "image":
|
||||
metadata["image_codec"] = stream.get("codec_name")
|
||||
metadata["width"] = int(stream.get("width", 0)) if "width" in stream else None
|
||||
metadata["height"] = int(stream.get("height", 0)) if "height" in stream else None
|
||||
|
||||
return metadata
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# CBOR Utilities - Consolidated from cbor.py
|
||||
# ============================================================================
|
||||
"""CBOR utilities backed by the `cbor2` library."""
|
||||
|
||||
|
||||
def decode_cbor(data: bytes) -> Any:
|
||||
"""Decode *data* from CBOR into native Python objects."""
|
||||
if not data:
|
||||
return None
|
||||
if cbor2 is None:
|
||||
raise ImportError("cbor2 library is required for CBOR decoding")
|
||||
return cbor2.loads(data)
|
||||
|
||||
|
||||
def jsonify(value: Any) -> Any:
|
||||
"""Convert *value* into a JSON-friendly structure."""
|
||||
if isinstance(value, dict):
|
||||
return {str(key): jsonify(val) for key, val in value.items()}
|
||||
if isinstance(value, list):
|
||||
return [jsonify(item) for item in value]
|
||||
if isinstance(value, bytes):
|
||||
return {"__bytes__": base64.b64encode(value).decode("ascii")}
|
||||
return value
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Format Utilities - Consolidated from format_utils.py
|
||||
# ============================================================================
|
||||
"""Formatting utilities for displaying metadata consistently across the application."""
|
||||
|
||||
|
||||
def format_bytes(bytes_value) -> str:
|
||||
"""Format bytes to human-readable format (e.g., '1.5 MB', '250 KB').
|
||||
|
||||
Args:
|
||||
bytes_value: Size in bytes (int or float)
|
||||
|
||||
Returns:
|
||||
Formatted string like '1.5 MB' or '756 MB'
|
||||
"""
|
||||
if bytes_value is None or (isinstance(bytes_value, (int, float)) and bytes_value <= 0):
|
||||
return "0 B"
|
||||
|
||||
if isinstance(bytes_value, (int, float)):
|
||||
for unit in ("B", "KB", "MB", "GB", "TB"):
|
||||
if bytes_value < 1024:
|
||||
if unit == "B":
|
||||
return f"{int(bytes_value)} {unit}"
|
||||
return f"{bytes_value:.1f} {unit}"
|
||||
bytes_value /= 1024
|
||||
return f"{bytes_value:.1f} PB"
|
||||
return str(bytes_value)
|
||||
|
||||
|
||||
def format_duration(seconds) -> str:
"""Format duration in seconds to a compact time string (e.g., '1:23:45', '5:30', '45s').
|
||||
|
||||
Args:
|
||||
seconds: Duration in seconds (int or float)
|
||||
|
||||
Returns:
|
||||
Formatted string like '1:23:45' or '5:30'
|
||||
"""
|
||||
if seconds is None or seconds == '':
|
||||
return "N/A"
|
||||
|
||||
if isinstance(seconds, str):
|
||||
try:
|
||||
seconds = float(seconds)
|
||||
except ValueError:
|
||||
return str(seconds)
|
||||
|
||||
if not isinstance(seconds, (int, float)):
|
||||
return str(seconds)
|
||||
|
||||
total_seconds = int(seconds)
|
||||
if total_seconds < 0:
|
||||
return "N/A"
|
||||
|
||||
hours = total_seconds // 3600
|
||||
minutes = (total_seconds % 3600) // 60
|
||||
secs = total_seconds % 60
|
||||
|
||||
if hours > 0:
|
||||
return f"{hours}:{minutes:02d}:{secs:02d}"
|
||||
elif minutes > 0:
|
||||
return f"{minutes}:{secs:02d}"
|
||||
else:
|
||||
return f"{secs}s"
|
||||
|
||||
|
||||
def format_timestamp(timestamp_str) -> str:
|
||||
"""Format ISO timestamp to readable format.
|
||||
|
||||
Args:
|
||||
timestamp_str: ISO format timestamp string or None
|
||||
|
||||
Returns:
|
||||
Formatted string like "2025-10-28 19:36:01" or original string if parsing fails
|
||||
"""
|
||||
if not timestamp_str:
|
||||
return "N/A"
|
||||
|
||||
try:
|
||||
# Handle ISO format timestamps
|
||||
if isinstance(timestamp_str, str):
|
||||
# Try parsing ISO format
|
||||
if 'T' in timestamp_str:
|
||||
dt = datetime.fromisoformat(timestamp_str.replace('Z', '+00:00'))
|
||||
else:
|
||||
# Try other common formats
|
||||
dt = datetime.fromisoformat(timestamp_str)
|
||||
return dt.strftime("%Y-%m-%d %H:%M:%S")
|
||||
except Exception as e:
|
||||
_format_logger.debug(f"Could not parse timestamp '{timestamp_str}': {e}")
|
||||
|
||||
return str(timestamp_str)
|
||||
|
||||
|
||||
def format_metadata_value(key: str, value) -> str:
|
||||
"""Format a metadata value based on its key for display.
|
||||
|
||||
This is the central formatting rule for all metadata display.
|
||||
|
||||
Args:
|
||||
key: Metadata field name
|
||||
value: Value to format
|
||||
|
||||
Returns:
|
||||
Formatted string for display
|
||||
"""
|
||||
if value is None or value == '':
|
||||
return "N/A"
|
||||
|
||||
# Apply field-specific formatting
|
||||
if key in ('size', 'file_size'):
|
||||
return format_bytes(value)
|
||||
elif key in ('duration', 'length'):
|
||||
return format_duration(value)
|
||||
elif key in ('time_modified', 'time_imported', 'created_at', 'updated_at', 'indexed_at', 'timestamp'):
|
||||
return format_timestamp(value)
|
||||
else:
|
||||
return str(value)
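A few quick checks of the expected output of these formatters (inputs chosen for illustration):

assert format_bytes(1536) == '1.5 KB'
assert format_duration(3725) == '1:02:05'
assert format_metadata_value('size', 1536) == '1.5 KB'
assert format_metadata_value('duration', 90) == '1:30'
assert format_metadata_value('time_imported', '2025-10-28T19:36:01Z') == '2025-10-28 19:36:01'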
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Link Utilities - Consolidated from link_utils.py
|
||||
# ============================================================================
|
||||
"""Link utilities - Extract and process URLs from various sources."""
|
||||
|
||||
|
||||
def extract_link_from_args(args: Iterable[str]) -> Any | None:
|
||||
"""Extract HTTP/HTTPS URL from command arguments.
|
||||
|
||||
Args:
|
||||
args: Command arguments
|
||||
|
||||
Returns:
|
||||
URL string if found, None otherwise
|
||||
"""
|
||||
args_list = list(args) if not isinstance(args, (list, tuple)) else args
|
||||
if not args_list:
|
||||
return None
|
||||
|
||||
potential_link = str(args_list[0])
|
||||
if potential_link.startswith(('http://', 'https://')):
|
||||
return potential_link
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def extract_link_from_result(result: Any) -> Any | None:
|
||||
"""Extract URL from a result object (dict or object with attributes).
|
||||
|
||||
Args:
|
||||
result: Result object from pipeline (dict or object)
|
||||
|
||||
Returns:
|
||||
URL string if found, None otherwise
|
||||
"""
|
||||
if isinstance(result, dict):
|
||||
return result.get('url') or result.get('link') or result.get('href')
|
||||
|
||||
return (
|
||||
getattr(result, 'url', None) or
|
||||
getattr(result, 'link', None) or
|
||||
getattr(result, 'href', None)
|
||||
)
|
||||
|
||||
|
||||
def extract_link(result: Any, args: Iterable[str]) -> Any | None:
|
||||
"""Extract link from args or result (args take priority).
|
||||
|
||||
Args:
|
||||
result: Pipeline result object
|
||||
args: Command arguments
|
||||
|
||||
Returns:
|
||||
URL string if found, None otherwise
|
||||
"""
|
||||
# Try args first
|
||||
link = extract_link_from_args(args)
|
||||
if link:
|
||||
return link
|
||||
|
||||
# Fall back to result
|
||||
return extract_link_from_result(result)
|
||||
|
||||
|
||||
def get_api_key(config: dict[str, Any], service: str, key_path: str) -> str | None:
|
||||
"""Get API key from config with fallback support.
|
||||
|
||||
Args:
|
||||
config: Configuration dictionary
|
||||
service: Service name for logging
|
||||
key_path: Dot-notation path to key (e.g., "Debrid.All-debrid")
|
||||
|
||||
Returns:
|
||||
API key if found and not empty, None otherwise
|
||||
"""
|
||||
try:
|
||||
parts = key_path.split('.')
|
||||
value = config
|
||||
for part in parts:
|
||||
if isinstance(value, dict):
|
||||
value = value.get(part)
|
||||
else:
|
||||
return None
|
||||
|
||||
if isinstance(value, str):
|
||||
return value.strip() or None
|
||||
|
||||
return None
|
||||
except Exception:
|
||||
return None
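Example of the dot-notation lookup; the config contents are illustrative and the key path matches the docstring example:

config = {'Debrid': {'All-debrid': ' abc123 '}}
assert get_api_key(config, 'alldebrid', 'Debrid.All-debrid') == 'abc123'
assert get_api_key(config, 'alldebrid', 'Debrid.Missing') is None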
|
||||
|
||||
|
||||
def add_direct_link_to_result(result: Any, direct_link: str, original_link: str) -> None:
|
||||
"""Add direct link information to result object.
|
||||
|
||||
Args:
|
||||
result: Result object to modify (dict or object)
|
||||
direct_link: The unlocked/direct URL
|
||||
original_link: The original restricted URL
|
||||
"""
|
||||
if isinstance(result, dict):
|
||||
result['direct_link'] = direct_link
|
||||
result['original_link'] = original_link
|
||||
else:
|
||||
setattr(result, 'direct_link', direct_link)
|
||||
setattr(result, 'original_link', original_link)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# URL Policy Resolution - Consolidated from url_parser.py
|
||||
# ============================================================================
|
||||
"""URL policy resolution for downlow workflows."""
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class UrlPolicy:
|
||||
"""Describe how a URL should be handled by download and screenshot flows."""
|
||||
|
||||
skip_download: bool = False
|
||||
skip_metadata: bool = False
|
||||
force_screenshot: bool = False
|
||||
extra_tags: list[str] = field(default_factory=list)
|
||||
|
||||
def apply_tags(self, sources: Iterable[str]) -> list[str]:
|
||||
tags = [tag.strip() for tag in self.extra_tags if tag and tag.strip()]
|
||||
for value in sources:
|
||||
text = str(value).strip()
|
||||
if text:
|
||||
tags.append(text)
|
||||
return tags
|
||||
|
||||
|
||||
def _normalise_rule(rule: dict[str, Any]) -> dict[str, Any] | None:
|
||||
pattern = str(rule.get("pattern") or rule.get("host") or "").strip()
|
||||
if not pattern:
|
||||
return None
|
||||
skip_download = bool(rule.get("skip_download"))
|
||||
skip_metadata = bool(rule.get("skip_metadata"))
|
||||
force_screenshot = bool(rule.get("force_screenshot"))
|
||||
extra_tags_raw = rule.get("extra_tags")
|
||||
if isinstance(extra_tags_raw, str):
|
||||
extra_tags = [part.strip() for part in extra_tags_raw.split(",") if part.strip()]
|
||||
elif isinstance(extra_tags_raw, (list, tuple, set)):
|
||||
extra_tags = [str(item).strip() for item in extra_tags_raw if str(item).strip()]
|
||||
else:
|
||||
extra_tags = []
|
||||
return {
|
||||
"pattern": pattern,
|
||||
"skip_download": skip_download,
|
||||
"skip_metadata": skip_metadata,
|
||||
"force_screenshot": force_screenshot,
|
||||
"extra_tags": extra_tags,
|
||||
}
|
||||
|
||||
|
||||
def resolve_url_policy(config: dict[str, Any], url: str) -> UrlPolicy:
|
||||
policies_raw = config.get("url_policies")
|
||||
if not policies_raw:
|
||||
return UrlPolicy()
|
||||
if not isinstance(policies_raw, list):
|
||||
return UrlPolicy()
|
||||
parsed = urlparse(url)
|
||||
subject = f"{parsed.netloc}{parsed.path}"
|
||||
host = parsed.netloc
|
||||
resolved = UrlPolicy()
|
||||
for rule_raw in policies_raw:
|
||||
if not isinstance(rule_raw, dict):
|
||||
continue
|
||||
rule = _normalise_rule(rule_raw)
|
||||
if rule is None:
|
||||
continue
|
||||
pattern = rule["pattern"]
|
||||
if not (fnmatch(host, pattern) or fnmatch(subject, pattern)):
|
||||
continue
|
||||
if rule["skip_download"]:
|
||||
resolved.skip_download = True
|
||||
if rule["skip_metadata"]:
|
||||
resolved.skip_metadata = True
|
||||
if rule["force_screenshot"]:
|
||||
resolved.force_screenshot = True
|
||||
if rule["extra_tags"]:
|
||||
for tag in rule["extra_tags"]:
|
||||
if tag not in resolved.extra_tags:
|
||||
resolved.extra_tags.append(tag)
|
||||
return resolved
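A configuration sketch showing how a rule is matched; the pattern and tags are assumptions for illustration:

config = {
    'url_policies': [
        {'pattern': '*.example.com*', 'skip_download': True, 'extra_tags': 'source:example, nsfw:no'},
    ]
}
policy = resolve_url_policy(config, 'https://media.example.com/watch?v=1')
# policy.skip_download is True; policy.apply_tags(['title:demo']) ->
# ['source:example', 'nsfw:no', 'title:demo']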
|
||||
79
helper/utils_constant.py
Normal file
@@ -0,0 +1,79 @@
|
||||
mime_maps = {
|
||||
"image": {
|
||||
"jpg": { "ext": ".jpg", "mimes": ["image/jpeg", "image/jpg"] },
|
||||
"png": { "ext": ".png", "mimes": ["image/png"] },
|
||||
"gif": { "ext": ".gif", "mimes": ["image/gif"] },
|
||||
"webp": { "ext": ".webp", "mimes": ["image/webp"] },
|
||||
"avif": { "ext": ".avif", "mimes": ["image/avif"] },
|
||||
"jxl": { "ext": ".jxl", "mimes": ["image/jxl"] },
|
||||
"bmp": { "ext": ".bmp", "mimes": ["image/bmp"] },
|
||||
"heic": { "ext": ".heic", "mimes": ["image/heic"] },
|
||||
"heif": { "ext": ".heif", "mimes": ["image/heif"] },
|
||||
"ico": { "ext": ".ico", "mimes": ["image/x-icon", "image/vnd.microsoft.icon"] },
|
||||
"qoi": { "ext": ".qoi", "mimes": ["image/qoi"] },
|
||||
"tiff": { "ext": ".tiff", "mimes": ["image/tiff", "image/x-tiff"] },
|
||||
"svg": { "ext": ".svg", "mimes": ["image/svg+xml"] }
|
||||
},
|
||||
"image_sequence": {
|
||||
"apng": { "ext": ".apng", "mimes": ["image/apng"], "sequence": True },
|
||||
"avifs": { "ext": ".avifs", "mimes": ["image/avif-sequence"], "sequence": True },
|
||||
"heics": { "ext": ".heics", "mimes": ["image/heic-sequence"], "sequence": True },
|
||||
"heifs": { "ext": ".heifs", "mimes": ["image/heif-sequence"], "sequence": True }
|
||||
},
|
||||
"video": {
|
||||
"mp4": { "ext": ".mp4", "mimes": ["video/mp4", "audio/mp4"] },
|
||||
"webm": { "ext": ".webm", "mimes": ["video/webm", "audio/webm"] },
|
||||
"mov": { "ext": ".mov", "mimes": ["video/quicktime"] },
|
||||
"ogv": { "ext": ".ogv", "mimes": ["video/ogg"] },
|
||||
"mpeg": { "ext": ".mpeg", "mimes": ["video/mpeg"] },
|
||||
"avi": { "ext": ".avi", "mimes": ["video/x-msvideo", "video/avi"] },
|
||||
"flv": { "ext": ".flv", "mimes": ["video/x-flv"] },
|
||||
"mkv": { "ext": ".mkv", "mimes": ["video/x-matroska", "application/x-matroska"], "audio_only_ext": ".mka" },
|
||||
"wmv": { "ext": ".wmv", "mimes": ["video/x-ms-wmv"] },
|
||||
"rv": { "ext": ".rv", "mimes": ["video/vnd.rn-realvideo"] }
|
||||
},
|
||||
"audio": {
|
||||
"mp3": { "ext": ".mp3", "mimes": ["audio/mpeg", "audio/mp3"] },
|
||||
"m4a": { "ext": ".m4a", "mimes": ["audio/mp4", "audio/x-m4a"] },
|
||||
"ogg": { "ext": ".ogg", "mimes": ["audio/ogg"] },
|
||||
"flac": { "ext": ".flac", "mimes": ["audio/flac"] },
|
||||
"wav": { "ext": ".wav", "mimes": ["audio/wav", "audio/x-wav", "audio/vnd.wave"] },
|
||||
"wma": { "ext": ".wma", "mimes": ["audio/x-ms-wma"] },
|
||||
"tta": { "ext": ".tta", "mimes": ["audio/x-tta"] },
|
||||
"wv": { "ext": ".wv", "mimes": ["audio/x-wavpack", "audio/wavpack"] },
|
||||
"mka": { "ext": ".mka", "mimes": ["audio/x-matroska", "video/x-matroska"] }
|
||||
},
|
||||
"document": {
|
||||
"pdf": { "ext": ".pdf", "mimes": ["application/pdf"] },
|
||||
"epub": { "ext": ".epub", "mimes": ["application/epub+zip"] },
|
||||
"djvu": { "ext": ".djvu", "mimes": ["application/vnd.djvu"] },
|
||||
"rtf": { "ext": ".rtf", "mimes": ["application/rtf"] },
|
||||
"docx": { "ext": ".docx", "mimes": ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"] },
|
||||
"xlsx": { "ext": ".xlsx", "mimes": ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"] },
|
||||
"pptx": { "ext": ".pptx", "mimes": ["application/vnd.openxmlformats-officedocument.presentationml.presentation"] },
|
||||
"doc": { "ext": ".doc", "mimes": ["application/msword"] },
|
||||
"xls": { "ext": ".xls", "mimes": ["application/vnd.ms-excel"] },
|
||||
"ppt": { "ext": ".ppt", "mimes": ["application/vnd.ms-powerpoint"] }
|
||||
},
|
||||
"archive": {
|
||||
"zip": { "ext": ".zip", "mimes": ["application/zip"] },
|
||||
"7z": { "ext": ".7z", "mimes": ["application/x-7z-compressed"] },
|
||||
"rar": { "ext": ".rar", "mimes": ["application/x-rar-compressed", "application/vnd.rar"] },
|
||||
"gz": { "ext": ".gz", "mimes": ["application/gzip", "application/x-gzip"] },
|
||||
"tar": { "ext": ".tar", "mimes": ["application/x-tar"] },
|
||||
"cbz": { "ext": ".cbz", "mimes": ["application/zip"], "note": "zip archive of images; prefer extension-based detection for comics" }
|
||||
},
|
||||
"project": {
|
||||
"clip": { "ext": ".clip", "mimes": ["application/clip"] },
|
||||
"kra": { "ext": ".kra", "mimes": ["application/x-krita"] },
|
||||
"procreate": { "ext": ".procreate", "mimes": ["application/x-procreate"] },
|
||||
"psd": { "ext": ".psd", "mimes": ["image/vnd.adobe.photoshop"] },
|
||||
"swf": { "ext": ".swf", "mimes": ["application/x-shockwave-flash"] }
|
||||
},
|
||||
"other": {
|
||||
"octet-stream": { "ext": "", "mimes": ["application/octet-stream"] },
|
||||
"json": { "ext": ".json", "mimes": ["application/json"] },
|
||||
"xml": { "ext": ".xml", "mimes": ["application/xml", "text/xml"] },
|
||||
"csv": { "ext": ".csv", "mimes": ["text/csv"] }
|
||||
}
|
||||
}
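A minimal lookup sketch over this table (it mirrors how create_metadata_sidecar in helper/utils.py resolves a category from a file extension; the helper name is ours, not part of this commit):

def category_for_extension(ext: str) -> str:
    ext = ext.lower()
    for category, entries in mime_maps.items():
        for info in entries.values():
            if info.get('ext') == ext:
                return category
    return 'unknown'

# category_for_extension('.flac') -> 'audio'; category_for_extension('.cbz') -> 'archive'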
|
||||
655
helper/worker_manager.py
Normal file
@@ -0,0 +1,655 @@
|
||||
"""Worker task management with persistent database storage.
|
||||
|
||||
Manages worker tasks for downloads, searches, imports, etc. with automatic
|
||||
persistence to database and optional auto-refresh callbacks.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Any, List, Callable
|
||||
from datetime import datetime
|
||||
from threading import Thread, Lock
|
||||
import time
|
||||
|
||||
from .local_library import LocalLibraryDB
|
||||
from helper.logger import log
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Worker:
|
||||
"""Represents a single worker task with state management."""
|
||||
|
||||
def __init__(self, worker_id: str, worker_type: str, title: str = "",
|
||||
description: str = "", manager: Optional['WorkerManager'] = None):
|
||||
"""Initialize a worker.
|
||||
|
||||
Args:
|
||||
worker_id: Unique identifier for this worker
|
||||
worker_type: Type of work (e.g., 'download', 'search', 'import')
|
||||
title: Human-readable title
|
||||
description: Detailed description
|
||||
manager: Reference to parent WorkerManager for state updates
|
||||
"""
|
||||
self.id = worker_id
|
||||
self.type = worker_type
|
||||
self.title = title or worker_type
|
||||
self.description = description
|
||||
self.manager = manager
|
||||
self.status = "running"
|
||||
self.progress = ""
|
||||
self.details = ""
|
||||
self.error_message = ""
|
||||
self.result = "pending"
|
||||
self._stdout_buffer = []
|
||||
self._steps_buffer = []
|
||||
|
||||
def log_step(self, step_text: str) -> None:
|
||||
"""Log a step for this worker.
|
||||
|
||||
Args:
|
||||
step_text: Text describing the step
|
||||
"""
|
||||
try:
|
||||
if self.manager:
|
||||
self.manager.log_step(self.id, step_text)
|
||||
else:
|
||||
logger.info(f"[{self.id}] {step_text}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error logging step for worker {self.id}: {e}")
|
||||
|
||||
def append_stdout(self, text: str) -> None:
|
||||
"""Append text to stdout log.
|
||||
|
||||
Args:
|
||||
text: Text to append
|
||||
"""
|
||||
try:
|
||||
if self.manager:
|
||||
self.manager.append_worker_stdout(self.id, text)
|
||||
else:
|
||||
self._stdout_buffer.append(text)
|
||||
except Exception as e:
|
||||
logger.error(f"Error appending stdout for worker {self.id}: {e}")
|
||||
|
||||
def get_stdout(self) -> str:
|
||||
"""Get all stdout for this worker.
|
||||
|
||||
Returns:
|
||||
Complete stdout text
|
||||
"""
|
||||
try:
|
||||
if self.manager:
|
||||
return self.manager.get_stdout(self.id)
|
||||
else:
|
||||
return "\n".join(self._stdout_buffer)
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting stdout for worker {self.id}: {e}")
|
||||
return ""
|
||||
|
||||
def get_steps(self) -> str:
|
||||
"""Get all steps for this worker.
|
||||
|
||||
Returns:
|
||||
Complete steps text
|
||||
"""
|
||||
try:
|
||||
if self.manager:
|
||||
return self.manager.get_steps(self.id)
|
||||
else:
|
||||
return "\n".join(self._steps_buffer)
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting steps for worker {self.id}: {e}")
|
||||
return ""
|
||||
|
||||
def update_progress(self, progress: str = "", details: str = "") -> None:
|
||||
"""Update worker progress.
|
||||
|
||||
Args:
|
||||
progress: Progress string (e.g., "50%")
|
||||
details: Additional details
|
||||
"""
|
||||
self.progress = progress
|
||||
self.details = details
|
||||
try:
|
||||
if self.manager:
|
||||
self.manager.update_worker(self.id, progress, details)
|
||||
except Exception as e:
|
||||
logger.error(f"Error updating worker {self.id}: {e}")
|
||||
|
||||
def finish(self, result: str = "completed", message: str = "") -> None:
|
||||
"""Mark worker as finished.
|
||||
|
||||
Args:
|
||||
result: Result status ('completed', 'error', 'cancelled')
|
||||
message: Result message/error details
|
||||
"""
|
||||
self.result = result
|
||||
self.status = "finished"
|
||||
self.error_message = message
|
||||
try:
|
||||
if self.manager:
|
||||
# Flush and disable logging handler before marking finished
|
||||
self.manager.disable_logging_for_worker(self.id)
|
||||
# Then mark as finished in database
|
||||
self.manager.finish_worker(self.id, result, message)
|
||||
except Exception as e:
|
||||
logger.error(f"Error finishing worker {self.id}: {e}")
|
||||
|
||||
|
||||
class WorkerLoggingHandler(logging.StreamHandler):
|
||||
"""Custom logging handler that captures logs for a worker."""
|
||||
|
||||
def __init__(self, worker_id: str, db: LocalLibraryDB,
|
||||
manager: Optional['WorkerManager'] = None,
|
||||
buffer_size: int = 50):
|
||||
"""Initialize the handler.
|
||||
|
||||
Args:
|
||||
worker_id: ID of the worker to capture logs for
|
||||
db: Reference to LocalLibraryDB for storing logs
|
||||
manager: Optional WorkerManager used to route captured logs
buffer_size: Number of logs to buffer before flushing to DB
|
||||
"""
|
||||
super().__init__()
|
||||
self.worker_id = worker_id
|
||||
self.db = db
|
||||
self.manager = manager
|
||||
self.buffer_size = buffer_size
|
||||
self.buffer = []
|
||||
self._lock = Lock()
|
||||
|
||||
# Set a format that includes timestamp and level
|
||||
formatter = logging.Formatter(
|
||||
'%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||
datefmt='%Y-%m-%d %H:%M:%S'
|
||||
)
|
||||
self.setFormatter(formatter)
|
||||
|
||||
def emit(self, record):
|
||||
"""Emit a log record."""
|
||||
try:
|
||||
# Try to format the record normally
|
||||
try:
|
||||
msg = self.format(record)
|
||||
except (TypeError, ValueError):
|
||||
# If formatting fails (e.g., %d format with non-int arg),
|
||||
# build message manually without calling getMessage()
|
||||
try:
|
||||
# Try to format with args if possible
|
||||
if record.args:
|
||||
msg = record.msg % record.args
|
||||
else:
|
||||
msg = record.msg
|
||||
except (TypeError, ValueError):
|
||||
# If that fails too, just use the raw message string
|
||||
msg = str(record.msg)
|
||||
|
||||
# Add timestamp and level if not already in message
|
||||
|
||||
timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(record.created))
|
||||
msg = f"{timestamp} - {record.name} - {record.levelname} - {msg}"
|
||||
|
||||
with self._lock:
|
||||
self.buffer.append(msg)
|
||||
|
||||
# Flush to DB when buffer reaches size
|
||||
if len(self.buffer) >= self.buffer_size:
|
||||
self._flush()
|
||||
except Exception:
|
||||
self.handleError(record)
|
||||
|
||||
def _flush(self):
|
||||
"""Flush buffered logs to database."""
|
||||
if self.buffer:
|
||||
log_text = '\n'.join(self.buffer)
|
||||
try:
|
||||
if self.manager:
|
||||
self.manager.append_worker_stdout(self.worker_id, log_text, channel='log')
|
||||
else:
|
||||
self.db.append_worker_stdout(self.worker_id, log_text, channel='log')
|
||||
except Exception as e:
|
||||
# If we can't write to DB, at least log it
|
||||
log(f"Error flushing worker logs: {e}")
|
||||
self.buffer = []
|
||||
|
||||
def flush(self):
|
||||
"""Flush any buffered records."""
|
||||
with self._lock:
|
||||
self._flush()
|
||||
super().flush()
|
||||
|
||||
def close(self):
|
||||
"""Close the handler."""
|
||||
self.flush()
|
||||
super().close()
|
||||
|
||||
|
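
# Illustrative sketch (not part of the original commit): attaching the handler
# by hand. In normal use WorkerManager.enable_logging_for_worker() below does
# this wiring; `db` is assumed to be an existing LocalLibraryDB instance.
def _example_attach_handler(worker_id: str, db: LocalLibraryDB) -> WorkerLoggingHandler:
    handler = WorkerLoggingHandler(worker_id, db, buffer_size=25)  # smaller buffer => more frequent DB flushes
    logging.getLogger().addHandler(handler)  # capture records from every logger
    return handler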


class WorkerManager:
    """Manages persistent worker tasks with auto-refresh capability."""

    def __init__(self, library_root: Path, auto_refresh_interval: float = 2.0):
        """Initialize the worker manager.

        Args:
            library_root: Root directory for the local library database
            auto_refresh_interval: Seconds between auto-refresh checks (0 = disabled)
        """
        self.library_root = Path(library_root)
        self.db = LocalLibraryDB(library_root)
        self.auto_refresh_interval = auto_refresh_interval
        self.refresh_callbacks: List[Callable] = []
        self.refresh_thread: Optional[Thread] = None
        self._stop_refresh = False
        self._lock = Lock()
        self.worker_handlers: Dict[str, WorkerLoggingHandler] = {}  # Track active handlers
        self._worker_last_step: Dict[str, str] = {}

    def add_refresh_callback(self, callback: Callable[[List[Dict[str, Any]]], None]) -> None:
        """Register a callback to be called on worker updates.

        Args:
            callback: Function that receives list of active workers
        """
        with self._lock:
            self.refresh_callbacks.append(callback)

    def expire_running_workers(
        self,
        older_than_seconds: int = 300,
        worker_id_prefix: Optional[str] = None,
        reason: Optional[str] = None,
        status: str = "error",
    ) -> int:
        """Mark stale running workers as finished.

        Args:
            older_than_seconds: Idle threshold before expiring.
            worker_id_prefix: Optional wildcard filter (e.g., 'cli_%').
            reason: Error message if none already exists.
            status: New status to apply.

        Returns:
            Count of workers updated.
        """
        try:
            return self.db.expire_running_workers(
                older_than_seconds=older_than_seconds,
                status=status,
                reason=reason,
                worker_id_prefix=worker_id_prefix,
            )
        except Exception as exc:
            logger.error(f"Failed to expire stale workers: {exc}", exc_info=True)
            return 0

    def remove_refresh_callback(self, callback: Callable) -> None:
        """Remove a refresh callback.

        Args:
            callback: The callback function to remove
        """
        with self._lock:
            if callback in self.refresh_callbacks:
                self.refresh_callbacks.remove(callback)

    def enable_logging_for_worker(self, worker_id: str) -> Optional[WorkerLoggingHandler]:
        """Enable logging capture for a worker.

        Creates a logging handler that captures all logs for this worker.

        Args:
            worker_id: ID of the worker to capture logs for

        Returns:
            The logging handler that was created, or None if there was an error
        """
        try:
            handler = WorkerLoggingHandler(worker_id, self.db, manager=self)
            with self._lock:
                self.worker_handlers[worker_id] = handler

            # Add the handler to the root logger so it captures all logs
            root_logger = logging.getLogger()
            root_logger.addHandler(handler)
            root_logger.setLevel(logging.DEBUG)  # Capture all levels

            logger.debug(f"[WorkerManager] Enabled logging for worker: {worker_id}")
            return handler
        except Exception as e:
            logger.error(f"[WorkerManager] Error enabling logging for worker {worker_id}: {e}", exc_info=True)
            return None

    def disable_logging_for_worker(self, worker_id: str) -> None:
        """Disable logging capture for a worker and flush any pending logs.

        Args:
            worker_id: ID of the worker to stop capturing logs for
        """
        try:
            with self._lock:
                handler = self.worker_handlers.pop(worker_id, None)

            if handler:
                # Flush and close the handler
                handler.flush()
                handler.close()

                # Remove from root logger
                root_logger = logging.getLogger()
                root_logger.removeHandler(handler)

                logger.debug(f"[WorkerManager] Disabled logging for worker: {worker_id}")
        except Exception as e:
            logger.error(f"[WorkerManager] Error disabling logging for worker {worker_id}: {e}", exc_info=True)
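
    # Illustrative sketch (not part of the original commit): bracketing a task
    # with per-worker log capture using the two methods above. `run_task` is a
    # hypothetical zero-argument callable supplied by the caller.
    def _example_captured_task(self, worker_id: str, run_task: Callable[[], None]) -> None:
        handler = self.enable_logging_for_worker(worker_id)
        try:
            run_task()  # anything this logs is buffered into the worker's log channel
        finally:
            if handler is not None:
                self.disable_logging_for_worker(worker_id)  # flush + detach, even on error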

    def track_worker(self, worker_id: str, worker_type: str, title: str = "",
                     description: str = "", total_steps: int = 0,
                     pipe: Optional[str] = None) -> bool:
        """Start tracking a new worker.

        Args:
            worker_id: Unique identifier for the worker
            worker_type: Type of worker (e.g., 'download', 'search', 'import')
            title: Worker title/name
            description: Worker description
            total_steps: Total number of steps for progress tracking
            pipe: Text of the originating pipe/prompt, if any

        Returns:
            True if worker was inserted successfully
        """
        try:
            result = self.db.insert_worker(worker_id, worker_type, title, description, total_steps, pipe=pipe)
            if result > 0:
                logger.debug(f"[WorkerManager] Tracking worker: {worker_id} ({worker_type})")
                self._start_refresh_if_needed()
                return True
            return False
        except Exception as e:
            logger.error(f"[WorkerManager] Error tracking worker: {e}", exc_info=True)
            return False

    def update_worker(self, worker_id: str, progress: float = 0.0, current_step: str = "",
                      details: str = "", error: str = "") -> bool:
        """Update worker progress and status.

        Args:
            worker_id: Unique identifier for the worker
            progress: Progress percentage (0-100)
            current_step: Current step description
            details: Additional details
            error: Error message if any

        Returns:
            True if update was successful
        """
        try:
            kwargs = {}
            if progress > 0:
                kwargs['progress'] = progress
            if current_step:
                kwargs['current_step'] = current_step
            if details:
                kwargs['description'] = details
            if error:
                kwargs['error_message'] = error

            if kwargs:
                kwargs['last_updated'] = datetime.now().isoformat()
                if 'current_step' in kwargs and kwargs['current_step']:
                    self._worker_last_step[worker_id] = str(kwargs['current_step'])
                return self.db.update_worker(worker_id, **kwargs)
            return True
        except Exception as e:
            logger.error(f"[WorkerManager] Error updating worker {worker_id}: {e}", exc_info=True)
            return False

    def finish_worker(self, worker_id: str, result: str = "completed",
                      error_msg: str = "", result_data: str = "") -> bool:
        """Mark a worker as finished.

        Args:
            worker_id: Unique identifier for the worker
            result: Result status ('completed', 'error', 'cancelled')
            error_msg: Error message if any
            result_data: Result data as JSON string

        Returns:
            True if update was successful
        """
        try:
            kwargs = {
                'status': result,
                'completed_at': datetime.now().isoformat()
            }
            if error_msg:
                kwargs['error_message'] = error_msg
            if result_data:
                kwargs['result_data'] = result_data

            success = self.db.update_worker(worker_id, **kwargs)
            logger.info(f"[WorkerManager] Worker finished: {worker_id} ({result})")
            self._worker_last_step.pop(worker_id, None)
            return success
        except Exception as e:
            logger.error(f"[WorkerManager] Error finishing worker {worker_id}: {e}", exc_info=True)
            return False
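
    # Illustrative sketch (not part of the original commit): driving a worker's
    # full lifecycle through the three methods above. The id, step names, and
    # result payload are made up for the example.
    def _example_tracked_download(self) -> None:
        wid = "example_download_001"
        if not self.track_worker(wid, "download", title="Example download"):
            return
        self.update_worker(wid, progress=25.0, current_step="fetching metadata")
        self.update_worker(wid, progress=100.0, current_step="done")
        self.finish_worker(wid, result="completed", result_data='{"files": 1}')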

    def get_active_workers(self) -> List[Dict[str, Any]]:
        """Get all active (running) workers.

        Returns:
            List of active worker dictionaries
        """
        try:
            return self.db.get_active_workers()
        except Exception as e:
            logger.error(f"[WorkerManager] Error getting active workers: {e}", exc_info=True)
            return []

    def get_finished_workers(self, limit: int = 100) -> List[Dict[str, Any]]:
        """Get all finished workers (completed, errored, or cancelled).

        Args:
            limit: Maximum number of workers to retrieve

        Returns:
            List of finished worker dictionaries
        """
        try:
            all_workers = self.db.get_all_workers(limit=limit)
            # Filter to only finished workers
            finished = [w for w in all_workers if w.get('status') in ['completed', 'error', 'cancelled']]
            return finished
        except Exception as e:
            logger.error(f"[WorkerManager] Error getting finished workers: {e}", exc_info=True)
            return []

    def get_worker(self, worker_id: str) -> Optional[Dict[str, Any]]:
        """Get a specific worker's data.

        Args:
            worker_id: Unique identifier for the worker

        Returns:
            Worker data or None if not found
        """
        try:
            return self.db.get_worker(worker_id)
        except Exception as e:
            logger.error(f"[WorkerManager] Error getting worker {worker_id}: {e}", exc_info=True)
            return None

    def get_worker_events(self, worker_id: str, limit: int = 500) -> List[Dict[str, Any]]:
        """Fetch recorded worker timeline events."""
        return self.db.get_worker_events(worker_id, limit)

    def log_step(self, worker_id: str, step_text: str) -> bool:
        """Log a step to a worker's step history.

        Args:
            worker_id: Unique identifier for the worker
            step_text: Step description to log

        Returns:
            True if successful
        """
        try:
            success = self.db.append_worker_steps(worker_id, step_text)
            if success:
                self._worker_last_step[worker_id] = step_text
            return success
        except Exception as e:
            logger.error(f"[WorkerManager] Error logging step for worker {worker_id}: {e}", exc_info=True)
            return False

    def _get_last_step(self, worker_id: str) -> Optional[str]:
        """Return the most recent step description for a worker."""
        return self._worker_last_step.get(worker_id)

    def get_steps(self, worker_id: str) -> str:
        """Get step logs for a worker.

        Args:
            worker_id: Unique identifier for the worker

        Returns:
            Steps text or empty string if not found
        """
        try:
            return self.db.get_worker_steps(worker_id)
        except Exception as e:
            logger.error(f"[WorkerManager] Error getting steps for worker {worker_id}: {e}", exc_info=True)
            return ''
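
    # Illustrative sketch (not part of the original commit): recording a couple
    # of steps and reading the history back via log_step/get_steps above. The
    # step texts are placeholders.
    def _example_step_history(self, worker_id: str) -> str:
        self.log_step(worker_id, "resolved source URL")
        self.log_step(worker_id, "wrote metadata sidecar")
        return self.get_steps(worker_id)  # newline-joined step text from the DB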

    def start_auto_refresh(self) -> None:
        """Start the auto-refresh thread for periodic worker updates."""
        if self.auto_refresh_interval <= 0:
            logger.debug("[WorkerManager] Auto-refresh disabled (interval <= 0)")
            return

        if self.refresh_thread and self.refresh_thread.is_alive():
            logger.debug("[WorkerManager] Auto-refresh already running")
            return

        logger.info(f"[WorkerManager] Starting auto-refresh with {self.auto_refresh_interval}s interval")
        self._stop_refresh = False
        self.refresh_thread = Thread(target=self._auto_refresh_loop, daemon=True)
        self.refresh_thread.start()

    def stop_auto_refresh(self) -> None:
        """Stop the auto-refresh thread."""
        logger.info("[WorkerManager] Stopping auto-refresh")
        self._stop_refresh = True
        if self.refresh_thread:
            self.refresh_thread.join(timeout=5)
            self.refresh_thread = None

    def _start_refresh_if_needed(self) -> None:
        """Start auto-refresh if we have active workers and callbacks."""
        active = self.get_active_workers()
        if active and self.refresh_callbacks and not self._stop_refresh:
            self.start_auto_refresh()

    def _auto_refresh_loop(self) -> None:
        """Main auto-refresh loop that periodically queries and notifies."""
        try:
            while not self._stop_refresh:
                time.sleep(self.auto_refresh_interval)

                # Check if there are active workers
                active = self.get_active_workers()

                if not active:
                    # No more active workers, stop refreshing
                    logger.debug("[WorkerManager] No active workers, stopping auto-refresh")
                    break

                # Call all registered callbacks with the active workers
                with self._lock:
                    for callback in self.refresh_callbacks:
                        try:
                            callback(active)
                        except Exception as e:
                            logger.error(f"[WorkerManager] Error in refresh callback: {e}", exc_info=True)

        except Exception as e:
            logger.error(f"[WorkerManager] Error in auto-refresh loop: {e}", exc_info=True)
        finally:
            logger.debug("[WorkerManager] Auto-refresh loop ended")
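
    # Illustrative sketch (not part of the original commit): wiring a refresh
    # callback into the auto-refresh loop above. The callback only logs the
    # active-worker count; a real caller would redraw its worker list instead.
    def _example_watch_workers(self) -> None:
        def on_refresh(active_workers: List[Dict[str, Any]]) -> None:
            logger.info(f"Active workers: {len(active_workers)}")
        self.add_refresh_callback(on_refresh)
        self.start_auto_refresh()  # polls every auto_refresh_interval seconds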

    def cleanup_old_workers(self, days: int = 7) -> int:
        """Clean up completed/errored workers older than specified days.

        Args:
            days: Delete workers completed more than this many days ago

        Returns:
            Number of workers deleted
        """
        try:
            count = self.db.cleanup_old_workers(days)
            if count > 0:
                logger.info(f"[WorkerManager] Cleaned up {count} old workers")
            return count
        except Exception as e:
            logger.error(f"[WorkerManager] Error cleaning up old workers: {e}", exc_info=True)
            return 0

    def append_stdout(self, worker_id: str, text: str, channel: str = "stdout") -> bool:
        """Append text to a worker's stdout log.

        Args:
            worker_id: Unique identifier for the worker
            text: Text to append
            channel: Logical channel (stdout, stderr, log, etc.)

        Returns:
            True if append was successful
        """
        try:
            step_label = self._get_last_step(worker_id)
            return self.db.append_worker_stdout(worker_id, text, step=step_label, channel=channel)
        except Exception as e:
            logger.error(f"[WorkerManager] Error appending stdout: {e}", exc_info=True)
            return False

    def get_stdout(self, worker_id: str) -> str:
        """Get stdout logs for a worker.

        Args:
            worker_id: Unique identifier for the worker

        Returns:
            Worker's stdout or empty string
        """
        try:
            return self.db.get_worker_stdout(worker_id)
        except Exception as e:
            logger.error(f"[WorkerManager] Error getting stdout: {e}", exc_info=True)
            return ""
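
    # Illustrative sketch (not part of the original commit): pairing
    # append_stdout/get_stdout above to mirror one line of process output into
    # the worker record. `line` is a placeholder for real process output.
    def _example_mirror_output(self, worker_id: str, line: str) -> str:
        self.append_stdout(worker_id, line, channel="stdout")
        return self.get_stdout(worker_id)  # accumulated stdout text for the worker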

    def append_worker_stdout(self, worker_id: str, text: str, channel: str = "stdout") -> bool:
        """Compatibility wrapper for append_stdout."""
        return self.append_stdout(worker_id, text, channel=channel)

    def clear_stdout(self, worker_id: str) -> bool:
        """Clear stdout logs for a worker.

        Args:
            worker_id: Unique identifier for the worker

        Returns:
            True if clear was successful
        """
        try:
            return self.db.clear_worker_stdout(worker_id)
        except Exception as e:
            logger.error(f"[WorkerManager] Error clearing stdout: {e}", exc_info=True)
            return False

    def close(self) -> None:
        """Close the worker manager and database connection."""
        self.stop_auto_refresh()
        self.db.close()
        logger.info("[WorkerManager] Closed")
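

# Illustrative sketch (not part of the original commit): a typical
# open/maintain/close cycle for the manager. The library path is a placeholder
# supplied by the caller.
def _example_maintenance(library_root: Path) -> None:
    manager = WorkerManager(library_root, auto_refresh_interval=0)  # 0 disables auto-refresh
    try:
        expired = manager.expire_running_workers(older_than_seconds=600, reason="stale after restart")
        removed = manager.cleanup_old_workers(days=14)
        logger.info(f"Expired {expired} stale workers, removed {removed} old rows")
    finally:
        manager.close()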