This commit is contained in:
nose
2025-12-11 19:04:02 -08:00
parent 6863c6c7ea
commit 16d8a763cd
103 changed files with 4759 additions and 9156 deletions

195
SYS/background_notifier.py Normal file

@@ -0,0 +1,195 @@
"""Lightweight console notifier for background WorkerManager tasks.
Registers a refresh callback on WorkerManager and prints concise updates when
workers start, progress, or finish. Intended for CLI background workflows.
Filters to show only workers related to the current pipeline session to avoid
cluttering the terminal with workers from previous sessions.
"""
from __future__ import annotations
from typing import Any, Callable, Dict, Optional, Set
from SYS.logger import log, debug
class BackgroundNotifier:
"""Simple notifier that prints worker status changes for a session."""
def __init__(
self,
manager: Any,
output: Callable[[str], None] = log,
session_worker_ids: Optional[Set[str]] = None,
only_terminal_updates: bool = False,
overlay_mode: bool = False,
) -> None:
self.manager = manager
self.output = output
self.session_worker_ids = session_worker_ids if session_worker_ids is not None else set()
self.only_terminal_updates = only_terminal_updates
self.overlay_mode = overlay_mode
self._filter_enabled = session_worker_ids is not None
self._last_state: Dict[str, str] = {}
try:
self.manager.add_refresh_callback(self._on_refresh)
self.manager.start_auto_refresh()
except Exception as exc: # pragma: no cover - best effort
debug(f"[notifier] Could not attach refresh callback: {exc}")
def _render_line(self, worker: Dict[str, Any]) -> Optional[str]:
# Use worker_id (the actual worker ID we set) for filtering and display
worker_id = str(worker.get("worker_id") or "").strip()
if not worker_id:
# Fallback to database id if worker_id is not set
worker_id = str(worker.get("id") or "").strip()
if not worker_id:
return None
status = str(worker.get("status") or "running")
progress_val = worker.get("progress") or worker.get("progress_percent")
progress = ""
if isinstance(progress_val, (int, float)):
progress = f" {progress_val:.1f}%"
elif progress_val:
progress = f" {progress_val}"
step = str(worker.get("current_step") or worker.get("description") or "").strip()
parts = [f"[worker:{worker_id}] {status}{progress}"]
if step:
parts.append(step)
return " - ".join(parts)
def _on_refresh(self, workers: list[Dict[str, Any]]) -> None:
overlay_active_workers = 0
for worker in workers:
# Use worker_id (the actual worker ID we set) for filtering
worker_id = str(worker.get("worker_id") or "").strip()
if not worker_id:
# Fallback to database id if worker_id is not set
worker_id = str(worker.get("id") or "").strip()
if not worker_id:
continue
# If filtering is enabled, skip workers not in this session
if self._filter_enabled and worker_id not in self.session_worker_ids:
continue
status = str(worker.get("status") or "running")
# Overlay mode: only emit on completion; suppress start/progress spam
if self.overlay_mode:
if status in ("completed", "finished", "error"):
progress_val = worker.get("progress") or worker.get("progress_percent") or ""
step = str(worker.get("current_step") or worker.get("description") or "").strip()
signature = f"{status}|{progress_val}|{step}"
if self._last_state.get(worker_id) == signature:
continue
self._last_state[worker_id] = signature
line = self._render_line(worker)
if line:
try:
self.output(line)
except Exception:
pass
self._last_state.pop(worker_id, None)
                    self.session_worker_ids.discard(worker_id)
                else:
                    # Still an active session worker; count it so the overlay below is not cleared.
                    overlay_active_workers += 1
                continue
# For terminal-only mode, emit once when the worker finishes and skip intermediate updates
if self.only_terminal_updates:
if status in ("completed", "finished", "error"):
if self._last_state.get(worker_id) == status:
continue
self._last_state[worker_id] = status
line = self._render_line(worker)
if line:
try:
self.output(line)
except Exception:
pass
# Stop tracking this worker after terminal notification
self.session_worker_ids.discard(worker_id)
continue
# Skip finished workers after showing them once (standard verbose mode)
if status in ("completed", "finished", "error"):
if worker_id in self._last_state:
# Already shown, remove from tracking
self._last_state.pop(worker_id, None)
self.session_worker_ids.discard(worker_id)
continue
progress_val = worker.get("progress") or worker.get("progress_percent") or ""
step = str(worker.get("current_step") or worker.get("description") or "").strip()
signature = f"{status}|{progress_val}|{step}"
if self._last_state.get(worker_id) == signature:
continue
self._last_state[worker_id] = signature
line = self._render_line(worker)
if line:
try:
self.output(line)
except Exception:
pass
if self.overlay_mode:
try:
# If nothing active for this session, clear the overlay text
if overlay_active_workers == 0:
self.output("")
except Exception:
pass
def ensure_background_notifier(
manager: Any,
output: Callable[[str], None] = log,
session_worker_ids: Optional[Set[str]] = None,
only_terminal_updates: bool = False,
overlay_mode: bool = False,
) -> Optional[BackgroundNotifier]:
"""Attach a BackgroundNotifier to a WorkerManager if not already present.
Args:
manager: WorkerManager instance
output: Function to call for printing updates
        session_worker_ids: Set of worker IDs belonging to this pipeline session.
            If None, show all workers. If a set (even empty), only show workers in that set.
        only_terminal_updates: If True, emit a single line per worker on completion only.
        overlay_mode: If True, suppress start/progress lines and clear the overlay when idle.

    Returns:
        The attached BackgroundNotifier, or None if manager is None.
    """
if manager is None:
return None
existing = getattr(manager, "_background_notifier", None)
if isinstance(existing, BackgroundNotifier):
# Update session IDs if provided
if session_worker_ids is not None:
existing._filter_enabled = True
existing.session_worker_ids.update(session_worker_ids)
# Respect the most restrictive setting for terminal-only updates
if only_terminal_updates:
existing.only_terminal_updates = True
# Enable overlay mode if requested later
if overlay_mode:
existing.overlay_mode = True
return existing
notifier = BackgroundNotifier(
manager,
output,
session_worker_ids=session_worker_ids,
only_terminal_updates=only_terminal_updates,
overlay_mode=overlay_mode,
)
try:
manager._background_notifier = notifier # type: ignore[attr-defined]
except Exception:
pass
return notifier
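# Illustrative usage (a sketch; WorkerManager comes from elsewhere in the repo,
# and the worker IDs are hypothetical):
#   notifier = ensure_background_notifier(manager, session_worker_ids={"w1", "w2"})
#   ...
#   ensure_background_notifier(manager, session_worker_ids={"w3"})  # reuses notifier, merges IDs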

767
SYS/download.py Normal file

@@ -0,0 +1,767 @@
"""Download media files using yt-dlp with support for direct file downloads.
Lean, focused downloader without event infrastructure overhead.
- yt-dlp integration for streaming sites
- Direct file download fallback for PDFs, images, documents
- Tag extraction via metadata.extract_ytdlp_tags()
- Logging via SYS.logger.log()
"""
from __future__ import annotations
import glob # noqa: F401
import hashlib
import json
import random
import re
import string
import subprocess
import sys
import time
import traceback
from pathlib import Path
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urljoin, urlparse
import httpx
from SYS.logger import log, debug
from SYS.utils import ensure_directory, sha256_file
from API.HTTP import HTTPClient
from models import DownloadError, DownloadOptions, DownloadMediaResult, DebugLogger, ProgressBar
try:
import yt_dlp # type: ignore
from yt_dlp.extractor import gen_extractors # type: ignore
except Exception as exc:
yt_dlp = None # type: ignore
YTDLP_IMPORT_ERROR = exc
else:
YTDLP_IMPORT_ERROR = None
try:
from metadata import extract_ytdlp_tags
except ImportError:
extract_ytdlp_tags = None
_EXTRACTOR_CACHE: List[Any] | None = None
def _ensure_yt_dlp_ready() -> None:
"""Verify yt-dlp is available, raise if not."""
if yt_dlp is not None:
return
detail = str(YTDLP_IMPORT_ERROR or "yt-dlp is not installed")
raise DownloadError(f"yt-dlp module not available: {detail}")
def _progress_callback(status: Dict[str, Any]) -> None:
"""Simple progress callback using logger."""
event = status.get("status")
if event == "downloading":
percent = status.get("_percent_str", "?")
speed = status.get("_speed_str", "?")
eta = status.get("_eta_str", "?")
sys.stdout.write(f"\r[download] {percent} at {speed} ETA {eta} ")
sys.stdout.flush()
elif event == "finished":
sys.stdout.write("\r" + " " * 70 + "\r")
sys.stdout.flush()
debug(f"✓ Download finished: {status.get('filename')}")
elif event in ("postprocessing", "processing"):
debug(f"Post-processing: {status.get('postprocessor')}")
def is_url_supported_by_ytdlp(url: str) -> bool:
"""Check if URL is supported by yt-dlp."""
if yt_dlp is None:
return False
global _EXTRACTOR_CACHE
if _EXTRACTOR_CACHE is None:
try:
_EXTRACTOR_CACHE = [ie for ie in gen_extractors()] # type: ignore[arg-type]
except Exception:
_EXTRACTOR_CACHE = []
for extractor in _EXTRACTOR_CACHE:
try:
if not extractor.suitable(url):
continue
except Exception:
continue
name = getattr(extractor, "IE_NAME", "")
if name.lower() == "generic":
continue
return True
return False
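# e.g. is_url_supported_by_ytdlp("https://www.youtube.com/watch?v=dQw4w9WgXcQ") -> True
# URLs that only the "generic" extractor would match are deliberately treated as
# unsupported, so they fall through to the direct-file download path instead.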
def list_formats(url: str, no_playlist: bool = False, playlist_items: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
"""Get list of available formats for a URL using yt-dlp."""
_ensure_yt_dlp_ready()
try:
ydl_opts = {
"quiet": True,
"no_warnings": True,
"socket_timeout": 30,
}
if no_playlist:
ydl_opts["noplaylist"] = True
if playlist_items:
ydl_opts["playlist_items"] = playlist_items
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
debug(f"Fetching format list for: {url}")
info = ydl.extract_info(url, download=False)
formats = info.get("formats", [])
if not formats:
log("No formats available", file=sys.stderr)
return None
result_formats = []
for fmt in formats:
result_formats.append({
"format_id": fmt.get("format_id", ""),
"format": fmt.get("format", ""),
"ext": fmt.get("ext", ""),
"resolution": fmt.get("resolution", ""),
"width": fmt.get("width"),
"height": fmt.get("height"),
"fps": fmt.get("fps"),
"vcodec": fmt.get("vcodec", "none"),
"acodec": fmt.get("acodec", "none"),
"filesize": fmt.get("filesize"),
"tbr": fmt.get("tbr"),
})
debug(f"Found {len(result_formats)} available formats")
return result_formats
except Exception as e:
log(f"✗ Error fetching formats: {e}", file=sys.stderr)
return None
def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sections: List[str], quiet: bool = False) -> tuple[Optional[str], Dict[str, Any]]:
"""Download each section separately so merge-file can combine them.
yt-dlp with multiple --download-sections args merges them into one file.
We need separate files for merge-file, so download each section individually.
Uses hash-based filenames for sections (not title-based) to prevent yt-dlp from
thinking sections are already downloaded. The title is extracted and stored in tags.
Returns:
(session_id, first_section_info_dict) - session_id for finding files, info dict for metadata extraction
"""
sections_list = ytdl_options.get("download_sections", [])
if not sections_list:
return "", {}
# Generate a unique hash-based ID for this download session
# This ensures different videos/downloads don't have filename collisions
session_id = hashlib.md5(
(url + str(time.time()) + ''.join(random.choices(string.ascii_letters, k=10))).encode()
).hexdigest()[:12]
first_section_info = None
title_from_first = None
# Download each section separately with unique output template using session ID
for section_idx, section in enumerate(sections_list, 1):
# Build unique output template for this section using session-based filename
# e.g., "{session_id}_{section_idx}.ext" - simple and unique per section
base_outtmpl = ytdl_options.get("outtmpl", "%(title)s.%(ext)s")
output_dir_path = Path(base_outtmpl).parent
# Use session_id + section index for temp filename
# e.g., "/path/{session_id}_1.%(ext)s"
filename_tmpl = f"{session_id}_{section_idx}"
if base_outtmpl.endswith(".%(ext)s"):
filename_tmpl += ".%(ext)s"
# Use Path to handle separators correctly for the OS
section_outtmpl = str(output_dir_path / filename_tmpl)
# For the first section, extract metadata first (separate call)
if section_idx == 1:
metadata_cmd = ["yt-dlp", "--dump-json", "--skip-download"]
if ytdl_options.get("cookiefile"):
cookies_path = ytdl_options["cookiefile"].replace("\\", "/")
metadata_cmd.extend(["--cookies", cookies_path])
if ytdl_options.get("noplaylist"):
metadata_cmd.append("--no-playlist")
metadata_cmd.append(url)
try:
meta_result = subprocess.run(metadata_cmd, capture_output=True, text=True)
if meta_result.returncode == 0 and meta_result.stdout:
try:
info_dict = json.loads(meta_result.stdout.strip())
first_section_info = info_dict
title_from_first = info_dict.get('title')
if not quiet:
debug(f"Extracted title from metadata: {title_from_first}")
except json.JSONDecodeError:
if not quiet:
debug("Could not parse JSON metadata")
except Exception as e:
if not quiet:
debug(f"Error extracting metadata: {e}")
# Build yt-dlp command for downloading this section
cmd = ["yt-dlp"]
# Add format
if ytdl_options.get("format"):
cmd.extend(["-f", ytdl_options["format"]])
# Add ONLY this section (not all sections)
cmd.extend(["--download-sections", section])
# Add force-keyframes-at-cuts if specified
if ytdl_options.get("force_keyframes_at_cuts"):
cmd.append("--force-keyframes-at-cuts")
# Add output template for this section
cmd.extend(["-o", section_outtmpl])
# Add cookies file if present
if ytdl_options.get("cookiefile"):
# Convert backslashes to forward slashes for better compatibility
cookies_path = ytdl_options["cookiefile"].replace("\\", "/")
cmd.extend(["--cookies", cookies_path])
# Add no-playlist if specified
if ytdl_options.get("noplaylist"):
cmd.append("--no-playlist")
# Add the URL
cmd.append(url)
if not quiet:
debug(f"Running yt-dlp for section {section_idx}/{len(sections_list)}: {section}")
debug(f"Command: {' '.join(cmd)}")
# Run the subprocess - don't capture output so progress is shown
try:
result = subprocess.run(cmd)
if result.returncode != 0:
raise DownloadError(f"yt-dlp subprocess failed for section {section_idx} with code {result.returncode}")
except Exception as exc:
raise DownloadError(f"yt-dlp subprocess error for section {section_idx}: {exc}") from exc
return session_id, first_section_info or {}
def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]:
"""Build yt-dlp download options."""
ensure_directory(opts.output_dir)
# Build output template
    # When downloading sections, _download_with_sections_via_cli swaps this out for per-section "{session_id}_{n}.%(ext)s" templates
outtmpl = str((opts.output_dir / "%(title)s.%(ext)s").resolve())
base_options: Dict[str, Any] = {
"outtmpl": outtmpl,
"quiet": True,
"no_warnings": True,
"noprogress": True,
"socket_timeout": 30,
"retries": 10,
"fragment_retries": 10,
"http_chunk_size": 10_485_760,
"restrictfilenames": True,
"progress_hooks": [] if opts.quiet else [_progress_callback],
}
if opts.cookies_path and opts.cookies_path.is_file():
base_options["cookiefile"] = str(opts.cookies_path)
else:
# Check global cookies file lazily to avoid import cycles
from hydrus_health_check import get_cookies_file_path # local import
global_cookies = get_cookies_file_path()
if global_cookies:
base_options["cookiefile"] = global_cookies
else:
# Fallback to browser cookies
base_options["cookiesfrombrowser"] = ("chrome",)
# Add no-playlist option if specified (for single video from playlist url)
if opts.no_playlist:
base_options["noplaylist"] = True
# Configure based on mode
if opts.mode == "audio":
base_options["format"] = opts.ytdl_format or "251/140/bestaudio"
base_options["postprocessors"] = [{"key": "FFmpegExtractAudio"}]
else: # video
base_options["format"] = opts.ytdl_format or "bestvideo+bestaudio/best"
base_options["format_sort"] = [
"res:4320", "res:2880", "res:2160", "res:1440", "res:1080", "res:720", "res"
]
# Add clip sections if provided (yt-dlp will download only these sections)
if opts.clip_sections:
# Parse section ranges like "48-65,120-152,196-205" (seconds)
# and convert to yt-dlp format: "*HH:MM:SS-HH:MM:SS,*HH:MM:SS-HH:MM:SS"
sections = []
for section_range in opts.clip_sections.split(','):
try:
start_str, end_str = section_range.strip().split('-')
start_sec = float(start_str)
end_sec = float(end_str)
# Convert seconds to HH:MM:SS format
def sec_to_hhmmss(seconds):
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
secs = int(seconds % 60)
return f"{hours:02d}:{minutes:02d}:{secs:02d}"
start_time = sec_to_hhmmss(start_sec)
end_time = sec_to_hhmmss(end_sec)
sections.append(f"*{start_time}-{end_time}")
except (ValueError, AttributeError):
pass
if sections:
# Pass each section as a separate element in the list (yt-dlp expects multiple --download-sections args)
base_options["download_sections"] = sections
debug(f"Download sections configured: {', '.join(sections)}")
# Note: Not using --force-keyframes-at-cuts to avoid re-encoding
# This may result in less precise cuts but faster downloads
# Add playlist items selection if provided
if opts.playlist_items:
base_options["playlist_items"] = opts.playlist_items
if not opts.quiet:
debug(f"yt-dlp: mode={opts.mode}, format={base_options.get('format')}")
return base_options
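# For reference, audio mode with defaults yields options roughly like (sketch only):
#   {"outtmpl": "<output_dir>/%(title)s.%(ext)s", "format": "251/140/bestaudio",
#    "postprocessors": [{"key": "FFmpegExtractAudio"}], "retries": 10, ...}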
def _iter_download_entries(info: Dict[str, Any]) -> Iterator[Dict[str, Any]]:
"""Iterate through download entries, handling playlists."""
queue: List[Dict[str, Any]] = [info]
seen: set[int] = set()
while queue:
current = queue.pop(0)
obj_id = id(current)
if obj_id in seen:
continue
seen.add(obj_id)
entries = current.get("entries")
if isinstance(entries, list):
for entry in entries:
if isinstance(entry, dict):
queue.append(entry)
if current.get("requested_downloads") or not entries:
yield current
def _candidate_paths(entry: Dict[str, Any], output_dir: Path) -> Iterator[Path]:
"""Get candidate file paths for downloaded media."""
requested = entry.get("requested_downloads")
if isinstance(requested, list):
for item in requested:
if isinstance(item, dict):
for key in ("filepath", "_filename", "filename"):
value = item.get(key)
if value:
yield Path(value)
for key in ("filepath", "_filename", "filename"):
value = entry.get(key)
if value:
yield Path(value)
if entry.get("filename"):
yield output_dir / entry["filename"]
def _resolve_entry_and_path(info: Dict[str, Any], output_dir: Path) -> tuple[Dict[str, Any], Path]:
"""Find downloaded file in yt-dlp metadata."""
for entry in _iter_download_entries(info):
for candidate in _candidate_paths(entry, output_dir):
if candidate.is_file():
return entry, candidate
if not candidate.is_absolute():
resolved = output_dir / candidate
if resolved.is_file():
return entry, resolved
raise FileNotFoundError("yt-dlp did not report a downloaded media file")
def _extract_sha256(info: Dict[str, Any]) -> Optional[str]:
"""Extract SHA256 hash from yt-dlp metadata."""
for payload in [info] + info.get("entries", []):
if not isinstance(payload, dict):
continue
hashes = payload.get("hashes")
if isinstance(hashes, dict):
for key in ("sha256", "sha-256", "sha_256"):
value = hashes.get(key)
if isinstance(value, str) and value.strip():
return value.strip().lower()
for key in ("sha256", "sha-256", "sha_256"):
value = payload.get(key)
if isinstance(value, str) and value.strip():
return value.strip().lower()
return None
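# e.g. _extract_sha256({"hashes": {"sha256": "ABC..."}}) -> "abc..." (lowercased).
# Key lookups are case-sensitive, so {"hashes": {"SHA256": ...}} yields None.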
def _get_libgen_download_url(libgen_url: str) -> Optional[str]:
"""Extract the actual download link from LibGen redirect URL.
    LibGen URLs like https://libgen.gl/file.php?id=123456 redirect to an
    actual mirror URL. This follows the redirect chain to get the real file.
Args:
libgen_url: LibGen file.php URL
Returns:
Actual download URL or None if extraction fails
"""
try:
import requests
from urllib.parse import urlparse
# Check if this is a LibGen URL
parsed = urlparse(libgen_url)
if 'libgen' not in parsed.netloc.lower():
return None
if '/file.php' not in parsed.path.lower():
return None
# LibGen redirects to actual mirrors, follow redirects to get final URL
session = requests.Session()
session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
})
debug(f"Following LibGen redirect chain for: {libgen_url}")
# First, get the page and look for direct download link
try:
response = session.get(libgen_url, timeout=10, allow_redirects=True)
final_url = response.url
# Try to find actual download link in the page
try:
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')
# Look for download links - LibGen typically has forms with download buttons
# Look for all links and forms that might lead to download
for link in soup.find_all('a'):
href = link.get('href')
if href and isinstance(href, str):
# Look for direct file links or get.php redirects
if 'get.php' in href.lower() or href.endswith(('.pdf', '.epub', '.djvu', '.mobi')):
download_url = href if href.startswith('http') else urljoin(final_url, href)
debug(f"Found download link: {download_url}")
return download_url
except ImportError:
pass # BeautifulSoup not available
# If we followed redirects successfully, return the final URL
# This handles cases where libgen redirects to a direct download mirror
if final_url != libgen_url:
debug(f"LibGen resolved to mirror: {final_url}")
return final_url
except requests.RequestException as e:
log(f"Error following LibGen redirects: {e}", file=sys.stderr)
# Try head request as fallback
try:
response = session.head(libgen_url, allow_redirects=True, timeout=10)
if response.url != libgen_url:
debug(f"LibGen HEAD resolved to: {response.url}")
return response.url
        except Exception:
            pass
return None
except Exception as e:
log(f"Error resolving LibGen URL: {e}", file=sys.stderr)
return None
def _download_direct_file(
url: str,
output_dir: Path,
debug_logger: Optional[DebugLogger] = None,
quiet: bool = False,
) -> DownloadMediaResult:
"""Download a direct file (PDF, image, document, etc.) without yt-dlp."""
ensure_directory(output_dir)
    from urllib.parse import unquote, urlparse, parse_qs
# Extract filename from URL
parsed_url = urlparse(url)
url_path = parsed_url.path
# Try to get filename from query parameters first (for LibGen and similar services)
# e.g., ?filename=Book+Title.pdf or &download=filename.pdf
filename = None
if parsed_url.query:
query_params = parse_qs(parsed_url.query)
for param_name in ('filename', 'download', 'file', 'name'):
if param_name in query_params and query_params[param_name]:
filename = query_params[param_name][0]
filename = unquote(filename)
break
# If not found in query params, extract from URL path
if not filename or not filename.strip():
filename = url_path.split("/")[-1] if url_path else ""
filename = unquote(filename)
# Remove query strings from filename if any
if "?" in filename:
filename = filename.split("?")[0]
# Try to get real filename from Content-Disposition header (HEAD request)
try:
with HTTPClient(timeout=10.0) as client:
response = client._request("HEAD", url, follow_redirects=True)
content_disposition = response.headers.get("content-disposition", "")
if content_disposition:
# Extract filename from Content-Disposition header
# Format: attachment; filename="filename.pdf" or filename=filename.pdf
match = re.search(r'filename\*?=(?:"([^"]*)"|([^;\s]*))', content_disposition)
if match:
extracted_name = match.group(1) or match.group(2)
if extracted_name:
filename = unquote(extracted_name)
if not quiet:
debug(f"Filename from Content-Disposition: {filename}")
except Exception as e:
if not quiet:
log(f"Could not get filename from headers: {e}", file=sys.stderr)
# Fallback if we still don't have a good filename
if not filename or "." not in filename:
filename = "downloaded_file.bin"
file_path = output_dir / filename
progress_bar = ProgressBar()
if not quiet:
debug(f"Direct download: {filename}")
try:
start_time = time.time()
downloaded_bytes = [0]
total_bytes = [0]
last_progress_time = [start_time]
def progress_callback(bytes_downloaded: int, content_length: int) -> None:
downloaded_bytes[0] = bytes_downloaded
total_bytes[0] = content_length
now = time.time()
if now - last_progress_time[0] >= 0.5 and total_bytes[0] > 0:
elapsed = now - start_time
percent = (bytes_downloaded / content_length) * 100 if content_length > 0 else 0
speed = bytes_downloaded / elapsed if elapsed > 0 else 0
eta_seconds = (content_length - bytes_downloaded) / speed if speed > 0 else 0
speed_str = progress_bar.format_bytes(speed) + "/s"
minutes, seconds = divmod(int(eta_seconds), 60)
hours, minutes = divmod(minutes, 60)
eta_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
progress_line = progress_bar.format_progress(
percent_str=f"{percent:.1f}%",
downloaded=bytes_downloaded,
total=content_length,
speed_str=speed_str,
eta_str=eta_str,
)
if not quiet:
debug(progress_line)
last_progress_time[0] = now
with HTTPClient(timeout=30.0) as client:
client.download(url, str(file_path), progress_callback=progress_callback)
elapsed = time.time() - start_time
avg_speed_str = progress_bar.format_bytes(downloaded_bytes[0] / elapsed if elapsed > 0 else 0) + "/s"
if not quiet:
debug(f"✓ Downloaded in {elapsed:.1f}s at {avg_speed_str}")
# For direct file downloads, create minimal info dict without filename as title
# This prevents creating duplicate title: tags when filename gets auto-generated
# We'll add title back later only if we couldn't extract meaningful tags
info = {
"id": filename.rsplit(".", 1)[0],
"ext": filename.rsplit(".", 1)[1] if "." in filename else "bin",
"webpage_url": url,
}
hash_value = None
try:
hash_value = sha256_file(file_path)
except Exception:
pass
tags = []
if extract_ytdlp_tags:
try:
tags = extract_ytdlp_tags(info)
except Exception as e:
log(f"Error extracting tags: {e}", file=sys.stderr)
# Only use filename as a title tag if we couldn't extract any meaningful tags
# This prevents duplicate title: tags when the filename could be mistaken for metadata
if not any(t.startswith('title:') for t in tags):
# Re-extract tags with filename as title only if needed
info['title'] = filename
tags = []
if extract_ytdlp_tags:
try:
tags = extract_ytdlp_tags(info)
except Exception as e:
log(f"Error extracting tags with filename: {e}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"direct-file-downloaded",
{"url": url, "path": str(file_path), "hash": hash_value},
)
return DownloadMediaResult(
path=file_path,
info=info,
tags=tags,
source_url=url,
hash_value=hash_value,
)
except (httpx.HTTPError, httpx.RequestError) as exc:
log(f"Download error: {exc}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"exception",
{"phase": "direct-file", "url": url, "error": str(exc)},
)
raise DownloadError(f"Failed to download {url}: {exc}") from exc
except Exception as exc:
log(f"Error downloading file: {exc}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"exception",
{
"phase": "direct-file",
"url": url,
"error": str(exc),
"traceback": traceback.format_exc(),
},
)
raise DownloadError(f"Error downloading file: {exc}") from exc
def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) -> Optional[Dict[str, Any]]:
"""Probe URL to extract metadata WITHOUT downloading.
Args:
url: URL to probe
no_playlist: If True, ignore playlists and probe only the single video
timeout_seconds: Max seconds to wait for probe (default 15s)
Returns:
Dict with keys: extractor, title, entries (if playlist), duration, etc.
Returns None if not supported by yt-dlp or on timeout.
"""
if not is_url_supported_by_ytdlp(url):
return None
# Wrap probe in timeout to prevent hanging on large playlists
import threading
from typing import cast
result_container: List[Optional[Any]] = [None, None] # [result, error]
def _do_probe() -> None:
try:
_ensure_yt_dlp_ready()
assert yt_dlp is not None
# Extract info without downloading
# Use extract_flat='in_playlist' to get full metadata for playlist items
ydl_opts = {
"quiet": True, # Suppress all output
"no_warnings": True,
"socket_timeout": 10,
"retries": 2, # Reduce retries for faster timeout
"skip_download": True, # Don't actually download
"extract_flat": "in_playlist", # Get playlist with metadata for each entry
"noprogress": True, # No progress bars
}
# Add cookies if available (lazy import to avoid circular dependency)
from hydrus_health_check import get_cookies_file_path # local import
global_cookies = get_cookies_file_path()
if global_cookies:
ydl_opts["cookiefile"] = global_cookies
# Add no_playlist option if specified
if no_playlist:
ydl_opts["noplaylist"] = True
with yt_dlp.YoutubeDL(ydl_opts) as ydl: # type: ignore[arg-type]
info = ydl.extract_info(url, download=False)
if not isinstance(info, dict):
result_container[0] = None
return
# Extract relevant fields
result_container[0] = {
"extractor": info.get("extractor", ""),
"title": info.get("title", ""),
"entries": info.get("entries", []), # Will be populated if playlist
"duration": info.get("duration"),
"uploader": info.get("uploader"),
"description": info.get("description"),
"url": url,
}
except Exception as exc:
log(f"Probe error for {url}: {exc}")
result_container[1] = exc
    # Daemon thread: a probe that hangs past the timeout must not block interpreter exit.
    thread = threading.Thread(target=_do_probe, daemon=True)
thread.start()
thread.join(timeout=timeout_seconds)
if thread.is_alive():
# Probe timed out - return None to fall back to direct download
debug(f"Probe timeout for {url} (>={timeout_seconds}s), proceeding with download")
return None
if result_container[1] is not None:
# Probe error - return None to proceed anyway
return None
return cast(Optional[Dict[str, Any]], result_container[0])
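# e.g. probe_url("https://www.youtube.com/watch?v=...", no_playlist=True) returns a
# dict shaped like {"extractor": "youtube", "title": "...", "entries": [], ...},
# or None on unsupported URLs, probe errors, or timeout.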
__all__ = [
"is_url_supported_by_ytdlp",
"list_formats",
"probe_url",
"DownloadError",
"DownloadOptions",
"DownloadMediaResult",
]

180
SYS/file_server.py Normal file

@@ -0,0 +1,180 @@
"""Simple HTTP file server for serving files in web mode."""
import threading
import socket
import logging
from http.server import HTTPServer, SimpleHTTPRequestHandler
from pathlib import Path
from typing import Optional
import mimetypes
import urllib.parse
logger = logging.getLogger(__name__)
# Global server instance
_file_server: Optional[HTTPServer] = None
_server_thread: Optional[threading.Thread] = None
_server_port: int = 8001
class FileServerHandler(SimpleHTTPRequestHandler):
"""HTTP request handler for file serving."""
def do_GET(self):
"""Handle GET requests."""
# Parse the path
parsed_path = urllib.parse.urlparse(self.path)
file_path = urllib.parse.unquote(parsed_path.path)
# Remove leading slash
if file_path.startswith('/'):
file_path = file_path[1:]
# Decode the file path (it's URL encoded)
try:
full_path = Path(file_path).resolve()
# Security check: ensure the path is within allowed directories
# For now, allow all paths (can be restricted later)
            if full_path.is_file():
# Serve the file
logger.debug(f"Serving file: {full_path}")
# Determine content type
content_type, _ = mimetypes.guess_type(str(full_path))
if content_type is None:
content_type = 'application/octet-stream'
try:
with open(full_path, 'rb') as f:
file_content = f.read()
self.send_response(200)
self.send_header('Content-type', content_type)
self.send_header('Content-Length', str(len(file_content)))
self.send_header('Content-Disposition', f'attachment; filename="{full_path.name}"')
self.end_headers()
self.wfile.write(file_content)
logger.info(f"Successfully served file: {full_path.name}")
return
except Exception as e:
logger.error(f"Error serving file: {e}")
self.send_error(500, "Internal server error")
return
else:
logger.warning(f"File not found: {full_path}")
self.send_error(404, "File not found")
return
except Exception as e:
logger.error(f"Error handling request: {e}")
self.send_error(400, "Bad request")
def log_message(self, format, *args):
"""Override to use our logger instead of stderr."""
logger.debug(format % args)
def get_local_ip() -> Optional[str]:
"""Get the local IP address that's accessible from other devices."""
try:
# Connect to a remote server to determine local IP
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            s.connect(("8.8.8.8", 80))
            return s.getsockname()[0]
        finally:
            s.close()
except Exception as e:
logger.warning(f"Failed to determine local IP: {e}")
return None
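# Note: connect() on a UDP socket transmits nothing; it only selects the local
# interface that would route toward 8.8.8.8, which is the address reported above.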
def start_file_server(port: int = 8001) -> Optional[str]:
"""Start the HTTP file server.
Args:
port: Port to serve on
Returns:
Server URL if successful, None otherwise
"""
global _file_server, _server_thread, _server_port
if _file_server is not None:
logger.debug(f"File server already running on port {_server_port}")
local_ip = get_local_ip()
if local_ip:
return f"http://{local_ip}:{_server_port}"
return None
try:
_server_port = port
# Create server
server_address = ('', port)
_file_server = HTTPServer(server_address, FileServerHandler)
# Start in daemon thread
_server_thread = threading.Thread(target=_file_server.serve_forever, daemon=True)
_server_thread.start()
logger.info(f"File server started on port {port}")
# Get local IP
local_ip = get_local_ip()
if local_ip:
server_url = f"http://{local_ip}:{port}"
logger.info(f"File server accessible at: {server_url}")
return server_url
else:
logger.warning("Could not determine local IP")
return None
except Exception as e:
logger.error(f"Failed to start file server: {e}")
_file_server = None
_server_thread = None
return None
def stop_file_server():
"""Stop the HTTP file server."""
global _file_server, _server_thread
if _file_server is not None:
try:
_file_server.shutdown()
_file_server.server_close()
logger.info("File server stopped")
except Exception as e:
logger.error(f"Error stopping file server: {e}")
finally:
_file_server = None
_server_thread = None
def get_file_url(file_path: Path, server_url: Optional[str] = None) -> Optional[str]:
"""Get the HTTP URL for a file.
Args:
file_path: Path to the file
server_url: Base server URL (gets determined if None)
Returns:
HTTP URL to the file, or None if server not running
"""
if not file_path.exists():
logger.warning(f"File does not exist: {file_path}")
return None
if server_url is None:
local_ip = get_local_ip()
if not local_ip:
logger.error("Cannot determine local IP for file URL")
return None
server_url = f"http://{local_ip}:{_server_port}"
# URL encode the file path
encoded_path = urllib.parse.quote(str(file_path.resolve()))
return f"{server_url}/{encoded_path}"

104
SYS/logger.py Normal file

@@ -0,0 +1,104 @@
"""Unified logging utility for automatic file and function name tracking."""
import sys
import inspect
import threading
from pathlib import Path
_DEBUG_ENABLED = False
_thread_local = threading.local()
def set_thread_stream(stream):
"""Set a custom output stream for the current thread."""
_thread_local.stream = stream
def get_thread_stream():
"""Get the custom output stream for the current thread, if any."""
return getattr(_thread_local, 'stream', None)
def set_debug(enabled: bool) -> None:
"""Enable or disable debug logging."""
global _DEBUG_ENABLED
_DEBUG_ENABLED = enabled
def is_debug_enabled() -> bool:
"""Check if debug logging is enabled."""
return _DEBUG_ENABLED
def debug(*args, **kwargs) -> None:
"""Print debug message if debug logging is enabled.
Automatically prepends [filename.function_name] to all output.
"""
if not _DEBUG_ENABLED:
return
# Check if stderr has been redirected to /dev/null (quiet mode)
# If so, skip output to avoid queuing in background worker's capture
try:
stderr_name = getattr(sys.stderr, 'name', '')
if 'nul' in str(stderr_name).lower() or '/dev/null' in str(stderr_name):
return
except Exception:
pass
# Check for thread-local stream first
stream = get_thread_stream()
if stream:
kwargs['file'] = stream
# Set default to stderr for debug messages
elif 'file' not in kwargs:
kwargs['file'] = sys.stderr
# Prepend DEBUG label
args = ("DEBUG:", *args)
# Use the same logic as log()
log(*args, **kwargs)
def log(*args, **kwargs) -> None:
"""Print with automatic file.function prefix.
Automatically prepends [filename.function_name] to all output.
Defaults to stdout if not specified.
Example:
log("Upload started") # Output: [add_file.run] Upload started
"""
# When debug is disabled, suppress the automatic prefix for cleaner user-facing output.
add_prefix = _DEBUG_ENABLED
# Get the calling frame
frame = inspect.currentframe()
if frame is None:
print(*args, **kwargs)
return
    caller_frame = frame.f_back
    # Skip frames inside this module so debug() reports its real caller, not itself.
    logger_file = frame.f_code.co_filename
    while caller_frame is not None and caller_frame.f_code.co_filename == logger_file:
        caller_frame = caller_frame.f_back
    if caller_frame is None:
        print(*args, **kwargs)
        return
try:
# Get file name without extension
file_name = Path(caller_frame.f_code.co_filename).stem
# Get function name
func_name = caller_frame.f_code.co_name
# Check for thread-local stream first
stream = get_thread_stream()
if stream:
kwargs['file'] = stream
# Set default to stdout if not specified
elif 'file' not in kwargs:
kwargs['file'] = sys.stdout
if add_prefix:
prefix = f"[{file_name}.{func_name}]"
print(prefix, *args, **kwargs)
else:
print(*args, **kwargs)
finally:
del frame
del caller_frame
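# Example: route one thread's output into a buffer (useful for worker capture):
#   import io
#   buf = io.StringIO()
#   set_thread_stream(buf)
#   log("captured")            # written to buf instead of stdout
#   set_thread_stream(None)    # restore default streams for this thread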

102
SYS/progress.py Normal file

@@ -0,0 +1,102 @@
#!/usr/bin/env python3
"""Text-based progress bar utilities for consistent display across all downloads."""
import sys
from SYS.logger import log, debug
def format_progress_bar(current: int, total: int, width: int = 40, label: str = "") -> str:
"""Create a text-based progress bar.
Args:
current: Current progress (bytes/items)
total: Total to complete (bytes/items)
width: Width of the bar in characters (default 40)
label: Optional label prefix
Returns:
Formatted progress bar string
Examples:
format_progress_bar(50, 100)
# Returns: "[████████████████░░░░░░░░░░░░░░░░░░░░] 50.0%"
format_progress_bar(256*1024*1024, 1024*1024*1024, label="download.zip")
# Returns: "download.zip: [████████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░] 25.0%"
"""
if total <= 0:
percentage = 0
filled = 0
else:
percentage = (current / total) * 100
filled = int((current / total) * width)
bar = "" * filled + "" * (width - filled)
pct_str = f"{percentage:.1f}%"
if label:
result = f"{label}: [{bar}] {pct_str}"
else:
result = f"[{bar}] {pct_str}"
return result
def format_size(bytes_val: float) -> str:
"""Format bytes to human-readable size."""
for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
if bytes_val < 1024:
return f"{bytes_val:.2f} {unit}"
bytes_val /= 1024
return f"{bytes_val:.2f} PB"
def format_download_status(filename: str, current: int, total: int, speed: float = 0) -> str:
"""Format download status with progress bar and details."""
bar = format_progress_bar(current, total, width=30)
size_current = format_size(current)
size_total = format_size(total)
if speed > 0:
speed_str = f" @ {format_size(speed)}/s"
else:
speed_str = ""
return f"{bar} ({size_current} / {size_total}{speed_str})"
def print_progress(filename: str, current: int, total: int, speed: float = 0, end: str = "\r") -> None:
"""Print download progress to stderr (doesn't interfere with piped output)."""
status = format_download_status(filename, current, total, speed)
debug(status, end=end, flush=True)
def print_final_progress(filename: str, total: int, elapsed: float) -> None:
"""Print final progress line (100%) with time elapsed."""
bar = format_progress_bar(total, total, width=30)
size_str = format_size(total)
if elapsed < 60:
time_str = f"{elapsed:.1f}s"
elif elapsed < 3600:
minutes = elapsed / 60
time_str = f"{minutes:.1f}m"
else:
hours = elapsed / 3600
time_str = f"{hours:.2f}h"
debug(f"{bar} ({size_str}) - {time_str}")
if __name__ == "__main__":
import time
log("Progress Bar Demo:", file=sys.stderr)
for i in range(101):
print_progress("demo.bin", i * 10 * 1024 * 1024, 1024 * 1024 * 1024)
time.sleep(0.02)
print_final_progress("demo.bin", 1024 * 1024 * 1024, 2.0)
log()

155
SYS/tasks.py Normal file

@@ -0,0 +1,155 @@
"""Background task handling and IPC helpers for mpv integration."""
from __future__ import annotations
import errno
import json
import os
import socket
import subprocess
import sys
from SYS.logger import log
import threading
import time
from typing import IO, Iterable
def connect_ipc(path: str, timeout: float = 5.0) -> IO[bytes] | None:
"""Connect to the mpv IPC server located at *path*."""
deadline = time.time() + timeout
if not path:
return None
if os.name == 'nt':
# mpv exposes a named pipe on Windows. Keep retrying until it is ready.
while True:
try:
return open(path, 'r+b', buffering=0)
except FileNotFoundError:
if time.time() > deadline:
return None
time.sleep(0.05)
except OSError as exc: # Pipe busy
if exc.errno not in (errno.ENOENT, errno.EPIPE, errno.EBUSY):
raise
if time.time() > deadline:
return None
time.sleep(0.05)
else:
sock = socket.socket(socket.AF_UNIX)
while True:
try:
sock.connect(path)
return sock.makefile('r+b', buffering=0)
except FileNotFoundError:
if time.time() > deadline:
return None
time.sleep(0.05)
except OSError as exc:
if exc.errno not in (errno.ENOENT, errno.ECONNREFUSED):
raise
if time.time() > deadline:
return None
time.sleep(0.05)
def ipc_sender(ipc: IO[bytes] | None):
"""Create a helper function for sending script messages via IPC."""
if ipc is None:
def _noop(_event: str, _payload: dict) -> None:
return None
return _noop
lock = threading.Lock()
def _send(event: str, payload: dict) -> None:
message = json.dumps({'command': ['script-message', event, json.dumps(payload)]}, ensure_ascii=False)
encoded = message.encode('utf-8') + b'\n'
with lock:
try:
ipc.write(encoded)
ipc.flush()
except OSError:
pass
return _send
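# Each message written to the pipe is newline-terminated JSON of the form:
#   {"command": ["script-message", "<event>", "<payload as a JSON string>"]}
# which mpv dispatches to script-message handlers registered for <event>.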
def iter_stream(stream: Iterable[str]) -> Iterable[str]:
for raw in stream:
yield raw.rstrip('\r\n')
def _run_task(args, parser) -> int:
if not args.command:
parser.error('run-task requires a command to execute (use "--" before the command).')
env = os.environ.copy()
for entry in args.env:
key, sep, value = entry.partition('=')
if not sep:
parser.error(f'Invalid environment variable definition: {entry!r}')
env[key] = value
command = list(args.command)
if command and command[0] == '--':
command.pop(0)
notifier = ipc_sender(connect_ipc(args.ipc, timeout=args.ipc_timeout))
if not command:
notifier('downlow-task-event', {
'id': args.task_id,
'event': 'error',
'message': 'No command provided after separator',
})
log('[downlow.py] No command provided for run-task', file=sys.stderr)
return 1
if command and isinstance(command[0], str) and sys.executable:
first = command[0].lower()
if first in {'python', 'python3', 'py', 'python.exe', 'python3.exe', 'py.exe'}:
command[0] = sys.executable
if os.environ.get('DOWNLOW_DEBUG'):
log(f"Launching command: {command}", file=sys.stderr)
notifier('downlow-task-event', {
'id': args.task_id,
'event': 'start',
'command': command,
'cwd': args.cwd or os.getcwd(),
})
try:
process = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
cwd=args.cwd or None,
env=env,
            text=True,
            bufsize=1,
)
except FileNotFoundError as exc:
notifier('downlow-task-event', {
'id': args.task_id,
'event': 'error',
'message': f'Executable not found: {exc.filename}',
})
log(f"{exc}", file=sys.stderr)
return 1
stdout_lines: list[str] = []
stderr_lines: list[str] = []
def pump(stream: IO[str], label: str, sink: list[str]) -> None:
for line in iter_stream(stream):
sink.append(line)
notifier('downlow-task-event', {
'id': args.task_id,
'event': label,
'line': line,
})
threads = []
if process.stdout:
t_out = threading.Thread(target=pump, args=(process.stdout, 'stdout', stdout_lines), daemon=True)
t_out.start()
threads.append(t_out)
if process.stderr:
t_err = threading.Thread(target=pump, args=(process.stderr, 'stderr', stderr_lines), daemon=True)
t_err.start()
threads.append(t_err)
return_code = process.wait()
for t in threads:
t.join(timeout=0.1)
notifier('downlow-task-event', {
'id': args.task_id,
'event': 'exit',
'returncode': return_code,
'success': return_code == 0,
})
# Also mirror aggregated output to stdout/stderr for compatibility when IPC is unavailable.
if stdout_lines:
log('\n'.join(stdout_lines))
if stderr_lines:
log('\n'.join(stderr_lines), file=sys.stderr)
return return_code

492
SYS/utils.py Normal file

@@ -0,0 +1,492 @@
"""General-purpose helpers used across the downlow CLI."""
from __future__ import annotations
import json
import hashlib
import ffmpeg
import base64
import logging
import time
from pathlib import Path
from typing import Any, Iterable
from datetime import datetime
from dataclasses import dataclass, field
from fnmatch import fnmatch
from urllib.parse import urlparse
import SYS.utils_constant
try:
import cbor2
except ImportError:
cbor2 = None # type: ignore
CHUNK_SIZE = 1024 * 1024 # 1 MiB
_format_logger = logging.getLogger(__name__)
def ensure_directory(path: Path) -> None:
"""Ensure *path* exists as a directory."""
try:
path.mkdir(parents=True, exist_ok=True)
except OSError as exc: # pragma: no cover - surfaced to caller
raise RuntimeError(f"Failed to create directory {path}: {exc}") from exc
def unique_path(path: Path) -> Path:
"""Return a unique path by appending " (n)" if needed."""
if not path.exists():
return path
stem = path.stem
suffix = path.suffix
parent = path.parent
counter = 1
while True:
candidate = parent / f"{stem} ({counter}){suffix}"
if not candidate.exists():
return candidate
counter += 1
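# e.g. unique_path(Path("out/video.mp4")) -> Path("out/video (1).mp4") when the
# original exists, then "out/video (2).mp4", and so on.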
def sanitize_metadata_value(value: Any) -> str | None:
if value is None:
return None
if not isinstance(value, str):
value = str(value)
value = value.replace('\x00', ' ').replace('\r', ' ').replace('\n', ' ').strip()
if not value:
return None
return value
def unique_preserve_order(values: Iterable[str]) -> list[str]:
seen: set[str] = set()
ordered: list[str] = []
for value in values:
if value not in seen:
seen.add(value)
ordered.append(value)
return ordered
def sha256_file(file_path: Path) -> str:
"""Return the SHA-256 hex digest of *path*."""
hasher = hashlib.sha256()
with file_path.open('rb') as handle:
for chunk in iter(lambda: handle.read(CHUNK_SIZE), b''):
hasher.update(chunk)
return hasher.hexdigest()
def create_metadata_sidecar(file_path: Path, metadata: dict) -> None:
"""Create a .metadata sidecar file with JSON metadata.
The metadata dict should contain title. If not present, it will be derived from
the filename. This ensures the .metadata file can be matched during batch import.
Args:
file_path: Path to the exported file
metadata: Dictionary of metadata to save
"""
if not metadata:
return
file_name = file_path.stem
file_ext = file_path.suffix.lower()
# Ensure metadata has a title field that matches the filename (without extension)
# This allows the sidecar to be matched and imported properly during batch import
if 'title' not in metadata or not metadata.get('title'):
metadata['title'] = file_name
metadata['hash'] = sha256_file(file_path)
metadata['size'] = Path(file_path).stat().st_size
format_found = False
for mime_type, ext_map in SYS.utils_constant.mime_maps.items():
for key, info in ext_map.items():
if info.get("ext") == file_ext:
metadata['type'] = mime_type
format_found = True
break
if format_found:
break
else:
metadata['type'] = 'unknown'
    try:
        metadata.update(ffprobe(str(file_path)))
    except Exception:
        # Non-media files (e.g. documents) have no streams ffprobe can parse; skip quietly.
        pass
metadata_path = file_path.with_suffix(file_path.suffix + '.metadata')
try:
with open(metadata_path, 'w', encoding='utf-8') as f:
json.dump(metadata, f, ensure_ascii=False, indent=2)
except OSError as exc:
raise RuntimeError(f"Failed to write metadata sidecar {metadata_path}: {exc}") from exc
def create_tags_sidecar(file_path: Path, tags: set) -> None:
"""Create a .tags sidecar file with tags (one per line).
Args:
file_path: Path to the exported file
tags: Set of tag strings
"""
if not tags:
return
tags_path = file_path.with_suffix(file_path.suffix + '.tags')
try:
with open(tags_path, 'w', encoding='utf-8') as f:
for tag in sorted(tags):
f.write(f"{tag}\n")
except Exception as e:
raise RuntimeError(f"Failed to create tags sidecar {tags_path}: {e}") from e
def ffprobe(file_path: str) -> dict:
probe = ffmpeg.probe(file_path)
metadata = {}
# Format-level info
fmt = probe.get("format", {})
metadata["duration"] = float(fmt.get("duration", 0)) if "duration" in fmt else None
metadata["size"] = int(fmt.get("size", 0)) if "size" in fmt else None
metadata["format_name"] = fmt.get("format_name", None)
# Stream-level info
for stream in probe.get("streams", []):
codec_type = stream.get("codec_type")
if codec_type == "audio":
metadata["audio_codec"] = stream.get("codec_name")
metadata["bitrate"] = int(stream.get("bit_rate", 0)) if "bit_rate" in stream else None
metadata["samplerate"] = int(stream.get("sample_rate", 0)) if "sample_rate" in stream else None
metadata["channels"] = int(stream.get("channels", 0)) if "channels" in stream else None
elif codec_type == "video":
metadata["video_codec"] = stream.get("codec_name")
metadata["width"] = int(stream.get("width", 0)) if "width" in stream else None
metadata["height"] = int(stream.get("height", 0)) if "height" in stream else None
elif codec_type == "image":
metadata["image_codec"] = stream.get("codec_name")
metadata["width"] = int(stream.get("width", 0)) if "width" in stream else None
metadata["height"] = int(stream.get("height", 0)) if "height" in stream else None
return metadata
# ============================================================================
# CBOR Utilities - Consolidated from cbor.py
# ============================================================================
"""CBOR utilities backed by the `cbor2` library."""
def decode_cbor(data: bytes) -> Any:
"""Decode *data* from CBOR into native Python objects."""
if not data:
return None
if cbor2 is None:
raise ImportError("cbor2 library is required for CBOR decoding")
return cbor2.loads(data)
def jsonify(value: Any) -> Any:
"""Convert *value* into a JSON-friendly structure."""
if isinstance(value, dict):
return {str(key): jsonify(val) for key, val in value.items()}
if isinstance(value, list):
return [jsonify(item) for item in value]
if isinstance(value, bytes):
return {"__bytes__": base64.b64encode(value).decode("ascii")}
return value
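# e.g. jsonify({"k": b"\x00\x01"}) -> {"k": {"__bytes__": "AAE="}}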
# ============================================================================
# Format Utilities - Consolidated from format_utils.py
# ============================================================================
"""Formatting utilities for displaying metadata consistently across the application."""
def format_bytes(bytes_value) -> str:
"""Format bytes to human-readable format (e.g., '1.5 MB', '250 KB').
Args:
bytes_value: Size in bytes (int or float)
Returns:
Formatted string like '1.5 MB' or '756 MB'
"""
    if bytes_value is None:
        return "0 B"
    if not isinstance(bytes_value, (int, float)):
        return str(bytes_value)
    if bytes_value <= 0:
        return "0 B"
    for unit in ("B", "KB", "MB", "GB", "TB"):
        if bytes_value < 1024:
            if unit == "B":
                return f"{int(bytes_value)} {unit}"
            return f"{bytes_value:.1f} {unit}"
        bytes_value /= 1024
    return f"{bytes_value:.1f} PB"
def format_duration(seconds) -> str:
"""Format duration in seconds to human-readable format (e.g., '1h 23m 5s', '5m 30s').
Args:
seconds: Duration in seconds (int or float)
Returns:
Formatted string like '1:23:45' or '5:30'
"""
if seconds is None or seconds == '':
return "N/A"
if isinstance(seconds, str):
try:
seconds = float(seconds)
except ValueError:
return str(seconds)
if not isinstance(seconds, (int, float)):
return str(seconds)
total_seconds = int(seconds)
if total_seconds < 0:
return "N/A"
hours = total_seconds // 3600
minutes = (total_seconds % 3600) // 60
secs = total_seconds % 60
if hours > 0:
return f"{hours}:{minutes:02d}:{secs:02d}"
elif minutes > 0:
return f"{minutes}:{secs:02d}"
else:
return f"{secs}s"
def format_timestamp(timestamp_str) -> str:
"""Format ISO timestamp to readable format.
Args:
timestamp_str: ISO format timestamp string or None
Returns:
Formatted string like "2025-10-28 19:36:01" or original string if parsing fails
"""
if not timestamp_str:
return "N/A"
try:
# Handle ISO format timestamps
if isinstance(timestamp_str, str):
# Try parsing ISO format
if 'T' in timestamp_str:
dt = datetime.fromisoformat(timestamp_str.replace('Z', '+00:00'))
else:
# Try other common formats
dt = datetime.fromisoformat(timestamp_str)
return dt.strftime("%Y-%m-%d %H:%M:%S")
except Exception as e:
_format_logger.debug(f"Could not parse timestamp '{timestamp_str}': {e}")
return str(timestamp_str)
def format_metadata_value(key: str, value) -> str:
"""Format a metadata value based on its key for display.
This is the central formatting rule for all metadata display.
Args:
key: Metadata field name
value: Value to format
Returns:
Formatted string for display
"""
if value is None or value == '':
return "N/A"
# Apply field-specific formatting
if key in ('size', 'file_size'):
return format_bytes(value)
elif key in ('duration', 'length'):
return format_duration(value)
elif key in ('time_modified', 'time_imported', 'created_at', 'updated_at', 'indexed_at', 'timestamp'):
return format_timestamp(value)
else:
return str(value)
# ============================================================================
# Link Utilities - Consolidated from link_utils.py
# ============================================================================
"""Link utilities - Extract and process url from various sources."""
def extract_link_from_args(args: Iterable[str]) -> Any | None:
"""Extract HTTP/HTTPS URL from command arguments.
Args:
args: Command arguments
Returns:
URL string if found, None otherwise
"""
    args_list = args if isinstance(args, (list, tuple)) else list(args)
    if not args_list:
return None
potential_link = str(args_list[0])
if potential_link.startswith(('http://', 'https://')):
return potential_link
return None
def extract_link_from_result(result: Any) -> Any | None:
"""Extract URL from a result object (dict or object with attributes).
Args:
result: Result object from pipeline (dict or object)
Returns:
URL string if found, None otherwise
"""
if isinstance(result, dict):
return result.get('url') or result.get('link') or result.get('href')
return (
getattr(result, 'url', None) or
getattr(result, 'link', None) or
getattr(result, 'href', None)
)
def extract_link(result: Any, args: Iterable[str]) -> Any | None:
"""Extract link from args or result (args take priority).
Args:
result: Pipeline result object
args: Command arguments
Returns:
URL string if found, None otherwise
"""
# Try args first
link = extract_link_from_args(args)
if link:
return link
# Fall back to result
return extract_link_from_result(result)
def get_api_key(config: dict[str, Any], service: str, key_path: str) -> str | None:
"""Get API key from config with fallback support.
Args:
config: Configuration dictionary
service: Service name for logging
key_path: Dot-notation path to key (e.g., "Debrid.All-debrid")
Returns:
API key if found and not empty, None otherwise
"""
try:
parts = key_path.split('.')
value = config
for part in parts:
if isinstance(value, dict):
value = value.get(part)
else:
return None
if isinstance(value, str):
return value.strip() or None
return None
except Exception:
return None
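# e.g. get_api_key({"Debrid": {"All-debrid": " KEY "}}, "alldebrid", "Debrid.All-debrid")
#   -> "KEY" (whitespace stripped; empty strings collapse to None)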
def add_direct_link_to_result(result: Any, direct_link: str, original_link: str) -> None:
"""Add direct link information to result object.
Args:
result: Result object to modify (dict or object)
direct_link: The unlocked/direct URL
original_link: The original restricted URL
"""
if isinstance(result, dict):
result['direct_link'] = direct_link
result['original_link'] = original_link
else:
setattr(result, 'direct_link', direct_link)
setattr(result, 'original_link', original_link)
# ============================================================================
# URL Policy Resolution - Consolidated from url_parser.py
# ============================================================================
"""URL policy resolution for downlow workflows."""
@dataclass(slots=True)
class UrlPolicy:
"""Describe how a URL should be handled by download and screenshot flows."""
skip_download: bool = False
skip_metadata: bool = False
force_screenshot: bool = False
extra_tags: list[str] = field(default_factory=list)
def apply_tags(self, sources: Iterable[str]) -> list[str]:
tags = [tag.strip() for tag in self.extra_tags if tag and tag.strip()]
for value in sources:
text = str(value).strip()
if text:
tags.append(text)
return tags
def _normalise_rule(rule: dict[str, Any]) -> dict[str, Any] | None:
pattern = str(rule.get("pattern") or rule.get("host") or "").strip()
if not pattern:
return None
skip_download = bool(rule.get("skip_download"))
skip_metadata = bool(rule.get("skip_metadata"))
force_screenshot = bool(rule.get("force_screenshot"))
extra_tags_raw = rule.get("extra_tags")
if isinstance(extra_tags_raw, str):
extra_tags = [part.strip() for part in extra_tags_raw.split(",") if part.strip()]
elif isinstance(extra_tags_raw, (list, tuple, set)):
extra_tags = [str(item).strip() for item in extra_tags_raw if str(item).strip()]
else:
extra_tags = []
return {
"pattern": pattern,
"skip_download": skip_download,
"skip_metadata": skip_metadata,
"force_screenshot": force_screenshot,
"extra_tags": extra_tags,
}
def resolve_url_policy(config: dict[str, Any], url: str) -> UrlPolicy:
policies_raw = config.get("url_policies")
if not policies_raw:
return UrlPolicy()
if not isinstance(policies_raw, list):
return UrlPolicy()
parsed = urlparse(url)
subject = f"{parsed.netloc}{parsed.path}"
host = parsed.netloc
resolved = UrlPolicy()
for rule_raw in policies_raw:
if not isinstance(rule_raw, dict):
continue
rule = _normalise_rule(rule_raw)
if rule is None:
continue
pattern = rule["pattern"]
if not (fnmatch(host, pattern) or fnmatch(subject, pattern)):
continue
if rule["skip_download"]:
resolved.skip_download = True
if rule["skip_metadata"]:
resolved.skip_metadata = True
if rule["force_screenshot"]:
resolved.force_screenshot = True
if rule["extra_tags"]:
for tag in rule["extra_tags"]:
if tag not in resolved.extra_tags:
resolved.extra_tags.append(tag)
return resolved
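# Worked example (a sketch; the rule values are hypothetical): patterns are
# fnmatch globs tested against both the bare host and host+path, and every
# matching rule contributes to the resolved policy.
#
#   _config = {"url_policies": [
#       {"pattern": "*.example.com", "skip_download": True},
#       {"pattern": "*/gallery/*", "force_screenshot": True,
#        "extra_tags": "gallery, auto"},
#   ]}
#   _p = resolve_url_policy(_config, "https://cdn.example.com/gallery/42")
#   # _p.skip_download and _p.force_screenshot are both True;
#   # _p.extra_tags == ["gallery", "auto"]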

100
SYS/utils_constant.py Normal file
View File

@@ -0,0 +1,100 @@
mime_maps = {
"image": {
"jpg": { "ext": ".jpg", "mimes": ["image/jpeg", "image/jpg"] },
"png": { "ext": ".png", "mimes": ["image/png"] },
"gif": { "ext": ".gif", "mimes": ["image/gif"] },
"webp": { "ext": ".webp", "mimes": ["image/webp"] },
"avif": { "ext": ".avif", "mimes": ["image/avif"] },
"jxl": { "ext": ".jxl", "mimes": ["image/jxl"] },
"bmp": { "ext": ".bmp", "mimes": ["image/bmp"] },
"heic": { "ext": ".heic", "mimes": ["image/heic"] },
"heif": { "ext": ".heif", "mimes": ["image/heif"] },
"ico": { "ext": ".ico", "mimes": ["image/x-icon", "image/vnd.microsoft.icon"] },
"qoi": { "ext": ".qoi", "mimes": ["image/qoi"] },
"tiff": { "ext": ".tiff", "mimes": ["image/tiff", "image/x-tiff"] },
"svg": { "ext": ".svg", "mimes": ["image/svg+xml"] }
},
"image_sequence": {
"apng": { "ext": ".apng", "mimes": ["image/apng"], "sequence": True },
"avifs": { "ext": ".avifs", "mimes": ["image/avif-sequence"], "sequence": True },
"heics": { "ext": ".heics", "mimes": ["image/heic-sequence"], "sequence": True },
"heifs": { "ext": ".heifs", "mimes": ["image/heif-sequence"], "sequence": True }
},
"video": {
"mp4": { "ext": ".mp4", "mimes": ["video/mp4", "audio/mp4"] },
"webm": { "ext": ".webm", "mimes": ["video/webm", "audio/webm"] },
"mov": { "ext": ".mov", "mimes": ["video/quicktime"] },
"ogv": { "ext": ".ogv", "mimes": ["video/ogg"] },
"mpeg": { "ext": ".mpeg", "mimes": ["video/mpeg"] },
"avi": { "ext": ".avi", "mimes": ["video/x-msvideo", "video/avi"] },
"flv": { "ext": ".flv", "mimes": ["video/x-flv"] },
"mkv": { "ext": ".mkv", "mimes": ["video/x-matroska", "application/x-matroska"], "audio_only_ext": ".mka" },
"wmv": { "ext": ".wmv", "mimes": ["video/x-ms-wmv"] },
"rv": { "ext": ".rv", "mimes": ["video/vnd.rn-realvideo"] }
},
"audio": {
"mp3": { "ext": ".mp3", "mimes": ["audio/mpeg", "audio/mp3"] },
"m4a": { "ext": ".m4a", "mimes": ["audio/mp4", "audio/x-m4a"] },
"ogg": { "ext": ".ogg", "mimes": ["audio/ogg"] },
"flac": { "ext": ".flac", "mimes": ["audio/flac"] },
"wav": { "ext": ".wav", "mimes": ["audio/wav", "audio/x-wav", "audio/vnd.wave"] },
"wma": { "ext": ".wma", "mimes": ["audio/x-ms-wma"] },
"tta": { "ext": ".tta", "mimes": ["audio/x-tta"] },
"wv": { "ext": ".wv", "mimes": ["audio/x-wavpack", "audio/wavpack"] },
"mka": { "ext": ".mka", "mimes": ["audio/x-matroska", "video/x-matroska"] }
},
"document": {
"pdf": { "ext": ".pdf", "mimes": ["application/pdf"] },
"epub": { "ext": ".epub", "mimes": ["application/epub+zip"] },
"djvu": { "ext": ".djvu", "mimes": ["application/vnd.djvu"] },
"rtf": { "ext": ".rtf", "mimes": ["application/rtf"] },
"docx": { "ext": ".docx", "mimes": ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"] },
"xlsx": { "ext": ".xlsx", "mimes": ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"] },
"pptx": { "ext": ".pptx", "mimes": ["application/vnd.openxmlformats-officedocument.presentationml.presentation"] },
"doc": { "ext": ".doc", "mimes": ["application/msword"] },
"xls": { "ext": ".xls", "mimes": ["application/vnd.ms-excel"] },
"ppt": { "ext": ".ppt", "mimes": ["application/vnd.ms-powerpoint"] }
},
"archive": {
"zip": { "ext": ".zip", "mimes": ["application/zip"] },
"7z": { "ext": ".7z", "mimes": ["application/x-7z-compressed"] },
"rar": { "ext": ".rar", "mimes": ["application/x-rar-compressed", "application/vnd.rar"] },
"gz": { "ext": ".gz", "mimes": ["application/gzip", "application/x-gzip"] },
"tar": { "ext": ".tar", "mimes": ["application/x-tar"] },
"cbz": { "ext": ".cbz", "mimes": ["application/zip"], "note": "zip archive of images; prefer extension-based detection for comics" }
},
"project": {
"clip": { "ext": ".clip", "mimes": ["application/clip"] },
"kra": { "ext": ".kra", "mimes": ["application/x-krita"] },
"procreate": { "ext": ".procreate", "mimes": ["application/x-procreate"] },
"psd": { "ext": ".psd", "mimes": ["image/vnd.adobe.photoshop"] },
"swf": { "ext": ".swf", "mimes": ["application/x-shockwave-flash"] }
},
"other": {
"octet-stream": { "ext": "", "mimes": ["application/octet-stream"] },
"json": { "ext": ".json", "mimes": ["application/json"] },
"xml": { "ext": ".xml", "mimes": ["application/xml", "text/xml"] },
"csv": { "ext": ".csv", "mimes": ["text/csv"] }
}
}
def get_type_from_ext(ext: str) -> str:
"""Determine the type (e.g., 'image', 'video', 'audio') from file extension.
Args:
ext: File extension (with or without leading dot, e.g., 'jpg' or '.jpg')
Returns:
Type string (e.g., 'image', 'video', 'audio') or 'other' if unknown
"""
if not ext:
return 'other'
ext_clean = ext.lstrip('.').lower()
    # The format keys in each mime_maps category double as canonical
    # extensions, so a plain key lookup suffices here.
    for type_name, extensions_dict in mime_maps.items():
if ext_clean in extensions_dict:
return type_name
return 'other'
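# Quick checks (illustrative): the format keys double as extensions, and the
# per-format fields drive extension/MIME handling elsewhere.
#
#   get_type_from_ext(".webp")                    # -> "image"
#   get_type_from_ext("mkv")                      # -> "video"
#   get_type_from_ext("xyz")                      # -> "other"
#   mime_maps["video"]["mkv"]["audio_only_ext"]   # -> ".mka"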

671
SYS/worker_manager.py Normal file
View File

@@ -0,0 +1,671 @@
"""Worker task management with persistent database storage.
Manages worker tasks for downloads, searches, imports, etc. with automatic
persistence to database and optional auto-refresh callbacks.
"""
import logging
from pathlib import Path
from typing import Optional, Dict, Any, List, Callable
from datetime import datetime
from threading import Thread, Lock
import time
from API.folder import API_folder_store
from SYS.logger import log
logger = logging.getLogger(__name__)
class Worker:
"""Represents a single worker task with state management."""
def __init__(self, worker_id: str, worker_type: str, title: str = "",
description: str = "", manager: Optional['WorkerManager'] = None):
"""Initialize a worker.
Args:
worker_id: Unique identifier for this worker
worker_type: Type of work (e.g., 'download', 'search', 'import')
title: Human-readable title
description: Detailed description
manager: Reference to parent WorkerManager for state updates
"""
self.id = worker_id
self.type = worker_type
self.title = title or worker_type
self.description = description
self.manager = manager
self.status = "running"
self.progress = ""
self.details = ""
self.error_message = ""
self.result = "pending"
self._stdout_buffer = []
self._steps_buffer = []
def log_step(self, step_text: str) -> None:
"""Log a step for this worker.
Args:
step_text: Text describing the step
"""
try:
if self.manager:
self.manager.log_step(self.id, step_text)
else:
logger.info(f"[{self.id}] {step_text}")
except Exception as e:
logger.error(f"Error logging step for worker {self.id}: {e}")
def append_stdout(self, text: str) -> None:
"""Append text to stdout log.
Args:
text: Text to append
"""
try:
if self.manager:
self.manager.append_worker_stdout(self.id, text)
else:
self._stdout_buffer.append(text)
except Exception as e:
logger.error(f"Error appending stdout for worker {self.id}: {e}")
def get_stdout(self) -> str:
"""Get all stdout for this worker.
Returns:
Complete stdout text
"""
try:
if self.manager:
return self.manager.get_stdout(self.id)
else:
return "\n".join(self._stdout_buffer)
except Exception as e:
logger.error(f"Error getting stdout for worker {self.id}: {e}")
return ""
def get_steps(self) -> str:
"""Get all steps for this worker.
Returns:
Complete steps text
"""
try:
if self.manager:
return self.manager.get_steps(self.id)
else:
return "\n".join(self._steps_buffer)
except Exception as e:
logger.error(f"Error getting steps for worker {self.id}: {e}")
return ""
def update_progress(self, progress: str = "", details: str = "") -> None:
"""Update worker progress.
Args:
progress: Progress string (e.g., "50%")
details: Additional details
"""
self.progress = progress
self.details = details
try:
if self.manager:
                # WorkerManager.update_worker expects numeric progress and a
                # `details` keyword; passing both positionally sent `details`
                # into `current_step` and a str into a float comparison.
                try:
                    numeric = float(str(progress).rstrip('%').strip() or 0)
                except ValueError:
                    numeric = 0.0
                self.manager.update_worker(self.id, progress=numeric, details=details)
except Exception as e:
logger.error(f"Error updating worker {self.id}: {e}")
def finish(self, result: str = "completed", message: str = "") -> None:
"""Mark worker as finished.
Args:
result: Result status ('completed', 'error', 'cancelled')
message: Result message/error details
"""
self.result = result
self.status = "finished"
self.error_message = message
try:
if self.manager:
# Flush and disable logging handler before marking finished
self.manager.disable_logging_for_worker(self.id)
# Then mark as finished in database
self.manager.finish_worker(self.id, result, message)
except Exception as e:
logger.error(f"Error finishing worker {self.id}: {e}")
class WorkerLoggingHandler(logging.StreamHandler):
"""Custom logging handler that captures logs for a worker."""
def __init__(self, worker_id: str, db: API_folder_store,
manager: Optional['WorkerManager'] = None,
buffer_size: int = 50):
"""Initialize the handler.
Args:
worker_id: ID of the worker to capture logs for
            db: API_folder_store used to persist captured logs
            manager: Optional parent WorkerManager; preferred write path when set
            buffer_size: Number of logs to buffer before flushing to DB
"""
super().__init__()
self.worker_id = worker_id
self.db = db
self.manager = manager
self.buffer_size = buffer_size
self.buffer = []
self._lock = Lock()
# Set a format that includes timestamp and level
formatter = logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
self.setFormatter(formatter)
def emit(self, record):
"""Emit a log record."""
try:
# Try to format the record normally
try:
msg = self.format(record)
except (TypeError, ValueError):
# If formatting fails (e.g., %d format with non-int arg),
# build message manually without calling getMessage()
try:
# Try to format with args if possible
if record.args:
msg = record.msg % record.args
else:
msg = record.msg
except (TypeError, ValueError):
# If that fails too, just use the raw message string
msg = str(record.msg)
                # Add timestamp and level to mirror the formatter's output;
                # `time` is already imported at module level
                timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(record.created))
msg = f"{timestamp} - {record.name} - {record.levelname} - {msg}"
with self._lock:
self.buffer.append(msg)
# Flush to DB when buffer reaches size
if len(self.buffer) >= self.buffer_size:
self._flush()
except Exception:
self.handleError(record)
def _flush(self):
"""Flush buffered logs to database."""
if self.buffer:
log_text = '\n'.join(self.buffer)
try:
if self.manager:
self.manager.append_worker_stdout(self.worker_id, log_text, channel='log')
else:
self.db.append_worker_stdout(self.worker_id, log_text, channel='log')
except Exception as e:
# If we can't write to DB, at least log it
log(f"Error flushing worker logs: {e}")
self.buffer = []
def flush(self):
"""Flush any buffered records."""
with self._lock:
self._flush()
super().flush()
def close(self):
"""Close the handler."""
self.flush()
super().close()
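# Capture sketch (illustrative): WorkerManager attaches one handler per worker
# to the root logger, so anything logged while the task runs is buffered and
# flushed to that worker's log channel until the handler is detached.
#
#   handler = manager.enable_logging_for_worker("dl_001")
#   logging.getLogger("downloader").info("starting transfer")
#   manager.disable_logging_for_worker("dl_001")  # flush, close, detach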
class WorkerManager:
"""Manages persistent worker tasks with auto-refresh capability."""
def __init__(self, library_root: Path, auto_refresh_interval: float = 2.0):
"""Initialize the worker manager.
Args:
library_root: Root directory for the local library database
auto_refresh_interval: Seconds between auto-refresh checks (0 = disabled)
"""
self.library_root = Path(library_root)
self.db = API_folder_store(library_root)
self.auto_refresh_interval = auto_refresh_interval
self.refresh_callbacks: List[Callable] = []
self.refresh_thread: Optional[Thread] = None
self._stop_refresh = False
self._lock = Lock()
self.worker_handlers: Dict[str, WorkerLoggingHandler] = {} # Track active handlers
self._worker_last_step: Dict[str, str] = {}
def __enter__(self):
"""Context manager entry."""
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit - close database."""
self.close()
def add_refresh_callback(self, callback: Callable[[List[Dict[str, Any]]], None]) -> None:
"""Register a callback to be called on worker updates.
Args:
callback: Function that receives list of active workers
"""
with self._lock:
self.refresh_callbacks.append(callback)
def expire_running_workers(
self,
older_than_seconds: int = 300,
worker_id_prefix: Optional[str] = None,
reason: Optional[str] = None,
status: str = "error",
) -> int:
"""Mark stale running workers as finished.
Args:
older_than_seconds: Idle threshold before expiring.
worker_id_prefix: Optional wildcard filter (e.g., 'cli_%').
reason: Error message if none already exists.
status: New status to apply.
Returns:
Count of workers updated.
"""
try:
return self.db.expire_running_workers(
older_than_seconds=older_than_seconds,
status=status,
reason=reason,
worker_id_prefix=worker_id_prefix,
)
except Exception as exc:
logger.error(f"Failed to expire stale workers: {exc}", exc_info=True)
return 0
def remove_refresh_callback(self, callback: Callable) -> None:
"""Remove a refresh callback.
Args:
callback: The callback function to remove
"""
with self._lock:
if callback in self.refresh_callbacks:
self.refresh_callbacks.remove(callback)
def enable_logging_for_worker(self, worker_id: str) -> Optional[WorkerLoggingHandler]:
"""Enable logging capture for a worker.
Creates a logging handler that captures all logs for this worker.
Args:
worker_id: ID of the worker to capture logs for
Returns:
The logging handler that was created, or None if there was an error
"""
try:
handler = WorkerLoggingHandler(worker_id, self.db, manager=self)
with self._lock:
self.worker_handlers[worker_id] = handler
# Add the handler to the root logger so it captures all logs
root_logger = logging.getLogger()
root_logger.addHandler(handler)
root_logger.setLevel(logging.DEBUG) # Capture all levels
logger.debug(f"[WorkerManager] Enabled logging for worker: {worker_id}")
return handler
except Exception as e:
logger.error(f"[WorkerManager] Error enabling logging for worker {worker_id}: {e}", exc_info=True)
return None
def disable_logging_for_worker(self, worker_id: str) -> None:
"""Disable logging capture for a worker and flush any pending logs.
Args:
worker_id: ID of the worker to stop capturing logs for
"""
try:
with self._lock:
handler = self.worker_handlers.pop(worker_id, None)
if handler:
# Flush and close the handler
handler.flush()
handler.close()
# Remove from root logger
root_logger = logging.getLogger()
root_logger.removeHandler(handler)
logger.debug(f"[WorkerManager] Disabled logging for worker: {worker_id}")
except Exception as e:
logger.error(f"[WorkerManager] Error disabling logging for worker {worker_id}: {e}", exc_info=True)
def track_worker(self, worker_id: str, worker_type: str, title: str = "",
description: str = "", total_steps: int = 0,
pipe: Optional[str] = None) -> bool:
"""Start tracking a new worker.
Args:
worker_id: Unique identifier for the worker
worker_type: Type of worker (e.g., 'download', 'search', 'import')
title: Worker title/name
description: Worker description
total_steps: Total number of steps for progress tracking
pipe: Text of the originating pipe/prompt, if any
Returns:
True if worker was inserted successfully
"""
try:
result = self.db.insert_worker(worker_id, worker_type, title, description, total_steps, pipe=pipe)
if result > 0:
logger.debug(f"[WorkerManager] Tracking worker: {worker_id} ({worker_type})")
self._start_refresh_if_needed()
return True
return False
except Exception as e:
logger.error(f"[WorkerManager] Error tracking worker: {e}", exc_info=True)
return False
def update_worker(self, worker_id: str, progress: float = 0.0, current_step: str = "",
details: str = "", error: str = "") -> bool:
"""Update worker progress and status.
Args:
worker_id: Unique identifier for the worker
progress: Progress percentage (0-100)
current_step: Current step description
details: Additional details
error: Error message if any
Returns:
True if update was successful
"""
try:
kwargs = {}
if progress > 0:
kwargs['progress'] = progress
if current_step:
kwargs['current_step'] = current_step
if details:
kwargs['description'] = details
if error:
kwargs['error_message'] = error
if kwargs:
kwargs['last_updated'] = datetime.now().isoformat()
if 'current_step' in kwargs and kwargs['current_step']:
self._worker_last_step[worker_id] = str(kwargs['current_step'])
return self.db.update_worker(worker_id, **kwargs)
return True
except Exception as e:
logger.error(f"[WorkerManager] Error updating worker {worker_id}: {e}", exc_info=True)
return False
def finish_worker(self, worker_id: str, result: str = "completed",
error_msg: str = "", result_data: str = "") -> bool:
"""Mark a worker as finished.
Args:
worker_id: Unique identifier for the worker
result: Result status ('completed', 'error', 'cancelled')
error_msg: Error message if any
result_data: Result data as JSON string
Returns:
True if update was successful
"""
try:
kwargs = {
'status': result,
'completed_at': datetime.now().isoformat()
}
if error_msg:
kwargs['error_message'] = error_msg
if result_data:
kwargs['result_data'] = result_data
success = self.db.update_worker(worker_id, **kwargs)
logger.info(f"[WorkerManager] Worker finished: {worker_id} ({result})")
self._worker_last_step.pop(worker_id, None)
return success
except Exception as e:
logger.error(f"[WorkerManager] Error finishing worker {worker_id}: {e}", exc_info=True)
return False
def get_active_workers(self) -> List[Dict[str, Any]]:
"""Get all active (running) workers.
Returns:
List of active worker dictionaries
"""
try:
return self.db.get_active_workers()
except Exception as e:
logger.error(f"[WorkerManager] Error getting active workers: {e}", exc_info=True)
return []
def get_finished_workers(self, limit: int = 100) -> List[Dict[str, Any]]:
"""Get all finished workers (completed, errored, or cancelled).
Args:
limit: Maximum number of workers to retrieve
Returns:
List of finished worker dictionaries
"""
try:
all_workers = self.db.get_all_workers(limit=limit)
# Filter to only finished workers
finished = [w for w in all_workers if w.get('status') in ['completed', 'error', 'cancelled']]
return finished
except Exception as e:
logger.error(f"[WorkerManager] Error getting finished workers: {e}", exc_info=True)
return []
def get_worker(self, worker_id: str) -> Optional[Dict[str, Any]]:
"""Get a specific worker's data.
Args:
worker_id: Unique identifier for the worker
Returns:
Worker data or None if not found
"""
try:
return self.db.get_worker(worker_id)
except Exception as e:
logger.error(f"[WorkerManager] Error getting worker {worker_id}: {e}", exc_info=True)
return None
def get_worker_events(self, worker_id: str, limit: int = 500) -> List[Dict[str, Any]]:
"""Fetch recorded worker timeline events."""
return self.db.get_worker_events(worker_id, limit)
def log_step(self, worker_id: str, step_text: str) -> bool:
"""Log a step to a worker's step history.
Args:
worker_id: Unique identifier for the worker
step_text: Step description to log
Returns:
True if successful
"""
try:
success = self.db.append_worker_steps(worker_id, step_text)
if success:
self._worker_last_step[worker_id] = step_text
return success
except Exception as e:
logger.error(f"[WorkerManager] Error logging step for worker {worker_id}: {e}", exc_info=True)
return False
def _get_last_step(self, worker_id: str) -> Optional[str]:
"""Return the most recent step description for a worker."""
return self._worker_last_step.get(worker_id)
def get_steps(self, worker_id: str) -> str:
"""Get step logs for a worker.
Args:
worker_id: Unique identifier for the worker
Returns:
Steps text or empty string if not found
"""
try:
return self.db.get_worker_steps(worker_id)
except Exception as e:
logger.error(f"[WorkerManager] Error getting steps for worker {worker_id}: {e}", exc_info=True)
return ''
def start_auto_refresh(self) -> None:
"""Start the auto-refresh thread for periodic worker updates."""
if self.auto_refresh_interval <= 0:
logger.debug("[WorkerManager] Auto-refresh disabled (interval <= 0)")
return
if self.refresh_thread and self.refresh_thread.is_alive():
logger.debug("[WorkerManager] Auto-refresh already running")
return
logger.info(f"[WorkerManager] Starting auto-refresh with {self.auto_refresh_interval}s interval")
self._stop_refresh = False
self.refresh_thread = Thread(target=self._auto_refresh_loop, daemon=True)
self.refresh_thread.start()
def stop_auto_refresh(self) -> None:
"""Stop the auto-refresh thread."""
logger.info("[WorkerManager] Stopping auto-refresh")
self._stop_refresh = True
if self.refresh_thread:
self.refresh_thread.join(timeout=5)
self.refresh_thread = None
def _start_refresh_if_needed(self) -> None:
"""Start auto-refresh if we have active workers and callbacks."""
active = self.get_active_workers()
if active and self.refresh_callbacks and not self._stop_refresh:
self.start_auto_refresh()
def _auto_refresh_loop(self) -> None:
"""Main auto-refresh loop that periodically queries and notifies."""
try:
while not self._stop_refresh:
time.sleep(self.auto_refresh_interval)
# Check if there are active workers
active = self.get_active_workers()
if not active:
# No more active workers, stop refreshing
logger.debug("[WorkerManager] No active workers, stopping auto-refresh")
break
                # Snapshot callbacks under the lock, then invoke them outside it
                # so a callback that adds/removes callbacks cannot deadlock here
                with self._lock:
                    callbacks = list(self.refresh_callbacks)
                for callback in callbacks:
                    try:
                        callback(active)
                    except Exception as e:
                        logger.error(f"[WorkerManager] Error in refresh callback: {e}", exc_info=True)
except Exception as e:
logger.error(f"[WorkerManager] Error in auto-refresh loop: {e}", exc_info=True)
finally:
logger.debug("[WorkerManager] Auto-refresh loop ended")
def cleanup_old_workers(self, days: int = 7) -> int:
"""Clean up completed/errored workers older than specified days.
Args:
days: Delete workers completed more than this many days ago
Returns:
Number of workers deleted
"""
try:
count = self.db.cleanup_old_workers(days)
if count > 0:
logger.info(f"[WorkerManager] Cleaned up {count} old workers")
return count
except Exception as e:
logger.error(f"[WorkerManager] Error cleaning up old workers: {e}", exc_info=True)
return 0
def append_stdout(self, worker_id: str, text: str, channel: str = "stdout") -> bool:
"""Append text to a worker's stdout log.
Args:
worker_id: Unique identifier for the worker
text: Text to append
channel: Logical channel (stdout, stderr, log, etc.)
Returns:
True if append was successful
"""
try:
step_label = self._get_last_step(worker_id)
return self.db.append_worker_stdout(worker_id, text, step=step_label, channel=channel)
except Exception as e:
logger.error(f"[WorkerManager] Error appending stdout: {e}", exc_info=True)
return False
def get_stdout(self, worker_id: str) -> str:
"""Get stdout logs for a worker.
Args:
worker_id: Unique identifier for the worker
Returns:
Worker's stdout or empty string
"""
try:
return self.db.get_worker_stdout(worker_id)
except Exception as e:
logger.error(f"[WorkerManager] Error getting stdout: {e}", exc_info=True)
return ""
def append_worker_stdout(self, worker_id: str, text: str, channel: str = "stdout") -> bool:
"""Compatibility wrapper for append_stdout."""
return self.append_stdout(worker_id, text, channel=channel)
def clear_stdout(self, worker_id: str) -> bool:
"""Clear stdout logs for a worker.
Args:
worker_id: Unique identifier for the worker
Returns:
True if clear was successful
"""
try:
return self.db.clear_worker_stdout(worker_id)
except Exception as e:
logger.error(f"[WorkerManager] Error clearing stdout: {e}", exc_info=True)
return False
    def close(self) -> None:
        """Stop auto-refresh and close the database connection."""
        self.stop_auto_refresh()
        if self.db:
            try:
                self.db.close()
            except Exception:
                pass
        logger.info("[WorkerManager] Closed")