"""Data models for the pipeline.""" import datetime import hashlib import json import os import shutil import sys import time from dataclasses import dataclass, field from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Protocol, TextIO from rich.console import Console from rich.progress import ( BarColumn, DownloadColumn, Progress, TaskID, TaskProgressColumn, TextColumn, TimeRemainingColumn, TransferSpeedColumn, ) @dataclass(slots=True) class PipeObject: """Unified pipeline object for tracking files, metadata, tag values, and relationships through the pipeline. This is the single source of truth for all result data in the pipeline. Uses the hash+store canonical pattern for file identification. Attributes: hash: SHA-256 hash of the file (canonical identifier) store: Storage backend name (e.g., 'default', 'hydrus', 'test', 'home') tag: List of extracted or assigned tag values title: Human-readable title if applicable source_url: URL where the object came from duration: Duration in seconds if applicable metadata: Full metadata dictionary from source warnings: Any warnings or issues encountered path: Path to the file if this object represents a file relationships: Relationship data (king/alt/related hashes) is_temp: If True, this is a temporary/intermediate artifact that may be cleaned up action: The cmdlet that created this object (format: 'cmdlet:cmdlet_name') parent_hash: Hash of the parent file in the pipeline chain (for tracking provenance/lineage) extra: Additional fields not covered above """ hash: str store: str provider: Optional[str] = None tag: List[str] = field(default_factory=list) title: Optional[str] = None url: Optional[str] = None source_url: Optional[str] = None duration: Optional[float] = None metadata: Dict[str, Any] = field(default_factory=dict) warnings: List[str] = field(default_factory=list) path: Optional[str] = None relationships: Dict[str, Any] = field(default_factory=dict) is_temp: bool = False action: Optional[str] = None parent_hash: Optional[str] = None extra: Dict[str, Any] = field(default_factory=dict) def add_relationship(self, rel_type: str, rel_hash: str) -> None: """Add a relationship hash. Args: rel_type: Relationship type ('king', 'alt', 'related') rel_hash: Hash to add to the relationship """ if rel_type not in self.relationships: self.relationships[rel_type] = [] if isinstance(self.relationships[rel_type], list): if rel_hash not in self.relationships[rel_type]: self.relationships[rel_type].append(rel_hash) else: # Single value (e.g., king), convert to that value self.relationships[rel_type] = rel_hash def get_relationships(self) -> Dict[str, Any]: """Get all relationships for this object.""" return self.relationships.copy() if self.relationships else {} def debug_table(self) -> None: """Rich-inspect the PipeObject when debug logging is enabled.""" try: from SYS.logger import is_debug_enabled, debug_inspect except Exception: return if not is_debug_enabled(): return # Prefer a stable, human-friendly title: # "1 - download-media", "2 - download-media", ... # The index is preserved when possible via `pipe_index` in the PipeObject's extra. idx = None try: if isinstance(self.extra, dict): idx = self.extra.get("pipe_index") except Exception: idx = None cmdlet_name = "PipeObject" try: import pipeline as ctx current = ctx.get_current_cmdlet_name("") if hasattr(ctx, "get_current_cmdlet_name") else "" if current: cmdlet_name = current else: action = str(self.action or "").strip() if action.lower().startswith("cmdlet:"): cmdlet_name = action.split(":", 1)[1].strip() or cmdlet_name elif action: cmdlet_name = action except Exception: cmdlet_name = "PipeObject" title_text = cmdlet_name try: if idx is not None and str(idx).strip(): title_text = f"{idx} - {cmdlet_name}" except Exception: title_text = cmdlet_name # Color the title (requested: yellow instead of Rich's default blue-ish title). debug_inspect(self, title=f"[yellow]{title_text}[/yellow]") def to_dict(self) -> Dict[str, Any]: """Serialize to dictionary, excluding None and empty values.""" data: Dict[str, Any] = { "hash": self.hash, "store": self.store, } if self.provider: data["provider"] = self.provider if self.tag: data["tag"] = self.tag if self.title: data["title"] = self.title if self.url: data["url"] = self.url if self.source_url: data["source_url"] = self.source_url if self.duration is not None: data["duration"] = self.duration if self.metadata: data["metadata"] = self.metadata if self.warnings: data["warnings"] = self.warnings if self.path: data["path"] = self.path if self.relationships: data["relationships"] = self.relationships if self.is_temp: data["is_temp"] = self.is_temp if self.action: data["action"] = self.action if self.parent_hash: data["parent_hash"] = self.parent_hash # Add extra fields data.update({k: v for k, v in self.extra.items() if v is not None}) return data class FileRelationshipTracker: """Track relationships between files for sidecar creation. Allows tagging files with their relationships to other files: - king: The primary/master version of a file - alt: Alternate versions of the same content - related: Related files (e.g., screenshots of a book) """ def __init__(self) -> None: self.relationships: Dict[str, Dict[str, Any]] = {} def register_king(self, file_path: str, file_hash: str) -> None: """Register a file as the king (primary) version.""" if file_path not in self.relationships: self.relationships[file_path] = {} self.relationships[file_path]["king"] = file_hash def add_alt(self, file_path: str, alt_hash: str) -> None: """Add an alternate version of a file.""" if file_path not in self.relationships: self.relationships[file_path] = {} if "alt" not in self.relationships[file_path]: self.relationships[file_path]["alt"] = [] if alt_hash not in self.relationships[file_path]["alt"]: self.relationships[file_path]["alt"].append(alt_hash) def add_related(self, file_path: str, related_hash: str) -> None: """Add a related file.""" if file_path not in self.relationships: self.relationships[file_path] = {} if "related" not in self.relationships[file_path]: self.relationships[file_path]["related"] = [] if related_hash not in self.relationships[file_path]["related"]: self.relationships[file_path]["related"].append(related_hash) def get_relationships(self, file_path: str) -> Optional[Dict[str, Any]]: """Get relationships for a file.""" return self.relationships.get(file_path) def link_files(self, primary_path: str, king_hash: str, *alt_paths: str) -> None: """Link files together with primary as king and others as alternates. Args: primary_path: Path to the primary file (will be marked as 'king') king_hash: Hash of the primary file alt_paths: Paths to alternate versions (will be marked as 'alt') """ self.register_king(primary_path, king_hash) for alt_path in alt_paths: try: alt_hash = _get_file_hash(alt_path) self.add_alt(primary_path, alt_hash) except Exception as e: import sys print(f"Error hashing {alt_path}: {e}", file=sys.stderr) def _get_file_hash(filepath: str) -> str: """Calculate SHA256 hash of a file.""" sha256_hash = hashlib.sha256() with open(filepath, "rb") as f: for byte_block in iter(lambda: f.read(4096), b""): sha256_hash.update(byte_block) return sha256_hash.hexdigest() # ============= Download Module Classes ============= class DownloadError(RuntimeError): """Raised when the download or Hydrus import fails.""" @dataclass(slots=True) class DownloadOptions: """Configuration for downloading media. Use the add-file cmdlet separately for Hydrus import. """ url: str mode: str # "audio" or "video" output_dir: Path cookies_path: Optional[Path] = None ytdl_format: Optional[str] = None extra_tags: Optional[List[str]] = None debug_log: Optional[Path] = None native_progress: bool = False clip_sections: Optional[str] = None playlist_items: Optional[str] = None # yt-dlp --playlist-items format (e.g., "1-3,5,8") no_playlist: bool = False # If True, pass --no-playlist to yt-dlp quiet: bool = False # If True, suppress all console output (progress, debug logs) embed_chapters: bool = False # If True, pass yt-dlp --embed-chapters / embedchapters write_sub: bool = False # If True, download subtitles (writesubtitles/writeautomaticsub) class SendFunc(Protocol): """Protocol for event sender function.""" def __call__(self, event: str, **payload: Any) -> None: ... @dataclass(slots=True) class DownloadMediaResult: """Result of a successful media download.""" path: Path info: Dict[str, Any] tag: List[str] source_url: Optional[str] hash_value: Optional[str] = None paths: Optional[List[Path]] = None # For multiple files (e.g., section downloads) @dataclass(slots=True) class DebugLogger: """Logs events to a JSON debug file for troubleshooting downloads.""" path: Path file: Optional[TextIO] = None session_started: bool = False def ensure_open(self) -> None: """Open the debug log file if not already open.""" if self.file is not None: return try: parent = self.path.parent if parent and not parent.exists(): parent.mkdir(parents=True, exist_ok=True) self.file = self.path.open("a", encoding="utf-8") except OSError as exc: # pragma: no cover - surfaces to stderr print(f"Failed to open debug log {self.path}: {exc}", file=sys.stderr) self.file = None return self._write_session_header() def _write_session_header(self) -> None: """Write session start marker to log.""" if self.session_started: return self.session_started = True self.write_record("session-start", {"pid": os.getpid(), "exe": sys.executable}) def write_raw(self, text: str) -> None: """Write raw text to debug log.""" self.ensure_open() if self.file is None: return self.file.write(text + "\n") self.file.flush() def write_record(self, event: str, payload: Optional[Dict[str, Any]] = None) -> None: """Write a structured event record to debug log.""" record = { "timestamp": datetime.datetime.utcnow().isoformat(timespec="seconds") + "Z", "event": event, "payload": payload, } self.write_raw(json.dumps(_sanitise_for_json(record), ensure_ascii=False)) def close(self) -> None: """Close the debug log file.""" if self.file is None: return try: self.file.close() finally: self.file = None def _sanitise_for_json(value: Any, *, max_depth: int = 8, _seen: Optional[set[int]] = None) -> Any: """Best-effort conversion to JSON-serialisable types without raising on cycles.""" import math from dataclasses import asdict, is_dataclass if value is None or isinstance(value, (str, bool)): return value if isinstance(value, (int, float)): if isinstance(value, float) and not math.isfinite(value): return repr(value) return value if isinstance(value, Path): return str(value) if isinstance(value, bytes): try: return value.decode() except Exception: return value.hex() if max_depth <= 0: return repr(value) if _seen is None: _seen = set() obj_id = id(value) if obj_id in _seen: return "" _seen.add(obj_id) try: if isinstance(value, dict): return { str(key): _sanitise_for_json(val, max_depth=max_depth - 1, _seen=_seen) for key, val in value.items() } if isinstance(value, (list, tuple, set)): iterable = value if not isinstance(value, set) else list(value) return [ _sanitise_for_json(item, max_depth=max_depth - 1, _seen=_seen) for item in iterable ] if is_dataclass(value) and not isinstance(value, type): return _sanitise_for_json(asdict(value), max_depth=max_depth - 1, _seen=_seen) finally: _seen.discard(obj_id) return repr(value) class ProgressBar: """Rich progress helper for byte-based transfers. Opinionated: requires `rich` and always renders via Rich. """ def __init__(self, width: Optional[int] = None): """Initialize progress bar with optional custom width.""" if width is None: width = shutil.get_terminal_size((80, 20))[0] self.width = max(40, width) # Minimum 40 chars for readability self._console: Optional[Console] = None self._progress: Optional[Progress] = None self._task_id: Optional[TaskID] = None def _ensure_started(self, *, label: str, total: Optional[int], file: Any = None) -> None: if self._progress is not None and self._task_id is not None: if total is not None and total > 0: self._progress.update(self._task_id, total=int(total)) return stream = file if file is not None else sys.stderr console = Console(file=stream) progress = Progress( TextColumn("[progress.description]{task.description}"), BarColumn(), TaskProgressColumn(), DownloadColumn(), TransferSpeedColumn(), TimeRemainingColumn(), console=console, transient=True, ) progress.start() task_total = int(total) if isinstance(total, int) and total > 0 else None task_id: TaskID = progress.add_task(str(label or "download"), total=task_total) self._console = console self._progress = progress self._task_id = task_id def update( self, *, downloaded: Optional[int], total: Optional[int], label: str = "download", file: Any = None, ) -> None: if downloaded is None and total is None: return self._ensure_started(label=label, total=total, file=file) if self._progress is None or self._task_id is None: return if total is not None and total > 0: self._progress.update(self._task_id, completed=int(downloaded or 0), total=int(total), refresh=True) else: self._progress.update(self._task_id, completed=int(downloaded or 0), refresh=True) def finish(self) -> None: if self._progress is None: return try: self._progress.stop() finally: self._console = None self._progress = None self._task_id = None def format_bytes(self, bytes_val: Optional[float]) -> str: """Format bytes to human-readable size. Args: bytes_val: Number of bytes or None. Returns: Formatted string (e.g., "123.4 MB", "1.2 GB"). """ if bytes_val is None or bytes_val <= 0: return "?.? B" for unit in ("B", "KB", "MB", "GB", "TB"): if bytes_val < 1024: return f"{bytes_val:.1f} {unit}" bytes_val /= 1024 return f"{bytes_val:.1f} PB" # NOTE: rich.Progress handles the visual formatting; format_bytes remains as a general utility. class ProgressFileReader: """File-like wrapper that prints a ProgressBar as bytes are read. Intended for uploads: pass this wrapper as the file object to httpx/requests. Progress is written to stderr (so pipelines remain clean). """ def __init__(self, fileobj: Any, *, total_bytes: Optional[int], label: str = "upload", min_interval_s: float = 0.25): self._f = fileobj self._total = int(total_bytes) if total_bytes not in (None, 0, "") else 0 self._label = str(label or "upload") self._min_interval_s = max(0.05, float(min_interval_s)) self._bar = ProgressBar() self._start = time.time() self._last = self._start self._read = 0 self._done = False def _render(self) -> None: if self._done: return if self._total <= 0: return now = time.time() if now - self._last < self._min_interval_s: return self._bar.update(downloaded=int(self._read), total=int(self._total), label=str(self._label or "upload"), file=sys.stderr) self._last = now def _finish(self) -> None: if self._done: return self._done = True self._bar.finish() def read(self, size: int = -1) -> Any: chunk = self._f.read(size) try: if chunk: self._read += len(chunk) self._render() else: # EOF self._finish() except Exception: pass return chunk def seek(self, offset: int, whence: int = 0) -> Any: out = self._f.seek(offset, whence) try: pos = int(self._f.tell()) if pos <= 0: self._read = 0 self._start = time.time() self._last = self._start else: self._read = pos except Exception: pass return out def tell(self) -> Any: return self._f.tell() def close(self) -> None: try: self._finish() except Exception: pass return self._f.close() def __getattr__(self, name: str) -> Any: return getattr(self._f, name) # ============================================================================ # PIPELINE EXECUTION CONTEXT # Consolidated from pipeline_context.py # ============================================================================ # Note: Pipeline functions and state variables moved to pipeline.py class PipelineStageContext: """Context information for the current pipeline stage.""" def __init__(self, stage_index: int, total_stages: int, worker_id: Optional[str] = None): self.stage_index = stage_index self.total_stages = total_stages self.is_last_stage = (stage_index == total_stages - 1) self.worker_id = worker_id self.emits: List[Any] = [] def emit(self, obj: Any) -> None: """Emit an object to the next pipeline stage.""" self.emits.append(obj) def get_current_command_text(self) -> str: """Get the current command text (for backward compatibility).""" # This is maintained for backward compatibility with old code # In a real implementation, this would come from the stage context return "" def __repr__(self) -> str: return f"PipelineStageContext(stage={self.stage_index}/{self.total_stages}, is_last={self.is_last_stage}, worker_id={self.worker_id})" # ============================================================================ # RESULT TABLE CLASSES # Consolidated from result_table.py # ============================================================================ @dataclass class InputOption: """Represents an interactive input option (cmdlet argument) in a table. Allows users to select options that translate to cmdlet arguments, enabling interactive configuration right from the result table. Example: # Create an option for location selection location_opt = InputOption( "location", type="enum", choices=["local", "hydrus", "0x0"], description="Download destination" ) # Use in result table table.add_input_option(location_opt) selected = table.select_option("location") # Returns user choice """ name: str """Option name (maps to cmdlet argument)""" type: str = "string" """Option type: 'string', 'enum', 'flag', 'integer'""" choices: List[str] = field(default_factory=list) """Valid choices for enum type""" default: Optional[str] = None """Default value if not specified""" description: str = "" """Description of what this option does""" validator: Optional[Callable[[str], bool]] = None """Optional validator function: takes value, returns True if valid""" def to_dict(self) -> Dict[str, Any]: """Convert to dictionary.""" return { "name": self.name, "type": self.type, "choices": self.choices if self.choices else None, "default": self.default, "description": self.description, } @dataclass class TUIResultCard: """Represents a result as a UI card with title, metadata, and actions. Used in hub-ui and TUI contexts to render individual search results as grouped components with visual structure. """ title: str subtitle: Optional[str] = None metadata: Optional[Dict[str, str]] = None media_kind: Optional[str] = None tag: Optional[List[str]] = None file_hash: Optional[str] = None file_size: Optional[str] = None duration: Optional[str] = None def __post_init__(self): """Initialize default values.""" if self.metadata is None: self.metadata = {} if self.tag is None: self.tag = [] @dataclass class ResultColumn: """Represents a single column in a result table.""" name: str value: str width: Optional[int] = None def __str__(self) -> str: """String representation of the column.""" return f"{self.name}: {self.value}" def to_dict(self) -> Dict[str, str]: """Convert to dictionary.""" return {"name": self.name, "value": self.value} @dataclass class ResultRow: """Represents a single row in a result table.""" columns: List[ResultColumn] = field(default_factory=list) def add_column(self, name: str, value: Any) -> None: """Add a column to this row.""" str_value = str(value) if value is not None else "" self.columns.append(ResultColumn(name, str_value)) def get_column(self, name: str) -> Optional[str]: """Get column value by name.""" for col in self.columns: if col.name.lower() == name.lower(): return col.value return None def to_dict(self) -> List[Dict[str, str]]: """Convert to list of column dicts.""" return [col.to_dict() for col in self.columns] def to_list(self) -> List[tuple[str, str]]: """Convert to list of (name, value) tuples.""" return [(col.name, col.value) for col in self.columns] def __str__(self) -> str: """String representation of the row.""" return " | ".join(str(col) for col in self.columns)