diff --git a/API/HydrusNetwork.py b/API/HydrusNetwork.py index 9829596..819f966 100644 --- a/API/HydrusNetwork.py +++ b/API/HydrusNetwork.py @@ -147,7 +147,11 @@ class HydrusNetwork: file_size = file_path.stat().st_size headers["Content-Type"] = spec.content_type or "application/octet-stream" - headers["Content-Length"] = str(file_size) + # Do not set Content-Length when streaming an iterator body. + # If the file size changes between stat() and read() (or the source is truncated), + # h11 will raise: "Too little data for declared Content-Length". + # Let httpx choose chunked transfer encoding for safety. + headers.pop("Content-Length", None) logger.debug(f"{self._log_prefix()} Uploading file {file_path.name} ({file_size} bytes)") diff --git a/CLI.py b/CLI.py index ffe7973..c3fdbb7 100644 --- a/CLI.py +++ b/CLI.py @@ -1245,25 +1245,17 @@ class PipelineExecutor: stages.append(current) return stages - def execute_tokens(self, tokens: List[str]) -> None: - from cmdlet import REGISTRY - import pipeline as ctx - + @staticmethod + def _try_clear_pipeline_stop(ctx: Any) -> None: try: - try: - if hasattr(ctx, "clear_pipeline_stop"): - ctx.clear_pipeline_stop() - except Exception: - pass - - stages = self._split_stages(tokens) - if not stages: - print("Invalid pipeline syntax\n") - return - - pending_tail = ctx.get_pending_pipeline_tail() if hasattr(ctx, "get_pending_pipeline_tail") else [] - pending_source = ctx.get_pending_pipeline_source() if hasattr(ctx, "get_pending_pipeline_source") else None + if hasattr(ctx, "clear_pipeline_stop"): + ctx.clear_pipeline_stop() + except Exception: + pass + @staticmethod + def _maybe_seed_current_stage_table(ctx: Any) -> None: + try: if hasattr(ctx, "get_current_stage_table") and not ctx.get_current_stage_table(): display_table = ctx.get_display_table() if hasattr(ctx, "get_display_table") else None if display_table: @@ -1272,188 +1264,512 @@ class PipelineExecutor: last_table = ctx.get_last_result_table() if hasattr(ctx, "get_last_result_table") else None if last_table: ctx.set_current_stage_table(last_table) + except Exception: + pass + @staticmethod + def _maybe_apply_pending_pipeline_tail(ctx: Any, stages: List[List[str]]) -> List[List[str]]: + try: + pending_tail = ctx.get_pending_pipeline_tail() if hasattr(ctx, "get_pending_pipeline_tail") else [] + pending_source = ctx.get_pending_pipeline_source() if hasattr(ctx, "get_pending_pipeline_source") else None + except Exception: + pending_tail = [] + pending_source = None + + try: current_source = ( - ctx.get_current_stage_table_source_command() if hasattr(ctx, "get_current_stage_table_source_command") else None + ctx.get_current_stage_table_source_command() + if hasattr(ctx, "get_current_stage_table_source_command") + else None ) + except Exception: + current_source = None + + try: effective_source = current_source or ( - ctx.get_last_result_table_source_command() if hasattr(ctx, "get_last_result_table_source_command") else None + ctx.get_last_result_table_source_command() + if hasattr(ctx, "get_last_result_table_source_command") + else None ) - selection_only = len(stages) == 1 and stages[0] and stages[0][0].startswith("@") - if pending_tail and selection_only: - if (pending_source is None) or (effective_source and pending_source == effective_source): - stages.extend(pending_tail) + except Exception: + effective_source = current_source + + selection_only = bool(len(stages) == 1 and stages[0] and stages[0][0].startswith("@")) + if pending_tail and selection_only: + if (pending_source is None) or 
(effective_source and pending_source == effective_source): + stages = list(stages) + list(pending_tail) + try: if hasattr(ctx, "clear_pending_pipeline_tail"): ctx.clear_pending_pipeline_tail() - elif hasattr(ctx, "clear_pending_pipeline_tail"): - ctx.clear_pending_pipeline_tail() - - config = self._config_loader.load() - if isinstance(config, dict): - # This executor is used by both the REPL and the `pipeline` subcommand. - # Quiet/background mode is helpful for detached/background runners, but - # it suppresses interactive UX (like the pipeline Live progress UI). - config["_quiet_background_output"] = bool(self._toolbar_output is None) - - def _resolve_items_for_selection(table_obj, items_list): - return items_list if items_list else [] - - def _maybe_run_class_selector(selected_items: list, *, stage_is_last: bool) -> bool: - if not stage_is_last: - return False - - candidates: list[str] = [] - seen: set[str] = set() - - def _add(value) -> None: - try: - text = str(value or "").strip().lower() - except Exception: - return - if not text or text in seen: - return - seen.add(text) - candidates.append(text) - - try: - current_table = ctx.get_current_stage_table() or ctx.get_last_result_table() - _add(current_table.table if current_table and hasattr(current_table, "table") else None) except Exception: pass - - for item in selected_items or []: - if isinstance(item, dict): - _add(item.get("provider")) - _add(item.get("store")) - _add(item.get("table")) - else: - _add(getattr(item, "provider", None)) - _add(getattr(item, "store", None)) - _add(getattr(item, "table", None)) - + else: try: - from ProviderCore.registry import get_provider, is_known_provider_name + if hasattr(ctx, "clear_pending_pipeline_tail"): + ctx.clear_pending_pipeline_tail() except Exception: - get_provider = None # type: ignore - is_known_provider_name = None # type: ignore + pass + return stages - if get_provider is not None: - for key in candidates: - try: - if is_known_provider_name is not None and (not is_known_provider_name(key)): - continue - except Exception: - # If the predicate fails for any reason, fall back to legacy behavior. - pass - try: - provider = get_provider(key, config) - except Exception: - continue - selector = getattr(provider, "selector", None) - if selector is None: - continue - try: - handled = bool(selector(selected_items, ctx=ctx, stage_is_last=True)) - except Exception as exc: - print(f"{key} selector failed: {exc}\n") - return True - if handled: - return True + def _apply_quiet_background_flag(self, config: Any) -> Any: + if isinstance(config, dict): + # This executor is used by both the REPL and the `pipeline` subcommand. + # Quiet/background mode is helpful for detached/background runners, but + # it suppresses interactive UX (like the pipeline Live progress UI). 
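+            # i.e. quiet mode is enabled exactly when no toolbar output callback was
+            # provided, which is the headless/background case described above.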
+ config["_quiet_background_output"] = bool(self._toolbar_output is None) + return config - store_keys: list[str] = [] - for item in selected_items or []: - if isinstance(item, dict): - v = item.get("store") - else: - v = getattr(item, "store", None) - name = str(v or "").strip() - if name: - store_keys.append(name) + @staticmethod + def _extract_first_stage_selection_tokens(stages: List[List[str]]) -> tuple[List[List[str]], List[int], bool, bool]: + first_stage_tokens = stages[0] if stages else [] + first_stage_selection_indices: List[int] = [] + first_stage_had_extra_args = False + first_stage_select_all = False - if store_keys: + if first_stage_tokens: + new_first_stage: List[str] = [] + for token in first_stage_tokens: + if token.startswith("@"): # selection + selection = SelectionSyntax.parse(token) + if selection is not None: + first_stage_selection_indices = sorted([i - 1 for i in selection]) + continue + if token == "@*": + first_stage_select_all = True + continue + new_first_stage.append(token) + + if new_first_stage: + stages = list(stages) + stages[0] = new_first_stage + if first_stage_selection_indices or first_stage_select_all: + first_stage_had_extra_args = True + elif first_stage_selection_indices or first_stage_select_all: + stages = list(stages) + stages.pop(0) + + return stages, first_stage_selection_indices, first_stage_had_extra_args, first_stage_select_all + + @staticmethod + def _apply_select_all_if_requested(ctx: Any, indices: List[int], select_all: bool) -> List[int]: + if not select_all: + return indices + try: + last_items = ctx.get_last_result_items() + except Exception: + last_items = None + if last_items: + return list(range(len(last_items))) + return indices + + @staticmethod + def _maybe_run_class_selector(ctx: Any, config: Any, selected_items: list, *, stage_is_last: bool) -> bool: + if not stage_is_last: + return False + + candidates: list[str] = [] + seen: set[str] = set() + + def _add(value) -> None: + try: + text = str(value or "").strip().lower() + except Exception: + return + if not text or text in seen: + return + seen.add(text) + candidates.append(text) + + try: + current_table = ctx.get_current_stage_table() or ctx.get_last_result_table() + _add(current_table.table if current_table and hasattr(current_table, "table") else None) + except Exception: + pass + + for item in selected_items or []: + if isinstance(item, dict): + _add(item.get("provider")) + _add(item.get("store")) + _add(item.get("table")) + else: + _add(getattr(item, "provider", None)) + _add(getattr(item, "store", None)) + _add(getattr(item, "table", None)) + + try: + from ProviderCore.registry import get_provider, is_known_provider_name + except Exception: + get_provider = None # type: ignore + is_known_provider_name = None # type: ignore + + if get_provider is not None: + for key in candidates: + try: + if is_known_provider_name is not None and (not is_known_provider_name(key)): + continue + except Exception: + # If the predicate fails for any reason, fall back to legacy behavior. 
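+                    # "Legacy behavior" here means still attempting get_provider(key, config)
+                    # below rather than skipping the candidate outright.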
+ pass + try: + provider = get_provider(key, config) + except Exception: + continue + selector = getattr(provider, "selector", None) + if selector is None: + continue + try: + handled = bool(selector(selected_items, ctx=ctx, stage_is_last=True)) + except Exception as exc: + print(f"{key} selector failed: {exc}\n") + return True + if handled: + return True + + store_keys: list[str] = [] + for item in selected_items or []: + if isinstance(item, dict): + v = item.get("store") + else: + v = getattr(item, "store", None) + name = str(v or "").strip() + if name: + store_keys.append(name) + + if store_keys: + try: + from Store.registry import Store as StoreRegistry + + store_registry = StoreRegistry(config, suppress_debug=True) + _backend_names = list(store_registry.list_backends() or []) + _backend_by_lower = {str(n).lower(): str(n) for n in _backend_names if str(n).strip()} + for name in store_keys: + resolved_name = name + if not store_registry.is_available(resolved_name): + resolved_name = _backend_by_lower.get(str(name).lower(), name) + if not store_registry.is_available(resolved_name): + continue + backend = store_registry[resolved_name] + selector = getattr(backend, "selector", None) + if selector is None: + continue + handled = bool(selector(selected_items, ctx=ctx, stage_is_last=True)) + if handled: + return True + except Exception: + pass + + return False + + def _maybe_enable_background_notifier(self, worker_manager: Any, config: Any, pipeline_session: Any) -> None: + if not (pipeline_session and worker_manager and isinstance(config, dict)): + return + + session_worker_ids = config.get("_session_worker_ids") + if not session_worker_ids: + return + + try: + output_fn = self._toolbar_output + quiet_mode = bool(config.get("_quiet_background_output")) + terminal_only = quiet_mode and not output_fn + kwargs: Dict[str, Any] = { + "session_worker_ids": session_worker_ids, + "only_terminal_updates": terminal_only, + "overlay_mode": bool(output_fn), + } + if output_fn: + kwargs["output"] = output_fn + ensure_background_notifier(worker_manager, **kwargs) + except Exception: + pass + + @staticmethod + def _get_raw_stage_texts(ctx: Any) -> List[str]: + raw_stage_texts: List[str] = [] + try: + if hasattr(ctx, "get_current_command_stages"): + raw_stage_texts = ctx.get_current_command_stages() or [] + except Exception: + raw_stage_texts = [] + return raw_stage_texts + + def _maybe_apply_initial_selection( + self, + ctx: Any, + config: Any, + stages: List[List[str]], + *, + selection_indices: List[int], + first_stage_had_extra_args: bool, + worker_manager: Any, + pipeline_session: Any, + ) -> tuple[bool, Any]: + if not selection_indices: + return True, None + + try: + if not ctx.get_current_stage_table_source_command(): + display_table = ctx.get_display_table() if hasattr(ctx, "get_display_table") else None + table_for_stage = display_table or ctx.get_last_result_table() + if table_for_stage: + ctx.set_current_stage_table(table_for_stage) + except Exception: + pass + + source_cmd = None + source_args_raw = None + try: + source_cmd = ctx.get_current_stage_table_source_command() + source_args_raw = ctx.get_current_stage_table_source_args() + except Exception: + source_cmd = None + source_args_raw = None + + if isinstance(source_args_raw, str): + source_args: List[str] = [source_args_raw] + elif isinstance(source_args_raw, list): + source_args = [str(x) for x in source_args_raw if x is not None] + else: + source_args = [] + + current_table = None + try: + current_table = ctx.get_current_stage_table() + 
except Exception: + current_table = None + table_type = current_table.table if current_table and hasattr(current_table, "table") else None + + command_expanded = False + + if table_type in {"youtube", "soulseek"}: + command_expanded = False + elif source_cmd == "search-file" and source_args and "youtube" in source_args: + command_expanded = False + else: + selected_row_args: List[str] = [] + skip_pipe_expansion = source_cmd == ".pipe" and len(stages) > 0 + if source_cmd and not skip_pipe_expansion: + for idx in selection_indices: + row_args = ctx.get_current_stage_table_row_selection_args(idx) + if row_args: + selected_row_args.extend(row_args) + break + + if selected_row_args: + if isinstance(source_cmd, list): + cmd_list: List[str] = [str(x) for x in source_cmd if x is not None] + elif isinstance(source_cmd, str): + cmd_list = [source_cmd] + else: + cmd_list = [] + + expanded_stage: List[str] = cmd_list + source_args + selected_row_args + + if first_stage_had_extra_args and stages: + expanded_stage += stages[0] + stages[0] = expanded_stage + else: + stages.insert(0, expanded_stage) + + if pipeline_session and worker_manager: try: - from Store.registry import Store as StoreRegistry - - store_registry = StoreRegistry(config, suppress_debug=True) - _backend_names = list(store_registry.list_backends() or []) - _backend_by_lower = {str(n).lower(): str(n) for n in _backend_names if str(n).strip()} - for name in store_keys: - resolved_name = name - if not store_registry.is_available(resolved_name): - resolved_name = _backend_by_lower.get(str(name).lower(), name) - if not store_registry.is_available(resolved_name): - continue - backend = store_registry[resolved_name] - selector = getattr(backend, "selector", None) - if selector is None: - continue - handled = bool(selector(selected_items, ctx=ctx, stage_is_last=True)) - if handled: - return True + worker_manager.log_step( + pipeline_session.worker_id, + f"@N expansion: {source_cmd} + {' '.join(str(x) for x in selected_row_args)}", + ) except Exception: pass - return False + selection_indices = [] + command_expanded = True - first_stage_tokens = stages[0] if stages else [] - first_stage_selection_indices: List[int] = [] - first_stage_had_extra_args = False - first_stage_select_all = False + if (not command_expanded) and selection_indices: + last_piped_items = None + try: + last_piped_items = ctx.get_last_result_items() + except Exception: + last_piped_items = None - if first_stage_tokens: - new_first_stage: List[str] = [] - for token in first_stage_tokens: - if token.startswith("@"): # selection - selection = SelectionSyntax.parse(token) - if selection is not None: - first_stage_selection_indices = sorted([i - 1 for i in selection]) - continue - if token == "@*": - first_stage_select_all = True - continue - new_first_stage.append(token) + stage_table = None + try: + stage_table = ctx.get_current_stage_table() + except Exception: + stage_table = None + if not stage_table and hasattr(ctx, "get_display_table"): + try: + stage_table = ctx.get_display_table() + except Exception: + stage_table = None + if not stage_table: + try: + stage_table = ctx.get_last_result_table() + except Exception: + stage_table = None - if new_first_stage: - stages[0] = new_first_stage - if first_stage_selection_indices or first_stage_select_all: - first_stage_had_extra_args = True - elif first_stage_selection_indices or first_stage_select_all: - stages.pop(0) + resolved_items = last_piped_items if last_piped_items else [] + if last_piped_items: + filtered = 
[resolved_items[i] for i in selection_indices if 0 <= i < len(resolved_items)] + if not filtered: + print("No items matched selection in pipeline\n") + return False, None - if first_stage_select_all: - last_items = ctx.get_last_result_items() - if last_items: - first_stage_selection_indices = list(range(len(last_items))) + if PipelineExecutor._maybe_run_class_selector(ctx, config, filtered, stage_is_last=(not stages)): + return False, None + + from cmdlet._shared import coerce_to_pipe_object + + filtered_pipe_objs = [coerce_to_pipe_object(item) for item in filtered] + piped_result = filtered_pipe_objs if len(filtered_pipe_objs) > 1 else filtered_pipe_objs[0] + + if pipeline_session and worker_manager: + try: + selection_parts = [f"@{i+1}" for i in selection_indices] + worker_manager.log_step( + pipeline_session.worker_id, + f"Applied @N selection {' | '.join(selection_parts)}", + ) + except Exception: + pass + + # Auto-insert downloader stages for provider tables. + try: + current_table = ctx.get_current_stage_table() or ctx.get_last_result_table() + except Exception: + current_table = None + table_type = current_table.table if current_table and hasattr(current_table, "table") else None + + if not stages: + if table_type == "youtube": + print("Auto-running YouTube selection via download-media") + stages.append(["download-media"]) + elif table_type == "bandcamp": + print("Auto-running Bandcamp selection via download-media") + stages.append(["download-media"]) + elif table_type in {"soulseek", "openlibrary", "libgen"}: + print("Auto-piping selection to download-file") + stages.append(["download-file"]) + else: + first_cmd = stages[0][0] if stages and stages[0] else None + if table_type == "soulseek" and first_cmd not in ( + "download-file", + "download-media", + "download_media", + ".pipe", + ): + debug("Auto-inserting download-file after Soulseek selection") + stages.insert(0, ["download-file"]) + if table_type == "youtube" and first_cmd not in ( + "download-media", + "download_media", + "download-file", + ".pipe", + ): + debug("Auto-inserting download-media after YouTube selection") + stages.insert(0, ["download-media"]) + if table_type == "bandcamp" and first_cmd not in ( + "download-media", + "download_media", + "download-file", + ".pipe", + ): + print("Auto-inserting download-media after Bandcamp selection") + stages.insert(0, ["download-media"]) + if table_type == "libgen" and first_cmd not in ( + "download-file", + "download-media", + "download_media", + ".pipe", + ): + print("Auto-inserting download-file after Libgen selection") + stages.insert(0, ["download-file"]) + + return True, piped_result + else: + print("No previous results to select from\n") + return False, None + + return True, None + + @staticmethod + def _maybe_start_live_progress(config: Any, stages: List[List[str]]) -> tuple[Any, Dict[int, int]]: + progress_ui = None + pipe_index_by_stage: Dict[int, int] = {} + + try: + quiet_mode = bool(config.get("_quiet_background_output")) if isinstance(config, dict) else False + except Exception: + quiet_mode = False + + try: + import sys as _sys + + if (not quiet_mode) and bool(getattr(_sys.stderr, "isatty", lambda: False)()): + from models import PipelineLiveProgress + + pipe_stage_indices: List[int] = [] + pipe_labels: List[str] = [] + for idx, stage_tokens in enumerate(stages): + if not stage_tokens: + continue + name = str(stage_tokens[0]).replace("_", "-").lower() + if name == "@" or name.startswith("@"): + continue + # `.pipe` (MPV) is an interactive launcher; disable 
pipeline Live progress + # for it because it doesn't meaningfully "complete" (mpv may keep running) + # and Live output interferes with MPV playlist UI. + if name == ".pipe": + continue + # `.matrix` uses a two-phase picker (@N then .matrix -send). Pipeline Live + # progress can linger across those phases and interfere with interactive output. + if name == ".matrix": + continue + pipe_stage_indices.append(idx) + pipe_labels.append(name) + + if pipe_labels: + progress_ui = PipelineLiveProgress(pipe_labels, enabled=True) + progress_ui.start() + try: + import pipeline as _pipeline_ctx + if hasattr(_pipeline_ctx, "set_live_progress"): + _pipeline_ctx.set_live_progress(progress_ui) + except Exception: + pass + pipe_index_by_stage = {stage_idx: pipe_idx for pipe_idx, stage_idx in enumerate(pipe_stage_indices)} + except Exception: + progress_ui = None + pipe_index_by_stage = {} + + return progress_ui, pipe_index_by_stage + + def execute_tokens(self, tokens: List[str]) -> None: + from cmdlet import REGISTRY + import pipeline as ctx + + try: + self._try_clear_pipeline_stop(ctx) + stages = self._split_stages(tokens) + if not stages: + print("Invalid pipeline syntax\n") + return + self._maybe_seed_current_stage_table(ctx) + stages = self._maybe_apply_pending_pipeline_tail(ctx, stages) + config = self._config_loader.load() + config = self._apply_quiet_background_flag(config) + + stages, first_stage_selection_indices, first_stage_had_extra_args, first_stage_select_all = ( + self._extract_first_stage_selection_tokens(stages) + ) + first_stage_selection_indices = self._apply_select_all_if_requested( + ctx, first_stage_selection_indices, first_stage_select_all + ) piped_result: Any = None worker_manager = WorkerManagerRegistry.ensure(config) pipeline_text = " | ".join(" ".join(stage) for stage in stages) pipeline_session = WorkerStages.begin_pipeline(worker_manager, pipeline_text=pipeline_text, config=config) - - raw_stage_texts: List[str] = [] - try: - if hasattr(ctx, "get_current_command_stages"): - raw_stage_texts = ctx.get_current_command_stages() or [] - except Exception: - raw_stage_texts = [] - - if pipeline_session and worker_manager and isinstance(config, dict): - session_worker_ids = config.get("_session_worker_ids") - if session_worker_ids: - try: - output_fn = self._toolbar_output - quiet_mode = bool(config.get("_quiet_background_output")) - terminal_only = quiet_mode and not output_fn - kwargs: Dict[str, Any] = { - "session_worker_ids": session_worker_ids, - "only_terminal_updates": terminal_only, - "overlay_mode": bool(output_fn), - } - if output_fn: - kwargs["output"] = output_fn - ensure_background_notifier(worker_manager, **kwargs) - except Exception: - pass + raw_stage_texts = self._get_raw_stage_texts(ctx) + self._maybe_enable_background_notifier(worker_manager, config, pipeline_session) pipeline_status = "completed" pipeline_error = "" @@ -1462,201 +1778,24 @@ class PipelineExecutor: pipe_index_by_stage: Dict[int, int] = {} try: - if first_stage_selection_indices: - if not ctx.get_current_stage_table_source_command(): - display_table = ctx.get_display_table() if hasattr(ctx, "get_display_table") else None - table_for_stage = display_table or ctx.get_last_result_table() - if table_for_stage: - ctx.set_current_stage_table(table_for_stage) - - source_cmd = ctx.get_current_stage_table_source_command() - source_args_raw = ctx.get_current_stage_table_source_args() - if isinstance(source_args_raw, str): - source_args: List[str] = [source_args_raw] - elif isinstance(source_args_raw, list): - 
source_args = [str(x) for x in source_args_raw if x is not None] - else: - source_args = [] - - current_table = ctx.get_current_stage_table() - table_type = current_table.table if current_table and hasattr(current_table, "table") else None - - command_expanded = False - - if table_type in {"youtube", "soulseek"}: - command_expanded = False - elif source_cmd == "search-file" and source_args and "youtube" in source_args: - command_expanded = False - else: - selected_row_args: List[str] = [] - skip_pipe_expansion = source_cmd == ".pipe" and len(stages) > 0 - if source_cmd and not skip_pipe_expansion: - for idx in first_stage_selection_indices: - row_args = ctx.get_current_stage_table_row_selection_args(idx) - if row_args: - selected_row_args.extend(row_args) - break - - if selected_row_args: - if isinstance(source_cmd, list): - cmd_list: List[str] = [str(x) for x in source_cmd if x is not None] - elif isinstance(source_cmd, str): - cmd_list = [source_cmd] - else: - cmd_list = [] - - expanded_stage: List[str] = cmd_list + source_args + selected_row_args - - if first_stage_had_extra_args and stages: - expanded_stage += stages[0] - stages[0] = expanded_stage - else: - stages.insert(0, expanded_stage) - - if pipeline_session and worker_manager: - try: - worker_manager.log_step( - pipeline_session.worker_id, - f"@N expansion: {source_cmd} + {' '.join(str(x) for x in selected_row_args)}", - ) - except Exception: - pass - - first_stage_selection_indices = [] - command_expanded = True - - if not command_expanded and first_stage_selection_indices: - last_piped_items = ctx.get_last_result_items() - stage_table = ctx.get_current_stage_table() - if not stage_table and hasattr(ctx, "get_display_table"): - stage_table = ctx.get_display_table() - if not stage_table: - stage_table = ctx.get_last_result_table() - - resolved_items = _resolve_items_for_selection(stage_table, last_piped_items) - if last_piped_items: - filtered = [ - resolved_items[i] - for i in first_stage_selection_indices - if 0 <= i < len(resolved_items) - ] - if not filtered: - print("No items matched selection in pipeline\n") - return - - if _maybe_run_class_selector(filtered, stage_is_last=(not stages)): - return - - from cmdlet._shared import coerce_to_pipe_object - - filtered_pipe_objs = [coerce_to_pipe_object(item) for item in filtered] - piped_result = filtered_pipe_objs if len(filtered_pipe_objs) > 1 else filtered_pipe_objs[0] - - if pipeline_session and worker_manager: - try: - selection_parts = [f"@{i+1}" for i in first_stage_selection_indices] - worker_manager.log_step( - pipeline_session.worker_id, - f"Applied @N selection {' | '.join(selection_parts)}", - ) - except Exception: - pass - - # Auto-insert downloader stages for provider tables. 
- current_table = ctx.get_current_stage_table() or ctx.get_last_result_table() - table_type = current_table.table if current_table and hasattr(current_table, "table") else None - - if not stages: - if table_type == "youtube": - print("Auto-running YouTube selection via download-media") - stages.append(["download-media"]) - elif table_type == "bandcamp": - print("Auto-running Bandcamp selection via download-media") - stages.append(["download-media"]) - elif table_type in {"soulseek", "openlibrary", "libgen"}: - print("Auto-piping selection to download-file") - stages.append(["download-file"]) - else: - first_cmd = stages[0][0] if stages and stages[0] else None - if table_type == "soulseek" and first_cmd not in ( - "download-file", - "download-media", - "download_media", - ".pipe", - ): - debug("Auto-inserting download-file after Soulseek selection") - stages.insert(0, ["download-file"]) - if table_type == "youtube" and first_cmd not in ( - "download-media", - "download_media", - "download-file", - ".pipe", - ): - debug("Auto-inserting download-media after YouTube selection") - stages.insert(0, ["download-media"]) - if table_type == "bandcamp" and first_cmd not in ( - "download-media", - "download_media", - "download-file", - ".pipe", - ): - print("Auto-inserting download-media after Bandcamp selection") - stages.insert(0, ["download-media"]) - if table_type == "libgen" and first_cmd not in ( - "download-file", - "download-media", - "download_media", - ".pipe", - ): - print("Auto-inserting download-file after Libgen selection") - stages.insert(0, ["download-file"]) - else: - print("No previous results to select from\n") - return + ok, initial_piped = self._maybe_apply_initial_selection( + ctx, + config, + stages, + selection_indices=first_stage_selection_indices, + first_stage_had_extra_args=first_stage_had_extra_args, + worker_manager=worker_manager, + pipeline_session=pipeline_session, + ) + if not ok: + return + if initial_piped is not None: + piped_result = initial_piped # ------------------------------------------------------------------ # Multi-level pipeline progress (pipes = stages, tasks = items) # ------------------------------------------------------------------ - try: - quiet_mode = bool(config.get("_quiet_background_output")) if isinstance(config, dict) else False - except Exception: - quiet_mode = False - - try: - import sys as _sys - - if (not quiet_mode) and bool(getattr(_sys.stderr, "isatty", lambda: False)()): - from models import PipelineLiveProgress - - pipe_stage_indices: List[int] = [] - pipe_labels: List[str] = [] - for idx, tokens in enumerate(stages): - if not tokens: - continue - name = str(tokens[0]).replace("_", "-").lower() - if name == "@" or name.startswith("@"): - continue - # `.pipe` (MPV) is an interactive launcher; disable pipeline Live progress - # for it because it doesn't meaningfully "complete" (mpv may keep running) - # and Live output interferes with MPV playlist UI. 
- if name == ".pipe": - continue - pipe_stage_indices.append(idx) - pipe_labels.append(name) - - if pipe_labels: - progress_ui = PipelineLiveProgress(pipe_labels, enabled=True) - progress_ui.start() - try: - import pipeline as _pipeline_ctx - if hasattr(_pipeline_ctx, "set_live_progress"): - _pipeline_ctx.set_live_progress(progress_ui) - except Exception: - pass - pipe_index_by_stage = {stage_idx: pipe_idx for pipe_idx, stage_idx in enumerate(pipe_stage_indices)} - except Exception: - progress_ui = None - pipe_index_by_stage = {} + progress_ui, pipe_index_by_stage = self._maybe_start_live_progress(config, stages) for stage_index, stage_tokens in enumerate(stages): if not stage_tokens: @@ -1707,7 +1846,7 @@ class PipelineExecutor: if not stage_table: stage_table = ctx.get_last_result_table() items_list = ctx.get_last_result_items() or [] - resolved_items = _resolve_items_for_selection(stage_table, items_list) + resolved_items = items_list if items_list else [] filtered = [resolved_items[i] for i in selected_indices if 0 <= i < len(resolved_items)] if not filtered: print("No items matched selection\n") @@ -1715,7 +1854,7 @@ class PipelineExecutor: pipeline_error = "Empty selection" return - if _maybe_run_class_selector(filtered, stage_is_last=(stage_index + 1 >= len(stages))): + if PipelineExecutor._maybe_run_class_selector(ctx, config, filtered, stage_is_last=(stage_index + 1 >= len(stages))): return # Special case: selecting multiple tags from get-tag and piping into delete-tag @@ -1841,9 +1980,11 @@ class PipelineExecutor: on_emit = None if progress_ui is not None and pipe_idx is not None: - def _on_emit(obj: Any, _idx: int = int(pipe_idx)) -> None: + _ui = cast(Any, progress_ui) + + def _on_emit(obj: Any, _idx: int = int(pipe_idx), _progress=_ui) -> None: try: - progress_ui.on_emit(_idx, obj) + _progress.on_emit(_idx, obj) except Exception: pass on_emit = _on_emit diff --git a/Provider/libgen.py b/Provider/libgen.py index e6467da..214f51a 100644 --- a/Provider/libgen.py +++ b/Provider/libgen.py @@ -23,6 +23,15 @@ except ImportError: class Libgen(Provider): + # Domains that should be routed to this provider when the user supplies a URL. + # (Used by ProviderCore.registry.match_provider_name_for_url) + URL_DOMAINS = ( + "libgen.gl", + "libgen.li", + "libgen.is", + "libgen.rs", + "libgen.st", + ) """Search provider for Library Genesis books.""" def search( diff --git a/Provider/metadata_provider.py b/Provider/metadata_provider.py index 00052ac..ce995d4 100644 --- a/Provider/metadata_provider.py +++ b/Provider/metadata_provider.py @@ -1,9 +1,11 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import Any, Dict, List, Optional, Type +from typing import Any, Dict, List, Optional, Type, cast import requests import sys +import json +import subprocess from SYS.logger import log, debug @@ -13,6 +15,12 @@ except ImportError: # pragma: no cover - optional musicbrainzngs = None +try: # Optional dependency + import yt_dlp # type: ignore +except ImportError: # pragma: no cover - optional + yt_dlp = None + + class MetadataProvider(ABC): """Base class for metadata providers (music, movies, books, etc.).""" @@ -351,6 +359,157 @@ class MusicBrainzMetadataProvider(MetadataProvider): return tags +class YtdlpMetadataProvider(MetadataProvider): + """Metadata provider that extracts tags from a supported URL using yt-dlp. + + This does NOT download media; it only probes metadata. 
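+
+    Extraction prefers the optional yt_dlp Python module when it is importable and
+    falls back to the yt-dlp CLI (`-J --skip-download`); both paths cap playlists to
+    the first 10 entries via `playlist_items`.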
+ """ + + @property + def name(self) -> str: # type: ignore[override] + return "ytdlp" + + def _extract_info(self, url: str) -> Optional[Dict[str, Any]]: + url = (url or "").strip() + if not url: + return None + + # Prefer Python module when available. + if yt_dlp is not None: + try: + opts: Any = { + "quiet": True, + "no_warnings": True, + "skip_download": True, + "noprogress": True, + "socket_timeout": 15, + "retries": 1, + "playlist_items": "1-10", + } + with yt_dlp.YoutubeDL(opts) as ydl: # type: ignore[attr-defined] + info = ydl.extract_info(url, download=False) + return cast(Dict[str, Any], info) if isinstance(info, dict) else None + except Exception: + pass + + # Fallback to CLI. + try: + cmd = [ + "yt-dlp", + "-J", + "--no-warnings", + "--skip-download", + "--playlist-items", + "1-10", + url, + ] + proc = subprocess.run(cmd, capture_output=True, text=True, timeout=30) + if proc.returncode != 0: + return None + payload = (proc.stdout or "").strip() + if not payload: + return None + data = json.loads(payload) + return data if isinstance(data, dict) else None + except Exception: + return None + + def search(self, query: str, limit: int = 10) -> List[Dict[str, Any]]: + url = (query or "").strip() + if not url.startswith(("http://", "https://")): + return [] + + info = self._extract_info(url) + if not isinstance(info, dict): + return [] + + upload_date = str(info.get("upload_date") or "") + release_date = str(info.get("release_date") or "") + year = (release_date or upload_date)[:4] if (release_date or upload_date) else "" + + # Provide basic columns for the standard metadata selection table. + # NOTE: This is best-effort; many extractors don't provide artist/album. + artist = ( + info.get("artist") + or info.get("uploader") + or info.get("channel") + or "" + ) + album = info.get("album") or info.get("playlist_title") or "" + title = info.get("title") or "" + + return [ + { + "title": title, + "artist": str(artist or ""), + "album": str(album or ""), + "year": str(year or ""), + "provider": self.name, + "url": url, + "raw": info, + } + ] + + def to_tags(self, item: Dict[str, Any]) -> List[str]: + raw = item.get("raw") + if not isinstance(raw, dict): + return super().to_tags(item) + + tags: List[str] = [] + try: + from metadata import extract_ytdlp_tags + except Exception: + extract_ytdlp_tags = None # type: ignore[assignment] + + if extract_ytdlp_tags: + try: + tags.extend(extract_ytdlp_tags(raw)) + except Exception: + pass + + # Subtitle availability tags + def _langs(value: Any) -> List[str]: + if not isinstance(value, dict): + return [] + out: List[str] = [] + for k in value.keys(): + if isinstance(k, str) and k.strip(): + out.append(k.strip().lower()) + return sorted(set(out)) + + # If this is a playlist container, subtitle/captions are usually per-entry. + info_for_subs: Dict[str, Any] = raw + entries = raw.get("entries") + if isinstance(entries, list) and entries: + first = entries[0] + if isinstance(first, dict): + info_for_subs = first + + for lang in _langs(info_for_subs.get("subtitles")): + tags.append(f"subs:{lang}") + for lang in _langs(info_for_subs.get("automatic_captions")): + tags.append(f"subs_auto:{lang}") + + # Always include source tag for parity with other providers. + tags.append(f"source:{self.name}") + + # Dedup case-insensitively, preserve order. 
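+        # e.g. "Subs:EN" and "subs:en" collapse to one tag; the first spelling seen wins.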
+ seen = set() + out: List[str] = [] + for t in tags: + if not isinstance(t, str): + continue + s = t.strip() + if not s: + continue + k = s.lower() + if k in seen: + continue + seen.add(k) + out.append(s) + return out + + # Registry --------------------------------------------------------------- _METADATA_PROVIDERS: Dict[str, Type[MetadataProvider]] = { @@ -359,6 +518,7 @@ _METADATA_PROVIDERS: Dict[str, Type[MetadataProvider]] = { "googlebooks": GoogleBooksMetadataProvider, "google": GoogleBooksMetadataProvider, "musicbrainz": MusicBrainzMetadataProvider, + "ytdlp": YtdlpMetadataProvider, } @@ -370,7 +530,7 @@ def list_metadata_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str availability: Dict[str, bool] = {} for name, cls in _METADATA_PROVIDERS.items(): try: - provider = cls(config) + _ = cls(config) # Basic availability check: perform lightweight validation if defined availability[name] = True except Exception: diff --git a/Provider/openlibrary.py b/Provider/openlibrary.py index e279e0e..c2202c5 100644 --- a/Provider/openlibrary.py +++ b/Provider/openlibrary.py @@ -11,7 +11,8 @@ import sys import tempfile import time from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple +from urllib.parse import urlparse import requests @@ -183,7 +184,44 @@ def _resolve_archive_id(session: requests.Session, edition_id: str, ia_candidate return "" +def _archive_id_from_url(url: str) -> str: + """Best-effort extraction of an Archive.org item identifier from a URL.""" + + u = str(url or "").strip() + if not u: + return "" + try: + p = urlparse(u) + host = (p.hostname or "").lower().strip() + if not host.endswith("archive.org"): + return "" + parts = [x for x in (p.path or "").split("/") if x] + except Exception: + return "" + + # Common patterns: + # - /details//... + # - /borrow/ + # - /download//... + if len(parts) >= 2 and parts[0].lower() in {"details", "borrow", "download", "stream"}: + return str(parts[1]).strip() + + # Sometimes the identifier is the first segment. + if len(parts) >= 1: + first = str(parts[0]).strip() + if first and first.lower() not in {"account", "services", "search", "advancedsearch.php"}: + return first + + return "" + + class OpenLibrary(Provider): + # Domains that should be routed to this provider when the user supplies a URL. + # (Used by ProviderCore.registry.match_provider_name_for_url) + URL_DOMAINS = ( + "openlibrary.org", + "archive.org", + ) """Search provider for OpenLibrary books + Archive.org direct/borrow download.""" def __init__(self, config: Optional[Dict[str, Any]] = None): @@ -311,6 +349,60 @@ class OpenLibrary(Provider): pass raise RuntimeError("Something went wrong when trying to return the book") + @staticmethod + def _archive_logout(session: requests.Session) -> None: + """Best-effort logout from archive.org. + + Archive sessions are cookie-based; returning the loan is the critical step. + Logout is attempted for cleanliness but failures should not abort the workflow. + """ + + if session is None: + return + for url in ( + "https://archive.org/account/logout", + "https://archive.org/account/logout.php", + ): + try: + resp = session.get(url, timeout=15, allow_redirects=True) + code = int(getattr(resp, "status_code", 0) or 0) + if code and code < 500: + return + except Exception: + continue + + @staticmethod + def _archive_is_lendable(book_id: str) -> tuple[bool, str]: + """Heuristic lendable check using Archive.org item metadata. 
+ + Some lendable items do not map cleanly to an OpenLibrary edition id. + In practice, Archive metadata collections often include markers like: + - inlibrary + - printdisabled + """ + + ident = str(book_id or "").strip() + if not ident: + return False, "no-archive-id" + try: + resp = requests.get(f"https://archive.org/metadata/{ident}", timeout=8) + resp.raise_for_status() + data = resp.json() if resp is not None else {} + meta = data.get("metadata", {}) if isinstance(data, dict) else {} + collection = meta.get("collection") if isinstance(meta, dict) else None + + values: List[str] = [] + if isinstance(collection, list): + values = [str(x).strip().lower() for x in collection if str(x).strip()] + elif isinstance(collection, str): + values = [collection.strip().lower()] + + if any(v in {"inlibrary", "printdisabled", "lendinglibrary"} for v in values): + return True, "archive-collection" + return False, "archive-not-lendable" + except Exception: + return False, "archive-metadata-error" + @staticmethod def _archive_get_book_infos(session: requests.Session, url: str) -> Tuple[str, List[str], Dict[str, Any]]: """Extract page links from Archive.org book reader.""" @@ -430,6 +522,7 @@ class OpenLibrary(Provider): links: List[str], scale: int, book_id: str, + progress_callback: Optional[Callable[[int, int], None]] = None, ) -> List[str]: links_scaled = [f"{link}&rotate=0&scale={scale}" for link in links] pages = len(links_scaled) @@ -448,7 +541,20 @@ class OpenLibrary(Provider): pages=pages, ) ) - if tqdm: + if progress_callback is not None: + done = 0 + total = len(tasks) + for fut in futures.as_completed(tasks): + try: + _ = fut.result() + except Exception: + pass + done += 1 + try: + progress_callback(done, total) + except Exception: + pass + elif tqdm: for _ in tqdm(futures.as_completed(tasks), total=len(tasks)): # type: ignore pass else: @@ -904,15 +1010,20 @@ class OpenLibrary(Provider): return results - def download(self, result: SearchResult, output_dir: Path) -> Optional[Path]: + def download( + self, + result: SearchResult, + output_dir: Path, + progress_callback: Optional[Callable[[str, int, Optional[int], str], None]] = None, + ) -> Optional[Path]: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) meta = result.full_metadata or {} edition_id = str(meta.get("openlibrary_id") or "").strip() - if not edition_id: - log("[openlibrary] Missing openlibrary_id; cannot download", file=sys.stderr) - return None + + # Accept direct Archive.org URLs too (details/borrow/download) even when no OL edition id is known. + archive_id = str(meta.get("archive_id") or "").strip() ia_ids = meta.get("ia") or [] if isinstance(ia_ids, str): @@ -921,12 +1032,23 @@ class OpenLibrary(Provider): ia_ids = [] ia_candidates = [str(x) for x in ia_ids if x] - archive_id = _resolve_archive_id(self._session, edition_id, ia_candidates) + if not archive_id: + archive_id = _first_str(ia_candidates) or "" + + if not archive_id and edition_id: + archive_id = _resolve_archive_id(self._session, edition_id, ia_candidates) + + if not archive_id: + # Try to extract identifier from the SearchResult path (URL). + archive_id = _archive_id_from_url(str(getattr(result, "path", "") or "")) + if not archive_id: log("[openlibrary] No archive identifier available; cannot download", file=sys.stderr) return None safe_title = sanitize_filename(result.title) + if not safe_title or "http" in safe_title.lower(): + safe_title = sanitize_filename(archive_id) or "archive" # 1) Direct download if available. 
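+        # (when no public PDF is exposed, control falls through to the borrow workflow below)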
try: @@ -935,8 +1057,22 @@ class OpenLibrary(Provider): can_direct, pdf_url = False, "" if can_direct and pdf_url: + try: + if progress_callback is not None: + progress_callback("step", 0, None, "direct download") + except Exception: + pass out_path = unique_path(output_dir / f"{safe_title}.pdf") - ok = download_file(pdf_url, out_path, session=self._session) + ok = download_file( + pdf_url, + out_path, + session=self._session, + progress_callback=( + (lambda downloaded, total, label: progress_callback("bytes", downloaded, total, label)) + if progress_callback is not None + else None + ), + ) if ok: return out_path log("[openlibrary] Direct download failed", file=sys.stderr) @@ -949,65 +1085,131 @@ class OpenLibrary(Provider): log("[openlibrary] Archive credentials missing; cannot borrow", file=sys.stderr) return None - lendable, reason = _check_lendable(self._session, edition_id) + lendable = True + reason = "" + if edition_id: + lendable, reason = _check_lendable(self._session, edition_id) + if not lendable: + # OpenLibrary API can be a false-negative; fall back to Archive metadata. + lendable2, reason2 = self._archive_is_lendable(archive_id) + if lendable2: + lendable, reason = True, reason2 + else: + lendable, reason = self._archive_is_lendable(archive_id) + if not lendable: log(f"[openlibrary] Not lendable: {reason}", file=sys.stderr) return None session = self._archive_login(email, password) + loaned = False try: - session = self._archive_loan(session, archive_id, verbose=False) - except self.BookNotAvailableError: - log("[openlibrary] Book not available to borrow", file=sys.stderr) - return None - except Exception: - log("[openlibrary] Borrow failed", file=sys.stderr) - return None - - urls = [f"https://archive.org/borrow/{archive_id}", f"https://archive.org/details/{archive_id}"] - title = safe_title - links: Optional[List[str]] = None - last_exc: Optional[Exception] = None - for u in urls: try: - title_raw, links, _metadata = self._archive_get_book_infos(session, u) - if title_raw: - title = sanitize_filename(title_raw) - break - except Exception as exc: - last_exc = exc - continue - - if not links: - log(f"[openlibrary] Failed to extract pages: {last_exc}", file=sys.stderr) - return None - - temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=str(output_dir)) - try: - images = self._archive_download(session=session, n_threads=10, directory=temp_dir, links=links, scale=3, book_id=archive_id) - - pdf_bytes = _image_paths_to_pdf_bytes(images) - if not pdf_bytes: - # Keep images folder for manual conversion. 
- log("[openlibrary] PDF conversion failed; keeping images folder", file=sys.stderr) - return Path(temp_dir) - - pdf_path = unique_path(output_dir / f"{title}.pdf") - with open(pdf_path, "wb") as f: - f.write(pdf_bytes) - - try: - shutil.rmtree(temp_dir) + if progress_callback is not None: + progress_callback("step", 0, None, "login") except Exception: pass - return pdf_path - except Exception: try: - shutil.rmtree(temp_dir) + session = self._archive_loan(session, archive_id, verbose=False) + loaned = True + except self.BookNotAvailableError: + log("[openlibrary] Book not available to borrow", file=sys.stderr) + return None + except Exception: + log("[openlibrary] Borrow failed", file=sys.stderr) + return None + + try: + if progress_callback is not None: + progress_callback("step", 0, None, "borrow") + except Exception: + pass + + urls = [f"https://archive.org/borrow/{archive_id}", f"https://archive.org/details/{archive_id}"] + title = safe_title + links: Optional[List[str]] = None + last_exc: Optional[Exception] = None + for u in urls: + try: + title_raw, links, _metadata = self._archive_get_book_infos(session, u) + if title_raw: + title = sanitize_filename(title_raw) + break + except Exception as exc: + last_exc = exc + continue + + if not links: + log(f"[openlibrary] Failed to extract pages: {last_exc}", file=sys.stderr) + return None + + try: + if progress_callback is not None: + progress_callback("step", 0, None, "download pages") + except Exception: + pass + + temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=str(output_dir)) + try: + images = self._archive_download( + session=session, + n_threads=10, + directory=temp_dir, + links=links, + scale=3, + book_id=archive_id, + progress_callback=( + (lambda done, total: progress_callback("pages", done, total, "pages")) + if progress_callback is not None + else None + ), + ) + + pdf_bytes = _image_paths_to_pdf_bytes(images) + if not pdf_bytes: + # Keep images folder for manual conversion. + log("[openlibrary] PDF conversion failed; keeping images folder", file=sys.stderr) + return Path(temp_dir) + + try: + if progress_callback is not None: + progress_callback("step", 0, None, "stitch pdf") + except Exception: + pass + + pdf_path = unique_path(output_dir / f"{title}.pdf") + with open(pdf_path, "wb") as f: + f.write(pdf_bytes) + + try: + shutil.rmtree(temp_dir) + except Exception: + pass + return pdf_path + + except Exception: + try: + shutil.rmtree(temp_dir) + except Exception: + pass + raise + finally: + # Always return the loan after a successful borrow, even if download/stitch fails. 
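+            # (this finally clause runs on the success path and when download/stitch re-raises)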
+ if loaned: + try: + if progress_callback is not None: + progress_callback("step", 0, None, "return book") + except Exception: + pass + try: + self._archive_return_loan(session, archive_id) + except Exception as exc: + log(f"[openlibrary] Warning: failed to return loan: {exc}", file=sys.stderr) + try: + self._archive_logout(session) except Exception: pass - raise except Exception as exc: log(f"[openlibrary] Borrow workflow error: {exc}", file=sys.stderr) diff --git a/ProviderCore/download.py b/ProviderCore/download.py index 27d616a..0157c4f 100644 --- a/ProviderCore/download.py +++ b/ProviderCore/download.py @@ -1,7 +1,7 @@ from __future__ import annotations from pathlib import Path -from typing import Optional +from typing import Callable, Optional import sys import requests @@ -22,13 +22,20 @@ def sanitize_filename(name: str, *, max_len: int = 150) -> str: return cleaned[:max_len] -def download_file(url: str, output_path: Path, *, session: Optional[requests.Session] = None, timeout_s: float = 30.0) -> bool: +def download_file( + url: str, + output_path: Path, + *, + session: Optional[requests.Session] = None, + timeout_s: float = 30.0, + progress_callback: Optional[Callable[[int, Optional[int], str], None]] = None, +) -> bool: output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) s = session or requests.Session() - bar = ProgressBar() + bar = ProgressBar() if progress_callback is None else None downloaded = 0 total = None @@ -41,9 +48,14 @@ def download_file(url: str, output_path: Path, *, session: Optional[requests.Ses except Exception: total = None + label = str(output_path.name or "download") + # Render once immediately so fast downloads still show something. try: - bar.update(downloaded=0, total=total, label=str(output_path.name or "download"), file=sys.stderr) + if progress_callback is not None: + progress_callback(0, total, label) + elif bar is not None: + bar.update(downloaded=0, total=total, label=label, file=sys.stderr) except Exception: pass @@ -53,18 +65,23 @@ def download_file(url: str, output_path: Path, *, session: Optional[requests.Ses f.write(chunk) downloaded += len(chunk) try: - bar.update(downloaded=downloaded, total=total, label=str(output_path.name or "download"), file=sys.stderr) + if progress_callback is not None: + progress_callback(downloaded, total, label) + elif bar is not None: + bar.update(downloaded=downloaded, total=total, label=label, file=sys.stderr) except Exception: pass try: - bar.finish() + if bar is not None: + bar.finish() except Exception: pass return output_path.exists() and output_path.stat().st_size > 0 except Exception: try: - bar.finish() + if bar is not None: + bar.finish() except Exception: pass try: diff --git a/ProviderCore/registry.py b/ProviderCore/registry.py index 1908ab3..7f491dc 100644 --- a/ProviderCore/registry.py +++ b/ProviderCore/registry.py @@ -6,8 +6,9 @@ This module is the single source of truth for provider discovery. from __future__ import annotations -from typing import Any, Dict, Optional, Type +from typing import Any, Dict, Optional, Sequence, Type import sys +from urllib.parse import urlparse from SYS.logger import log @@ -141,6 +142,45 @@ def list_file_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str, bo return availability +def match_provider_name_for_url(url: str) -> Optional[str]: + """Return a registered provider name that claims the URL's domain. + + Providers can declare domains via a class attribute `URL_DOMAINS` (sequence of strings). 
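+    Matching compares the URL hostname against each declared domain, accepting exact
+    matches and subdomains (e.g. a provider declaring `libgen.is` also claims
+    `www.libgen.is`).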
+ This matcher is intentionally cheap (no provider instantiation, no network). + """ + + try: + parsed = urlparse(str(url)) + host = (parsed.hostname or "").strip().lower() + except Exception: + host = "" + + if not host: + return None + + for name, provider_class in _PROVIDERS.items(): + domains = getattr(provider_class, "URL_DOMAINS", None) + if not isinstance(domains, (list, tuple)): + continue + for d in domains: + dom = str(d or "").strip().lower() + if not dom: + continue + if host == dom or host.endswith("." + dom): + return name + + return None + + +def get_provider_for_url(url: str, config: Optional[Dict[str, Any]] = None) -> Optional[Provider]: + """Instantiate and return the matching provider for a URL, if any.""" + + name = match_provider_name_for_url(url) + if not name: + return None + return get_provider(name, config) + + __all__ = [ "SearchResult", "Provider", @@ -152,5 +192,7 @@ __all__ = [ "list_search_providers", "get_file_provider", "list_file_providers", + "match_provider_name_for_url", + "get_provider_for_url", "download_soulseek_file", ] diff --git a/SYS/download.py b/SYS/download.py index f508ad8..abbf0c1 100644 --- a/SYS/download.py +++ b/SYS/download.py @@ -584,10 +584,15 @@ def _download_direct_file( filename = filename.split("?")[0] # Try to get real filename from Content-Disposition header (HEAD request) + content_type = "" try: with HTTPClient(timeout=10.0) as client: response = client._request("HEAD", url, follow_redirects=True) content_disposition = response.headers.get("content-disposition", "") + try: + content_type = str(response.headers.get("content-type", "") or "").strip().lower() + except Exception: + content_type = "" if content_disposition: # Extract filename from Content-Disposition header # Format: attachment; filename="filename.pdf" or filename=filename.pdf @@ -620,9 +625,36 @@ def _download_direct_file( else: filename = suggested - # Final fallback if we still don't have a good filename - if not filename or "." not in filename: - filename = "downloaded_file.bin" + # If we still don't have an extension, try to infer one from Content-Type. + # Never fall back to a generic `.bin` extension. + try: + has_ext = bool(filename and Path(str(filename)).suffix) + except Exception: + has_ext = False + + if filename and (not has_ext): + ct = (content_type or "").split(";")[0].strip().lower() + ext_by_ct = { + "application/pdf": ".pdf", + "application/epub+zip": ".epub", + "application/x-mobipocket-ebook": ".mobi", + "image/jpeg": ".jpg", + "image/png": ".png", + "image/webp": ".webp", + "image/gif": ".gif", + "text/plain": ".txt", + "application/zip": ".zip", + } + + if ct in ext_by_ct: + filename = f"{filename}{ext_by_ct[ct]}" + elif ct.startswith("text/html"): + # Guardrail: HTML landing pages should not be downloaded as opaque files. + raise DownloadError("URL appears to be an HTML page, not a direct file") + + # Final guardrail: if filename is empty, refuse rather than inventing `download.bin`. 
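+    # At this point the filename has come from the URL path, Content-Disposition, or the
+    # Content-Type inference above; anything still empty is genuinely unidentifiable.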
+ if not filename or not str(filename).strip(): + raise DownloadError("Could not determine filename for URL (no Content-Disposition and no path filename)") file_path = _unique_path(output_dir / filename) progress_bar = ProgressBar() @@ -684,9 +716,15 @@ def _download_direct_file( # For direct file downloads, create minimal info dict without filename as title # This prevents creating duplicate title: tags when filename gets auto-generated # We'll add title back later only if we couldn't extract meaningful tags + ext = "" + try: + ext = Path(str(filename)).suffix.lstrip(".") + except Exception: + ext = "" + info = { - "id": filename.rsplit(".", 1)[0], - "ext": filename.rsplit(".", 1)[1] if "." in filename else "bin", + "id": str(filename).rsplit(".", 1)[0] if "." in str(filename) else str(filename), + "ext": ext, "webpage_url": url, } diff --git a/SYS/pipeline_progress.py b/SYS/pipeline_progress.py new file mode 100644 index 0000000..970491f --- /dev/null +++ b/SYS/pipeline_progress.py @@ -0,0 +1,218 @@ +from __future__ import annotations + +import sys +from contextlib import contextmanager +from typing import Any, Iterator, Optional, Sequence, Tuple + + +class PipelineProgress: + """Small adapter around PipelineLiveProgress. + + This centralizes the boilerplate used across cmdlets: + - locating the active Live UI (if any) + - resolving the current pipe_index from stage context + - step-based progress (begin_pipe_steps/advance_pipe_step) + - optional pipe percent/status updates + - optional byte transfer bars + - optional local Live panel when a cmdlet runs standalone + + The class is intentionally defensive: all UI operations are best-effort. + """ + + def __init__(self, pipeline_module: Any): + self._ctx = pipeline_module + self._local_ui: Optional[Any] = None + self._local_attached: bool = False + + def ui_and_pipe_index(self) -> Tuple[Optional[Any], int]: + ui = None + try: + ui = self._ctx.get_live_progress() if hasattr(self._ctx, "get_live_progress") else None + except Exception: + ui = None + + pipe_idx: int = 0 + try: + stage_ctx = self._ctx.get_stage_context() if hasattr(self._ctx, "get_stage_context") else None + maybe_idx = getattr(stage_ctx, "pipe_index", None) if stage_ctx is not None else None + if isinstance(maybe_idx, int): + pipe_idx = int(maybe_idx) + except Exception: + pipe_idx = 0 + + return ui, pipe_idx + + def begin_steps(self, total_steps: int) -> None: + ui, pipe_idx = self.ui_and_pipe_index() + if ui is None: + return + try: + begin = getattr(ui, "begin_pipe_steps", None) + if callable(begin): + begin(int(pipe_idx), total_steps=int(total_steps)) + except Exception: + return + + def step(self, text: str) -> None: + ui, pipe_idx = self.ui_and_pipe_index() + if ui is None: + return + try: + adv = getattr(ui, "advance_pipe_step", None) + if callable(adv): + adv(int(pipe_idx), str(text)) + except Exception: + return + + def set_percent(self, percent: int) -> None: + ui, pipe_idx = self.ui_and_pipe_index() + if ui is None: + return + try: + set_pct = getattr(ui, "set_pipe_percent", None) + if callable(set_pct): + set_pct(int(pipe_idx), int(percent)) + except Exception: + return + + def set_status(self, text: str) -> None: + ui, pipe_idx = self.ui_and_pipe_index() + if ui is None: + return + try: + setter = getattr(ui, "set_pipe_status_text", None) + if callable(setter): + setter(int(pipe_idx), str(text)) + except Exception: + return + + def clear_status(self) -> None: + ui, pipe_idx = self.ui_and_pipe_index() + if ui is None: + return + try: + clr = getattr(ui, 
"clear_pipe_status_text", None) + if callable(clr): + clr(int(pipe_idx)) + except Exception: + return + + def begin_transfer(self, *, label: str, total: Optional[int] = None) -> None: + ui, _ = self.ui_and_pipe_index() + if ui is None: + return + try: + fn = getattr(ui, "begin_transfer", None) + if callable(fn): + fn(label=str(label or "transfer"), total=total) + except Exception: + return + + def update_transfer(self, *, label: str, completed: Optional[int], total: Optional[int] = None) -> None: + ui, _ = self.ui_and_pipe_index() + if ui is None: + return + try: + fn = getattr(ui, "update_transfer", None) + if callable(fn): + fn(label=str(label or "transfer"), completed=completed, total=total) + except Exception: + return + + def finish_transfer(self, *, label: str) -> None: + ui, _ = self.ui_and_pipe_index() + if ui is None: + return + try: + fn = getattr(ui, "finish_transfer", None) + if callable(fn): + fn(label=str(label or "transfer")) + except Exception: + return + + def on_emit(self, emitted: Any) -> None: + """Advance local pipe progress after pipeline_context.emit(). + + The shared PipelineExecutor wires on_emit automatically for pipelines. + Standalone cmdlet runs do not, so cmdlets call this explicitly. + """ + + if self._local_ui is None: + return + try: + self._local_ui.on_emit(0, emitted) + except Exception: + return + + def ensure_local_ui(self, *, label: str, total_items: int, items_preview: Optional[Sequence[Any]] = None) -> bool: + """Start a local PipelineLiveProgress panel if no shared UI exists.""" + + try: + existing = self._ctx.get_live_progress() if hasattr(self._ctx, "get_live_progress") else None + except Exception: + existing = None + + if existing is not None: + return False + if not bool(getattr(sys.stderr, "isatty", lambda: False)()): + return False + + try: + from models import PipelineLiveProgress + + ui = PipelineLiveProgress([str(label or "pipeline")], enabled=True) + ui.start() + try: + if hasattr(self._ctx, "set_live_progress"): + self._ctx.set_live_progress(ui) + self._local_attached = True + except Exception: + self._local_attached = False + + try: + ui.begin_pipe(0, total_items=max(1, int(total_items)), items_preview=list(items_preview or [])) + except Exception: + pass + + self._local_ui = ui + return True + except Exception: + self._local_ui = None + self._local_attached = False + return False + + def close_local_ui(self, *, force_complete: bool = True) -> None: + if self._local_ui is None: + return + try: + try: + self._local_ui.finish_pipe(0, force_complete=bool(force_complete)) + except Exception: + pass + try: + self._local_ui.stop() + except Exception: + pass + finally: + self._local_ui = None + try: + if self._local_attached and hasattr(self._ctx, "set_live_progress"): + self._ctx.set_live_progress(None) + except Exception: + pass + self._local_attached = False + + @contextmanager + def local_ui_if_needed( + self, + *, + label: str, + total_items: int, + items_preview: Optional[Sequence[Any]] = None, + ) -> Iterator["PipelineProgress"]: + created = self.ensure_local_ui(label=label, total_items=total_items, items_preview=items_preview) + try: + yield self + finally: + if created: + self.close_local_ui(force_complete=True) diff --git a/cmdlet/_shared.py b/cmdlet/_shared.py index 13e66fc..ac5f333 100644 --- a/cmdlet/_shared.py +++ b/cmdlet/_shared.py @@ -1585,9 +1585,46 @@ def coerce_to_pipe_object(value: Any, default_path: Optional[str] = None) -> mod "warnings", "path", "relationships", "is_temp", "action", "parent_hash", } - # Convert 
ResultItem to dict to preserve all attributes + # Convert common object-like results into a dict so we can preserve fields like + # hash/store/url when they come from result tables (e.g., get-url emits UrlItem). + # + # Priority: + # 1) explicit to_dict() + # 2) best-effort attribute extraction for known PipeObject-ish fields if hasattr(value, 'to_dict'): value = value.to_dict() + elif not isinstance(value, dict): + try: + obj_map: Dict[str, Any] = {} + for k in ( + "hash", + "store", + "provider", + "prov", + "tag", + "title", + "url", + "source_url", + "duration", + "duration_seconds", + "metadata", + "full_metadata", + "warnings", + "path", + "target", + "relationships", + "is_temp", + "action", + "parent_hash", + "extra", + "media_kind", + ): + if hasattr(value, k): + obj_map[k] = getattr(value, k) + if obj_map: + value = obj_map + except Exception: + pass if isinstance(value, dict): # Extract hash and store (canonical identifiers) @@ -1695,8 +1732,19 @@ def coerce_to_pipe_object(value: Any, default_path: Optional[str] = None) -> mod # Fallback: build from path argument or bare value hash_val = "unknown" path_val = default_path or getattr(value, "path", None) + url_val: Optional[str] = None title_val = None + # If the raw value is a string, treat it as either a URL or a file path. + # This is important for @-selection results that are plain URL strings. + if isinstance(value, str): + s = value.strip() + if s.lower().startswith(("http://", "https://")): + url_val = s + path_val = None + else: + path_val = s + if path_val and path_val != "unknown": try: from SYS.utils import sha256_file @@ -1708,8 +1756,9 @@ def coerce_to_pipe_object(value: Any, default_path: Optional[str] = None) -> mod except Exception: pass - # When coming from path argument, store should be "PATH" (file path, not a backend) - store_val = "PATH" + # When coming from a raw URL string, mark it explicitly as URL. + # Otherwise treat it as a local path. + store_val = "URL" if url_val else "PATH" pipe_obj = models.PipeObject( hash=hash_val, @@ -1717,6 +1766,8 @@ def coerce_to_pipe_object(value: Any, default_path: Optional[str] = None) -> mod provider=None, path=str(path_val) if path_val and path_val != "unknown" else None, title=title_val, + url=url_val, + source_url=url_val, tag=[], extra={}, ) diff --git a/cmdlet/add_file.py b/cmdlet/add_file.py index b2e469c..da195e3 100644 --- a/cmdlet/add_file.py +++ b/cmdlet/add_file.py @@ -12,6 +12,7 @@ import models import pipeline as ctx from API import HydrusNetwork as hydrus_wrapper from SYS.logger import log, debug +from SYS.pipeline_progress import PipelineProgress from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS from Store import Store from . import _shared as sh @@ -73,6 +74,7 @@ class Add_File(Cmdlet): def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: """Main execution entry point.""" parsed = parse_cmdlet_args(args, self) + progress = PipelineProgress(ctx) path_arg = parsed.get("path") location = parsed.get("store") @@ -80,6 +82,35 @@ class Add_File(Cmdlet): provider_room = parsed.get("room") delete_after = parsed.get("delete", False) + # Convenience: when piping a file into add-file, allow `-path ` + # to act as the destination export directory. + # Example: screen-shot "https://..." 
| add-file -path "C:\Users\Admin\Desktop" + if path_arg and not location and not provider_name: + try: + candidate_dir = Path(str(path_arg)) + if candidate_dir.exists() and candidate_dir.is_dir(): + piped_items = result if isinstance(result, list) else [result] + has_local_source = False + for it in piped_items: + try: + po = coerce_to_pipe_object(it, None) + src = str(getattr(po, "path", "") or "").strip() + if not src: + continue + if src.lower().startswith(("http://", "https://", "magnet:", "torrent:")): + continue + if Path(src).is_file(): + has_local_source = True + break + except Exception: + continue + if has_local_source: + debug(f"[add-file] Treating -path directory as destination: {candidate_dir}") + location = str(candidate_dir) + path_arg = None + except Exception: + pass + stage_ctx = ctx.get_stage_context() is_last_stage = (stage_ctx is None) or bool(getattr(stage_ctx, "is_last_stage", False)) @@ -93,7 +124,7 @@ class Add_File(Cmdlet): is_storage_backend_location = False # Decide which items to process. - # - If user provided -path, treat this invocation as single-item. + # - If user provided -path (and it was not reinterpreted as destination), treat this invocation as single-item. # - Otherwise, if piped input is a list, ingest each item. if path_arg: items_to_process: List[Any] = [result] @@ -102,6 +133,17 @@ class Add_File(Cmdlet): else: items_to_process = [result] + # Minimal step-based progress for single-item runs. + # Many add-file flows don't emit intermediate items, so without steps the pipe can look "stuck". + use_steps = False + steps_started = False + step2_done = False + try: + ui, _ = progress.ui_and_pipe_index() + use_steps = (ui is not None) and (len(items_to_process) == 1) + except Exception: + use_steps = False + debug(f"[add-file] INPUT result type={type(result).__name__}") if isinstance(result, list): debug(f"[add-file] INPUT result is list with {len(result)} items") @@ -235,6 +277,14 @@ class Add_File(Cmdlet): failures += 1 continue + is_url_target = isinstance(media_path_or_url, str) and str(media_path_or_url).lower().startswith( + ("http://", "https://", "magnet:", "torrent:") + ) + if use_steps and (not steps_started) and (not is_url_target): + progress.begin_steps(3) + progress.step("resolving source") + steps_started = True + # Update pipe_obj with resolved path pipe_obj.path = str(media_path_or_url) @@ -300,13 +350,34 @@ class Add_File(Cmdlet): pass temp_dir_to_cleanup = Path(tempfile.mkdtemp(prefix="medios_openlibrary_")) + + # Wire OpenLibrary download progress into pipeline Live UI (no tqdm spam). 
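+                    # The provider is assumed to report progress via this callback shape:
+                    #   _ol_progress("pages", 12, 60, "")                    -> status "downloading pages 12/60", ~20%
+                    #   _ol_progress("bytes", 1_048_576, 4_194_304, "x.pdf") -> byte-based percent
+                    #   any other kind                                       -> generic "downloading" status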
+                    def _ol_progress(kind: str, completed: int, total: Optional[int], label: str) -> None:
+                        try:
+                            if kind == "pages" and total:
+                                progress.set_status(f"downloading pages {completed}/{total}")
+                                progress.set_percent(int(round((completed / max(1, total)) * 100.0)))
+                            elif kind == "bytes" and total:
+                                progress.set_status(f"downloading {label} {completed}/{total} bytes")
+                                progress.set_percent(int(round((completed / max(1, total)) * 100.0)))
+                            else:
+                                progress.set_status("downloading")
+                        except Exception:
+                            return
+
+                    try:
+                        progress.set_percent(0)
+                        progress.set_status("downloading openlibrary")
+                    except Exception:
+                        pass
+
                     sr = SearchResult(
                         table="openlibrary",
                         title=str(getattr(pipe_obj, "title", None) or "Unknown"),
                         path=str(media_path_or_url),
                         full_metadata=full_metadata if isinstance(full_metadata, dict) else {},
                     )
-                    downloaded = provider.download(sr, temp_dir_to_cleanup)
+                    downloaded = provider.download(sr, temp_dir_to_cleanup, progress_callback=_ol_progress)
                     if downloaded is None:
                         log("[add-file] OpenLibrary download failed", file=sys.stderr)
                         failures += 1
@@ -325,6 +396,13 @@ class Add_File(Cmdlet):
                     pipe_obj.path = str(downloaded_path)
                     delete_after_item = True
 
+                    # Use the shared progress helper here: `pipe_idx` is not bound in this scope,
+                    # so calling ui.set_pipe_percent(...) directly would only raise a silent NameError.
+                    progress.set_percent(100)
+                    progress.set_status("downloaded")
+
                 # For non-provider URLs, or if still a URL after provider attempt, delegate to download-media.
                 if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(
                     ("http://", "https://", "magnet:", "torrent:")
@@ -562,6 +640,10 @@ class Add_File(Cmdlet):
                         failures += 1
                     continue
 
+                if use_steps and steps_started and (not step2_done):
+                    progress.step("writing destination")
+                    step2_done = True
+
                 if code == 0:
                     successes += 1
                 else:
@@ -619,6 +701,9 @@ class Add_File(Cmdlet):
             except Exception:
                 pass
 
+        if use_steps and steps_started:
+            progress.step("finalized")
+
         if successes > 0:
             return 0
         return 1
diff --git a/cmdlet/add_url.py b/cmdlet/add_url.py
index eb04fe7..5c761c5 100644
--- a/cmdlet/add_url.py
+++ b/cmdlet/add_url.py
@@ -34,6 +34,19 @@ class Add_Url(sh.Cmdlet):
         """Add URL to file via hash+store backend."""
         parsed = sh.parse_cmdlet_args(args, self)
 
+        # Compatibility/piping fix:
+        # `SharedArgs.QUERY` is positional in the shared parser, so `add-url <url>`
+        # (and `@N | add-url <url>`) can mistakenly parse the URL into `query`.
+        # If `url` is missing and `query` looks like an http(s) URL, treat it as `url`.
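+        # e.g. `add-url https://example.com/page` parses as query="https://example.com/page";
+        # after this fix-up it behaves like `add-url -url https://example.com/page`.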
+ try: + if (not parsed.get("url")) and isinstance(parsed.get("query"), str): + q = str(parsed.get("query") or "").strip() + if q.startswith(("http://", "https://")): + parsed["url"] = q + parsed.pop("query", None) + except Exception: + pass + query_hash = sh.parse_single_hash_query(parsed.get("query")) if parsed.get("query") and not query_hash: log("Error: -query must be of the form hash:") diff --git a/cmdlet/delete_url.py b/cmdlet/delete_url.py index c9dd999..b4a9039 100644 --- a/cmdlet/delete_url.py +++ b/cmdlet/delete_url.py @@ -29,7 +29,7 @@ class Delete_Url(Cmdlet): arg=[ SharedArgs.QUERY, SharedArgs.STORE, - CmdletArg("url", required=True, description="URL to remove"), + CmdletArg("url", required=False, description="URL to remove (optional when piping url rows)"), ], detail=[ "- Removes URL association from file identified by hash+store", @@ -69,22 +69,24 @@ class Delete_Url(Cmdlet): log("Error: No store name provided") return 1 - if not url_arg: - log("Error: No URL provided") - return 1 - # Normalize hash (single-item mode) if not results and file_hash: file_hash = normalize_hash(file_hash) if not file_hash: log("Error: Invalid hash format") return 1 - - # Parse url (comma-separated) - urls = [u.strip() for u in str(url_arg).split(',') if u.strip()] - if not urls: - log("Error: No valid url provided") - return 1 + + from metadata import normalize_urls + + def _urls_from_arg(raw: Any) -> List[str]: + if raw is None: + return [] + # Support comma-separated input for backwards compatibility + if isinstance(raw, str) and "," in raw: + return [u.strip() for u in raw.split(",") if u.strip()] + return [u.strip() for u in normalize_urls(raw) if str(u).strip()] + + urls_from_cli = _urls_from_arg(url_arg) # Get backend and delete url try: @@ -145,7 +147,17 @@ class Delete_Url(Cmdlet): ) continue - batch.setdefault(store_text, []).append((normalized, list(urls))) + # Determine which URLs to delete. + # - If user passed an explicit , apply it to all items. + # - Otherwise, when piping url rows from get-url, delete the url(s) from each item. + item_urls = list(urls_from_cli) + if not item_urls: + item_urls = [u.strip() for u in normalize_urls(get_field(item, "url") or get_field(item, "source_url")) if str(u).strip()] + if not item_urls: + ctx.print_if_visible("[delete-url] Warning: Item has no url field; skipping", file=sys.stderr) + continue + + batch.setdefault(store_text, []).append((normalized, item_urls)) for store_text, pairs in batch.items(): try: @@ -168,24 +180,39 @@ class Delete_Url(Cmdlet): for h, ulist in bulk_pairs: backend.delete_url(h, ulist, config=config) + deleted_count = 0 + for _h, ulist in bulk_pairs: + deleted_count += len(ulist or []) ctx.print_if_visible( - f"✓ delete-url: {len(urls)} url(s) for {len(bulk_pairs)} item(s) in '{store_text}'", + f"✓ delete-url: {deleted_count} url(s) for {len(bulk_pairs)} item(s) in '{store_text}'", file=sys.stderr, ) for item in pass_through: existing = get_field(item, "url") - _set_item_url(item, _remove_urls(existing, list(urls))) + # In batch mode we removed the union of requested urls for the file. + # Using urls_from_cli (if present) matches the user's explicit intent; otherwise + # remove the piped url row(s). 
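+                # Illustrative pipelines (assumed usage):
+                #   get-url | @2 | delete-url            -> removes only the url carried by the selected row
+                #   get-url | delete-url -url <url>      -> removes the explicit url from every piped item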
+ remove_set = urls_from_cli + if not remove_set: + remove_set = [u.strip() for u in normalize_urls(get_field(item, "url") or get_field(item, "source_url")) if str(u).strip()] + _set_item_url(item, _remove_urls(existing, list(remove_set))) ctx.emit(item) return 0 # Single-item mode + if not urls_from_cli: + urls_from_cli = [u.strip() for u in normalize_urls(get_field(result, "url") or get_field(result, "source_url")) if str(u).strip()] + if not urls_from_cli: + log("Error: No URL provided") + return 1 + backend = storage[str(store_name)] - backend.delete_url(str(file_hash), urls, config=config) - ctx.print_if_visible(f"✓ delete-url: {len(urls)} url(s) removed", file=sys.stderr) + backend.delete_url(str(file_hash), list(urls_from_cli), config=config) + ctx.print_if_visible(f"✓ delete-url: {len(urls_from_cli)} url(s) removed", file=sys.stderr) if result is not None: existing = get_field(result, "url") - _set_item_url(result, _remove_urls(existing, list(urls))) + _set_item_url(result, _remove_urls(existing, list(urls_from_cli))) ctx.emit(result) return 0 diff --git a/cmdlet/download_file.py b/cmdlet/download_file.py index 4881da2..90fbf23 100644 --- a/cmdlet/download_file.py +++ b/cmdlet/download_file.py @@ -16,6 +16,7 @@ from urllib.parse import urlparse from SYS.download import DownloadError, _download_direct_file from SYS.logger import log, debug +from SYS.pipeline_progress import PipelineProgress import pipeline as pipeline_context from . import _shared as sh @@ -55,43 +56,673 @@ class Download_File(Cmdlet): """Main execution method.""" return self._run_impl(result, args, config) + @staticmethod + def _normalize_urls(parsed: Dict[str, Any]) -> List[str]: + raw_url = parsed.get("url", []) + if isinstance(raw_url, str): + raw_url = [raw_url] + + expanded_urls: List[str] = [] + for u in (raw_url or []): + if u is None: + continue + s = str(u).strip() + if not s: + continue + if "," in s: + parts = [p.strip() for p in s.split(",")] + expanded_urls.extend([p for p in parts if p]) + else: + expanded_urls.append(s) + + return expanded_urls + + @staticmethod + def _collect_piped_items_if_no_urls(result: Any, raw_urls: Sequence[str]) -> List[Any]: + if raw_urls: + return [] + if isinstance(result, list): + return list(result) + if result: + return [result] + return [] + + @staticmethod + def _safe_total_items(raw_urls: Sequence[str], piped_items: Sequence[Any]) -> int: + try: + return int(len(raw_urls or []) + len(piped_items or [])) + except Exception: + return 1 + + @staticmethod + def _build_preview(raw_urls: Sequence[str], piped_items: Sequence[Any], total_items: int) -> List[Any]: + try: + preview: List[Any] = [] + preview.extend(list(raw_urls or [])[: max(0, total_items)]) + if len(preview) < total_items: + preview.extend(list(piped_items or [])[: max(0, total_items - len(preview))]) + return preview + except Exception: + return [] + + @staticmethod + def _load_provider_registry() -> Dict[str, Any]: + try: + from ProviderCore.registry import ( + get_search_provider as _get_search_provider, + get_provider as _get_provider, + match_provider_name_for_url as _match_provider_name_for_url, + SearchResult as _SearchResult, + ) + + return { + "get_search_provider": _get_search_provider, + "get_provider": _get_provider, + "match_provider_name_for_url": _match_provider_name_for_url, + "SearchResult": _SearchResult, + } + except Exception: + return { + "get_search_provider": None, + "get_provider": None, + "match_provider_name_for_url": None, + "SearchResult": None, + } + + @staticmethod + def 
_openlibrary_edition_id_from_url(u: str) -> str: + try: + p = urlparse(str(u)) + parts = [x for x in (p.path or "").split("/") if x] + except Exception: + parts = [] + # /books/OL35443598M/... + if len(parts) >= 2 and str(parts[0]).lower() == "books": + return str(parts[1]).strip() + return "" + + @staticmethod + def _title_hint_from_url_slug(u: str) -> str: + try: + p = urlparse(str(u)) + parts = [x for x in (p.path or "").split("/") if x] + slug = parts[-1] if parts else "" + except Exception: + slug = "" + slug = (slug or "").strip().replace("_", " ") + return slug or "OpenLibrary" + + @staticmethod + def _path_from_download_result(result_obj: Any) -> Path: + file_path = None + if hasattr(result_obj, "path"): + file_path = getattr(result_obj, "path") + elif isinstance(result_obj, dict): + file_path = result_obj.get("path") + if not file_path: + file_path = str(result_obj) + return Path(str(file_path)) + + def _emit_local_file( + self, + *, + downloaded_path: Path, + source: Optional[str], + title_hint: Optional[str], + tags_hint: Optional[List[str]], + media_kind_hint: Optional[str], + full_metadata: Optional[Dict[str, Any]], + progress: PipelineProgress, + config: Dict[str, Any], + provider_hint: Optional[str] = None, + ) -> None: + title_val = (title_hint or downloaded_path.stem or "Unknown").strip() or downloaded_path.stem + hash_value = self._compute_file_hash(downloaded_path) + tag: List[str] = [] + if tags_hint: + tag.extend([str(t) for t in tags_hint if t]) + if not any(str(t).lower().startswith("title:") for t in tag): + tag.insert(0, f"title:{title_val}") + + payload: Dict[str, Any] = { + "path": str(downloaded_path), + "hash": hash_value, + "title": title_val, + "action": "cmdlet:download-file", + "download_mode": "file", + "store": "local", + "media_kind": media_kind_hint or "file", + "tag": tag, + } + if provider_hint: + payload["provider"] = str(provider_hint) + if full_metadata: + payload["full_metadata"] = full_metadata + if source and str(source).startswith("http"): + payload["url"] = source + elif source: + payload["source_url"] = source + + pipeline_context.emit(payload) + + # When running with a local progress UI (standalone cmdlet), ensure + # the pipe advances on emit. + progress.on_emit(payload) + + # Automatically register url with local library + if payload.get("url"): + pipe_obj = coerce_to_pipe_object(payload) + register_url_with_local_library(pipe_obj, config) + + def _process_explicit_urls( + self, + *, + raw_urls: Sequence[str], + final_output_dir: Path, + config: Dict[str, Any], + quiet_mode: bool, + registry: Dict[str, Any], + progress: PipelineProgress, + ) -> tuple[int, Optional[int]]: + downloaded_count = 0 + + SearchResult = registry.get("SearchResult") + get_provider = registry.get("get_provider") + match_provider_name_for_url = registry.get("match_provider_name_for_url") + + for url in raw_urls: + try: + debug(f"Processing URL: {url}") + + # Telegram message URLs are not direct files; route through the provider. 
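+                # (Assumed URL shapes routed down this branch: https://t.me/<channel>/<id> and any
+                #  *.t.me host; non-Telegram URLs continue to provider matching / direct download below.)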
+ try: + parsed_url = urlparse(str(url)) + host = (parsed_url.hostname or "").lower().strip() + except Exception: + host = "" + + is_telegram = host in {"t.me", "telegram.me"} or host.endswith(".t.me") + if is_telegram and SearchResult: + try: + from ProviderCore.registry import get_provider as _get_provider + except Exception: + _get_provider = None + + if _get_provider is None: + raise DownloadError("Telegram provider registry not available") + + provider = _get_provider("telegram", config) + if provider is None: + raise DownloadError("Telegram provider not configured or not available (check telethon/app_id/api_hash)") + + sr = SearchResult(table="telegram", title=str(url), path=str(url), full_metadata={}) + downloaded_path = None + telegram_info: Optional[Dict[str, Any]] = None + if hasattr(provider, "download_url"): + try: + downloaded_path, telegram_info = provider.download_url(str(url), final_output_dir) # type: ignore[attr-defined] + except Exception as exc: + raise DownloadError(str(exc)) + else: + downloaded_path = provider.download(sr, final_output_dir) + + if not downloaded_path: + raise DownloadError("Telegram download returned no file") + + channel = "" + post = None + if isinstance(telegram_info, dict): + try: + chat_info_raw = telegram_info.get("chat") + msg_info_raw = telegram_info.get("message") + chat_info: Dict[str, Any] = chat_info_raw if isinstance(chat_info_raw, dict) else {} + msg_info: Dict[str, Any] = msg_info_raw if isinstance(msg_info_raw, dict) else {} + channel = str(chat_info.get("title") or chat_info.get("username") or "").strip() + post = msg_info.get("id") + except Exception: + channel = "" + post = None + + title_hint = None + tags_hint: List[str] = [] + if channel: + tags_hint.append(f"channel:{channel}") + if post is not None: + tags_hint.append(f"post:{post}") + if channel and post is not None: + title_hint = f"{channel} {post}" + elif post is not None: + title_hint = f"post:{post}" + else: + title_hint = downloaded_path.stem + + self._emit_local_file( + downloaded_path=downloaded_path, + source=str(url), + title_hint=title_hint, + tags_hint=tags_hint, + media_kind_hint="file", + full_metadata=telegram_info, + provider_hint="telegram", + progress=progress, + config=config, + ) + downloaded_count += 1 + debug("✓ Downloaded via Telegram provider and emitted") + continue + + # Provider URL routing (e.g. OpenLibrary book pages). + provider_name = None + if match_provider_name_for_url is not None: + try: + provider_name = match_provider_name_for_url(str(url)) + except Exception: + provider_name = None + + if provider_name and get_provider is not None and SearchResult is not None: + # OpenLibrary URLs should be handled by the OpenLibrary provider. + if provider_name == "openlibrary": + provider = get_provider("openlibrary", config) + if provider is None: + raise DownloadError("OpenLibrary provider not configured or not available") + + edition_id = self._openlibrary_edition_id_from_url(str(url)) + title_hint = self._title_hint_from_url_slug(str(url)) + + sr = SearchResult( + table="openlibrary", + title=title_hint, + path=str(url), + media_kind="book", + full_metadata={ + "openlibrary_id": edition_id, + }, + ) + + downloaded_path = None + try: + ui, _pipe_idx = progress.ui_and_pipe_index() + progress_cb = None + if ui is not None: + # High-level steps for OpenLibrary borrow/download flow. 
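+                                # begin_steps(5) sizes the per-pipe step bar; each "step" callback
+                                # from the provider advances it by one (five phases is an assumed
+                                # count for the borrow/download flow).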
+ progress.begin_steps(5) + + def _progress(kind: str, done: int, total: Optional[int], label: str) -> None: + # kind: + # - "step": advance step text + # - "pages": update pipe percent/status + # - "bytes": update transfer bar + if kind == "step": + progress.step(label) + return + + if kind == "pages": + t = int(total) if isinstance(total, int) else 0 + d = int(done) if isinstance(done, int) else 0 + if t > 0: + pct = int(round((max(0, min(d, t)) / max(1, t)) * 100.0)) + progress.set_percent(pct) + progress.set_status(f"downloading pages {d}/{t}") + else: + progress.set_status(f"downloading pages {d}") + return + + if kind == "bytes": + try: + lbl = str(label or "download") + except Exception: + lbl = "download" + progress.begin_transfer(label=lbl, total=total) + progress.update_transfer(label=lbl, completed=done, total=total) + try: + if isinstance(total, int) and total > 0 and int(done) >= int(total): + progress.finish_transfer(label=lbl) + except Exception: + pass + return + + progress_cb = _progress + + downloaded_path = provider.download(sr, final_output_dir, progress_callback=progress_cb) # type: ignore[call-arg] + except Exception as exc: + raise DownloadError(str(exc)) + + # Clear long-running status line after the download attempt. + progress.clear_status() + + if downloaded_path: + self._emit_local_file( + downloaded_path=Path(downloaded_path), + source=str(url), + title_hint=title_hint, + tags_hint=None, + media_kind_hint="book", + full_metadata=sr.full_metadata, + provider_hint="openlibrary", + progress=progress, + config=config, + ) + downloaded_count += 1 + continue + + # If OpenLibrary can't provide it (not lendable, no creds, etc), auto-search LibGen. + try: + fallback_query = str(title_hint or "").strip() + if fallback_query: + log( + f"[download-file] Not available on OpenLibrary; searching LibGen for: {fallback_query}", + file=sys.stderr, + ) + from cmdlet.search_provider import CMDLET as _SEARCH_PROVIDER_CMDLET + + exec_fn = getattr(_SEARCH_PROVIDER_CMDLET, "exec", None) + if callable(exec_fn): + ret = exec_fn(None, ["-provider", "libgen", "-query", fallback_query], config) + try: + table = pipeline_context.get_last_result_table() + items = pipeline_context.get_last_result_items() + if table is not None: + pipeline_context.set_last_result_table_overlay(table, items) + except Exception: + pass + + try: + return downloaded_count, int(ret) # type: ignore[arg-type] + except Exception: + return downloaded_count, 1 + except Exception: + pass + + log("[download-file] OpenLibrary URL could not be downloaded", file=sys.stderr) + continue + + # Generic provider URL handler (if a provider implements `download_url`). + provider = get_provider(provider_name, config) + if provider is not None and hasattr(provider, "download_url"): + try: + downloaded_path = provider.download_url(str(url), final_output_dir) # type: ignore[attr-defined] + except Exception as exc: + raise DownloadError(str(exc)) + + if downloaded_path: + self._emit_local_file( + downloaded_path=Path(downloaded_path), + source=str(url), + title_hint=Path(str(downloaded_path)).stem, + tags_hint=None, + media_kind_hint="file", + full_metadata=None, + provider_hint=str(provider_name), + progress=progress, + config=config, + ) + downloaded_count += 1 + continue + + # Otherwise, try provider.download(SearchResult) with the URL as the target. 
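+                # (Last provider-specific attempt: the SearchResult carries the raw URL as `path`;
+                #  providers that only understand their own result rows may return nothing, in which
+                #  case downloaded_path stays None and we fall through to _download_direct_file below.)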
+ if provider is not None: + try: + sr = SearchResult( + table=str(provider_name), + title=str(url), + path=str(url), + full_metadata={}, + ) + downloaded_path = provider.download(sr, final_output_dir) # type: ignore[call-arg] + except Exception: + downloaded_path = None + + if downloaded_path: + self._emit_local_file( + downloaded_path=Path(downloaded_path), + source=str(url), + title_hint=Path(str(downloaded_path)).stem, + tags_hint=None, + media_kind_hint="file", + full_metadata=None, + provider_hint=str(provider_name), + progress=progress, + config=config, + ) + downloaded_count += 1 + continue + + result_obj = _download_direct_file(str(url), final_output_dir, quiet=quiet_mode) + downloaded_path = self._path_from_download_result(result_obj) + + self._emit_local_file( + downloaded_path=downloaded_path, + source=str(url), + title_hint=downloaded_path.stem, + tags_hint=[f"title:{downloaded_path.stem}"], + media_kind_hint="file", + full_metadata=None, + progress=progress, + config=config, + ) + downloaded_count += 1 + debug("✓ Downloaded and emitted") + + except DownloadError as e: + log(f"Download failed for {url}: {e}", file=sys.stderr) + except Exception as e: + log(f"Error processing {url}: {e}", file=sys.stderr) + + return downloaded_count, None + + def _expand_provider_items( + self, + *, + piped_items: Sequence[Any], + registry: Dict[str, Any], + config: Dict[str, Any], + ) -> List[Any]: + get_search_provider = registry.get("get_search_provider") + expanded_items: List[Any] = [] + for item in piped_items: + try: + table = get_field(item, "table") + media_kind = get_field(item, "media_kind") + full_metadata = get_field(item, "full_metadata") + target = get_field(item, "path") or get_field(item, "url") + + if str(table or "").lower() == "alldebrid" and str(media_kind or "").lower() == "folder": + magnet_id = None + if isinstance(full_metadata, dict): + magnet_id = full_metadata.get("magnet_id") + if magnet_id is None and isinstance(target, str) and target.lower().startswith("alldebrid:magnet:"): + try: + magnet_id = int(target.split(":")[-1]) + except Exception: + magnet_id = None + + if magnet_id is not None and get_search_provider is not None: + provider = get_search_provider("alldebrid", config) + if provider is not None: + try: + files = provider.search("*", limit=10_000, filters={"view": "files", "magnet_id": int(magnet_id)}) + except Exception: + files = [] + + # If the magnet isn't ready, provider.search returns a single not-ready folder row. 
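+                            # (A ready magnet instead yields one SearchResult per contained file,
+                            #  each of which is flattened into expanded_items below.)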
+ if files and len(files) == 1 and getattr(files[0], "media_kind", "") == "folder": + detail = getattr(files[0], "detail", "") + log( + f"[download-file] AllDebrid magnet {magnet_id} not ready ({detail or 'unknown'})", + file=sys.stderr, + ) + else: + for sr in files: + expanded_items.append(sr.to_dict() if hasattr(sr, "to_dict") else sr) + continue + + expanded_items.append(item) + except Exception: + expanded_items.append(item) + + return expanded_items + + def _process_provider_items( + self, + *, + piped_items: Sequence[Any], + final_output_dir: Path, + config: Dict[str, Any], + quiet_mode: bool, + registry: Dict[str, Any], + progress: PipelineProgress, + ) -> int: + downloaded_count = 0 + get_search_provider = registry.get("get_search_provider") + SearchResult = registry.get("SearchResult") + + expanded_items = self._expand_provider_items(piped_items=piped_items, registry=registry, config=config) + + for item in expanded_items: + try: + table = get_field(item, "table") + title = get_field(item, "title") + target = get_field(item, "path") or get_field(item, "url") + media_kind = get_field(item, "media_kind") + tags_val = get_field(item, "tag") + tags_list: Optional[List[str]] + if isinstance(tags_val, list): + tags_list = [str(t) for t in tags_val if t] + else: + tags_list = None + + full_metadata = get_field(item, "full_metadata") + if (not full_metadata) and isinstance(item, dict) and isinstance(item.get("extra"), dict): + extra_md = item["extra"].get("full_metadata") + if isinstance(extra_md, dict): + full_metadata = extra_md + + # If this looks like a provider item and providers are available, prefer provider.download() + downloaded_path: Optional[Path] = None + attempted_provider_download = False + if table and get_search_provider and SearchResult: + provider = get_search_provider(str(table), config) + if provider is not None: + attempted_provider_download = True + sr = SearchResult( + table=str(table), + title=str(title or "Unknown"), + path=str(target or ""), + full_metadata=full_metadata if isinstance(full_metadata, dict) else {}, + ) + debug(f"[download-file] Downloading provider item via {table}: {sr.title}") + downloaded_path = provider.download(sr, final_output_dir) + + # OpenLibrary: if provider download failed, do NOT try to download the OpenLibrary page HTML. + if downloaded_path is None and attempted_provider_download and str(table or "").lower() == "openlibrary": + availability = None + reason = None + if isinstance(full_metadata, dict): + availability = full_metadata.get("availability") + reason = full_metadata.get("availability_reason") + msg = "[download-file] OpenLibrary item not downloadable" + if availability or reason: + msg += f" (availability={availability or ''} reason={reason or ''})" + log(msg, file=sys.stderr) + + # Fallback: run a LibGen title search so the user can pick an alternative source. 
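+                    # Rough equivalent of running:
+                    #   search-provider -provider libgen -query "<title>"
+                    # and then promoting its result table as a display overlay so the user can
+                    # @-select an alternative source at the next prompt.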
+ try: + title_text = str(title or "").strip() + if not title_text and isinstance(full_metadata, dict): + title_text = str(full_metadata.get("title") or "").strip() + if title_text: + log(f"[download-file] Not available on OpenLibrary; searching LibGen for: {title_text}", file=sys.stderr) + from cmdlet.search_provider import CMDLET as _SEARCH_PROVIDER_CMDLET + + fallback_query = title_text + exec_fn = getattr(_SEARCH_PROVIDER_CMDLET, "exec", None) + if not callable(exec_fn): + log( + "[download-file] search-provider cmdlet unavailable; cannot run LibGen fallback search", + file=sys.stderr, + ) + continue + + ret = exec_fn( + None, + ["-provider", "libgen", "-query", fallback_query], + config, + ) + + # Promote the search-provider table to a display overlay so it renders. + try: + table_obj = pipeline_context.get_last_result_table() + items_obj = pipeline_context.get_last_result_items() + if table_obj is not None: + pipeline_context.set_last_result_table_overlay(table_obj, items_obj) + except Exception: + pass + + try: + return int(ret) # type: ignore[arg-type] + except Exception: + return 1 + except Exception: + pass + + continue + + # Fallback: if we have a direct HTTP URL, download it directly + if downloaded_path is None and isinstance(target, str) and target.startswith("http"): + # Guard: provider landing pages (e.g. LibGen ads.php) are HTML, not files. + # Never download these as "files". + if str(table or "").lower() == "libgen": + low = target.lower() + if ("/ads.php" in low) or ("/file.php" in low) or ("/index.php" in low): + log( + "[download-file] Refusing to download LibGen landing page (expected provider to resolve file link)", + file=sys.stderr, + ) + continue + debug(f"[download-file] Provider item looks like direct URL, downloading: {target}") + suggested_name = str(title).strip() if title is not None else None + result_obj = _download_direct_file( + target, + final_output_dir, + quiet=quiet_mode, + suggested_filename=suggested_name, + ) + downloaded_path = self._path_from_download_result(result_obj) + + if downloaded_path is None: + log(f"Cannot download item (no provider handler / unsupported target): {title or target}", file=sys.stderr) + continue + + self._emit_local_file( + downloaded_path=downloaded_path, + source=str(target) if target else None, + title_hint=str(title) if title else downloaded_path.stem, + tags_hint=tags_list, + media_kind_hint=str(media_kind) if media_kind else None, + full_metadata=full_metadata if isinstance(full_metadata, dict) else None, + progress=progress, + config=config, + ) + downloaded_count += 1 + + except DownloadError as e: + log(f"Download failed: {e}", file=sys.stderr) + except Exception as e: + log(f"Error downloading item: {e}", file=sys.stderr) + + return downloaded_count + def _run_impl(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: """Main download implementation for direct HTTP files.""" + progress = PipelineProgress(pipeline_context) try: debug("Starting download-file") # Parse arguments parsed = parse_cmdlet_args(args, self) - # Extract explicit URL args (if any) - raw_url = parsed.get("url", []) - if isinstance(raw_url, str): - raw_url = [raw_url] - - # Allow comma-separated URLs in a single argument. 
- # Example: download-file "https://a.pdf,https://b.pdf" - expanded_urls: List[str] = [] - for u in (raw_url or []): - if u is None: - continue - s = str(u).strip() - if not s: - continue - if "," in s: - parts = [p.strip() for p in s.split(",")] - expanded_urls.extend([p for p in parts if p]) - else: - expanded_urls.append(s) - if expanded_urls: - raw_url = expanded_urls - - # If no URL args were provided, fall back to piped results (provider items) - piped_items: List[Any] = [] - if not raw_url: - if isinstance(result, list): - piped_items = result - elif result: - piped_items = [result] + raw_url = self._normalize_urls(parsed) + piped_items = self._collect_piped_items_if_no_urls(result, raw_url) if not raw_url and not piped_items: log("No url or piped items to download", file=sys.stderr) @@ -104,334 +735,38 @@ class Download_File(Cmdlet): debug(f"Output directory: {final_output_dir}") - # Download each URL and/or provider item - downloaded_count = 0 + # If the caller isn't running the shared pipeline Live progress UI (e.g. direct + # cmdlet execution), start a minimal local pipeline progress panel so downloads + # show consistent, Rich-formatted progress (like download-media). + total_items = self._safe_total_items(raw_url, piped_items) + preview = self._build_preview(raw_url, piped_items, total_items) + + progress.ensure_local_ui(label="download-file", total_items=total_items, items_preview=preview) + quiet_mode = bool(config.get("_quiet_background_output")) if isinstance(config, dict) else False + registry = self._load_provider_registry() - # Provider lookup is optional; keep import local to avoid overhead if unused - get_search_provider = None - SearchResult = None - try: - from ProviderCore.registry import get_search_provider as _get_search_provider, SearchResult as _SearchResult + downloaded_count = 0 + urls_downloaded, early_exit = self._process_explicit_urls( + raw_urls=raw_url, + final_output_dir=final_output_dir, + config=config, + quiet_mode=quiet_mode, + registry=registry, + progress=progress, + ) + downloaded_count += int(urls_downloaded) + if early_exit is not None: + return int(early_exit) - get_search_provider = _get_search_provider - SearchResult = _SearchResult - except Exception: - get_search_provider = None - SearchResult = None - - def _emit_local_file(downloaded_path: Path, source: Optional[str], title_hint: Optional[str], tags_hint: Optional[List[str]], media_kind_hint: Optional[str], full_metadata: Optional[Dict[str, Any]], provider_hint: Optional[str] = None) -> None: - title_val = (title_hint or downloaded_path.stem or "Unknown").strip() or downloaded_path.stem - hash_value = self._compute_file_hash(downloaded_path) - tag: List[str] = [] - if tags_hint: - tag.extend([str(t) for t in tags_hint if t]) - if not any(str(t).lower().startswith("title:") for t in tag): - tag.insert(0, f"title:{title_val}") - - payload: Dict[str, Any] = { - "path": str(downloaded_path), - "hash": hash_value, - "title": title_val, - "action": "cmdlet:download-file", - "download_mode": "file", - "store": "local", - "media_kind": media_kind_hint or "file", - "tag": tag, - } - if provider_hint: - payload["provider"] = str(provider_hint) - if full_metadata: - payload["full_metadata"] = full_metadata - if source and str(source).startswith("http"): - payload["url"] = source - elif source: - payload["source_url"] = source - - pipeline_context.emit(payload) - - # Automatically register url with local library - if payload.get("url"): - pipe_obj = coerce_to_pipe_object(payload) - 
register_url_with_local_library(pipe_obj, config) - - # 1) Explicit URL downloads - for url in raw_url: - try: - debug(f"Processing URL: {url}") - - # Telegram message URLs are not direct files; route through the provider. - try: - parsed = urlparse(str(url)) - host = (parsed.hostname or "").lower().strip() - except Exception: - host = "" - - is_telegram = host in {"t.me", "telegram.me"} or host.endswith(".t.me") - if is_telegram and SearchResult: - try: - from ProviderCore.registry import get_provider as _get_provider - except Exception: - _get_provider = None - - if _get_provider is None: - raise DownloadError("Telegram provider registry not available") - - provider = _get_provider("telegram", config) - if provider is None: - raise DownloadError("Telegram provider not configured or not available (check telethon/app_id/api_hash)") - - sr = SearchResult(table="telegram", title=str(url), path=str(url), full_metadata={}) - downloaded_path = None - telegram_info: Optional[Dict[str, Any]] = None - if hasattr(provider, "download_url"): - try: - downloaded_path, telegram_info = provider.download_url(str(url), final_output_dir) # type: ignore[attr-defined] - except Exception as exc: - raise DownloadError(str(exc)) - else: - downloaded_path = provider.download(sr, final_output_dir) - - if not downloaded_path: - raise DownloadError("Telegram download returned no file") - - channel = "" - post = None - if isinstance(telegram_info, dict): - try: - chat_info = telegram_info.get("chat") if isinstance(telegram_info.get("chat"), dict) else {} - msg_info = telegram_info.get("message") if isinstance(telegram_info.get("message"), dict) else {} - channel = str(chat_info.get("title") or chat_info.get("username") or "").strip() - post = msg_info.get("id") - except Exception: - channel = "" - post = None - - title_hint = None - tags_hint: List[str] = [] - if channel: - tags_hint.append(f"channel:{channel}") - if post is not None: - tags_hint.append(f"post:{post}") - if channel and post is not None: - title_hint = f"{channel} {post}" - elif post is not None: - title_hint = f"post:{post}" - else: - title_hint = downloaded_path.stem - - _emit_local_file( - downloaded_path=downloaded_path, - source=str(url), - title_hint=title_hint, - tags_hint=tags_hint, - media_kind_hint="file", - full_metadata=telegram_info, - provider_hint="telegram", - ) - downloaded_count += 1 - debug("✓ Downloaded via Telegram provider and emitted") - continue - - result_obj = _download_direct_file(url, final_output_dir, quiet=quiet_mode) - file_path = None - if hasattr(result_obj, "path"): - file_path = getattr(result_obj, "path") - elif isinstance(result_obj, dict): - file_path = result_obj.get("path") - if not file_path: - file_path = str(result_obj) - downloaded_path = Path(str(file_path)) - - _emit_local_file( - downloaded_path=downloaded_path, - source=url, - title_hint=downloaded_path.stem, - tags_hint=[f"title:{downloaded_path.stem}"], - media_kind_hint="file", - full_metadata=None, - ) - downloaded_count += 1 - debug("✓ Downloaded and emitted") - - except DownloadError as e: - log(f"Download failed for {url}: {e}", file=sys.stderr) - except Exception as e: - log(f"Error processing {url}: {e}", file=sys.stderr) - - # 2) Provider item downloads (piped results) - # Expand provider "folder" rows into their contained files when possible (e.g., AllDebrid magnets). 
- expanded_items: List[Any] = [] - for item in piped_items: - try: - table = get_field(item, "table") - media_kind = get_field(item, "media_kind") - full_metadata = get_field(item, "full_metadata") - target = get_field(item, "path") or get_field(item, "url") - - if str(table or "").lower() == "alldebrid" and str(media_kind or "").lower() == "folder": - magnet_id = None - if isinstance(full_metadata, dict): - magnet_id = full_metadata.get("magnet_id") - if magnet_id is None and isinstance(target, str) and target.lower().startswith("alldebrid:magnet:"): - try: - magnet_id = int(target.split(":")[-1]) - except Exception: - magnet_id = None - - if magnet_id is not None and get_search_provider is not None: - provider = get_search_provider("alldebrid", config) - if provider is not None: - try: - files = provider.search("*", limit=10_000, filters={"view": "files", "magnet_id": int(magnet_id)}) - except Exception: - files = [] - - # If the magnet isn't ready, provider.search returns a single not-ready folder row. - if files and len(files) == 1 and getattr(files[0], "media_kind", "") == "folder": - detail = getattr(files[0], "detail", "") - log(f"[download-file] AllDebrid magnet {magnet_id} not ready ({detail or 'unknown'})", file=sys.stderr) - else: - for sr in files: - expanded_items.append(sr.to_dict() if hasattr(sr, "to_dict") else sr) - continue - - expanded_items.append(item) - except Exception: - expanded_items.append(item) - - for item in expanded_items: - try: - table = get_field(item, "table") - title = get_field(item, "title") - target = get_field(item, "path") or get_field(item, "url") - media_kind = get_field(item, "media_kind") - tags_val = get_field(item, "tag") - tags_list: Optional[List[str]] - if isinstance(tags_val, list): - tags_list = [str(t) for t in tags_val if t] - else: - tags_list = None - - full_metadata = get_field(item, "full_metadata") - if (not full_metadata) and isinstance(item, dict) and isinstance(item.get("extra"), dict): - extra_md = item["extra"].get("full_metadata") - if isinstance(extra_md, dict): - full_metadata = extra_md - - # If this looks like a provider item and providers are available, prefer provider.download() - downloaded_path: Optional[Path] = None - attempted_provider_download = False - if table and get_search_provider and SearchResult: - provider = get_search_provider(str(table), config) - if provider is not None: - attempted_provider_download = True - sr = SearchResult( - table=str(table), - title=str(title or "Unknown"), - path=str(target or ""), - full_metadata=full_metadata if isinstance(full_metadata, dict) else {}, - ) - debug(f"[download-file] Downloading provider item via {table}: {sr.title}") - downloaded_path = provider.download(sr, final_output_dir) - - # OpenLibrary: if provider download failed, do NOT try to download the OpenLibrary page HTML. - if downloaded_path is None and attempted_provider_download and str(table or "").lower() == "openlibrary": - availability = None - reason = None - if isinstance(full_metadata, dict): - availability = full_metadata.get("availability") - reason = full_metadata.get("availability_reason") - msg = "[download-file] OpenLibrary item not downloadable" - if availability or reason: - msg += f" (availability={availability or ''} reason={reason or ''})" - log(msg, file=sys.stderr) - - # Fallback: run a LibGen title search so the user can pick an alternative source. 
- try: - title_text = str(title or "").strip() - if not title_text and isinstance(full_metadata, dict): - title_text = str(full_metadata.get("title") or "").strip() - if title_text: - log(f"[download-file] Not available on OpenLibrary; searching LibGen for: {title_text}", file=sys.stderr) - from cmdlet.search_provider import CMDLET as _SEARCH_PROVIDER_CMDLET - # Use plain title text (LibGen mirrors can be finicky with fielded query prefixes). - fallback_query = title_text - exec_fn = getattr(_SEARCH_PROVIDER_CMDLET, "exec", None) - if not callable(exec_fn): - log("[download-file] search-provider cmdlet unavailable; cannot run LibGen fallback search", file=sys.stderr) - continue - - ret = exec_fn( - None, - ["-provider", "libgen", "-query", fallback_query], - config, - ) - - # download-file is treated as an action command by the pipeline printer. - # Promote the search-provider table to a display overlay so it renders. - try: - table = pipeline_context.get_last_result_table() - items = pipeline_context.get_last_result_items() - if table is not None: - pipeline_context.set_last_result_table_overlay(table, items) - except Exception: - pass - - try: - return int(ret) # type: ignore[arg-type] - except Exception: - return 1 - except Exception: - pass - - continue - - # Fallback: if we have a direct HTTP URL, download it directly - if downloaded_path is None and isinstance(target, str) and target.startswith("http"): - # Guard: provider landing pages (e.g. LibGen ads.php) are HTML, not files. - # Never download these as "files". - if str(table or "").lower() == "libgen": - low = target.lower() - if ("/ads.php" in low) or ("/file.php" in low) or ("/index.php" in low): - log("[download-file] Refusing to download LibGen landing page (expected provider to resolve file link)", file=sys.stderr) - continue - debug(f"[download-file] Provider item looks like direct URL, downloading: {target}") - # Use provider title as filename hint so multiple items don't overwrite as downloaded_file.bin - suggested_name = str(title).strip() if title is not None else None - result_obj = _download_direct_file( - target, - final_output_dir, - quiet=quiet_mode, - suggested_filename=suggested_name, - ) - file_path = None - if hasattr(result_obj, "path"): - file_path = getattr(result_obj, "path") - elif isinstance(result_obj, dict): - file_path = result_obj.get("path") - if not file_path: - file_path = str(result_obj) - downloaded_path = Path(str(file_path)) - - if downloaded_path is None: - log(f"Cannot download item (no provider handler / unsupported target): {title or target}", file=sys.stderr) - continue - - _emit_local_file( - downloaded_path=downloaded_path, - source=str(target) if target else None, - title_hint=str(title) if title else downloaded_path.stem, - tags_hint=tags_list, - media_kind_hint=str(media_kind) if media_kind else None, - full_metadata=full_metadata if isinstance(full_metadata, dict) else None, - ) - downloaded_count += 1 - - except DownloadError as e: - log(f"Download failed: {e}", file=sys.stderr) - except Exception as e: - log(f"Error downloading item: {e}", file=sys.stderr) + downloaded_count += self._process_provider_items( + piped_items=piped_items, + final_output_dir=final_output_dir, + config=config, + quiet_mode=quiet_mode, + registry=registry, + progress=progress, + ) if downloaded_count > 0: debug(f"✓ Successfully processed {downloaded_count} file(s)") @@ -444,6 +779,9 @@ class Download_File(Cmdlet): log(f"Error in download-file: {e}", file=sys.stderr) return 1 + finally: + 
progress.close_local_ui(force_complete=True) + def _resolve_output_dir(self, parsed: Dict[str, Any], config: Dict[str, Any]) -> Optional[Path]: """Resolve the output directory from storage location or config.""" output_dir_arg = parsed.get("path") or parsed.get("output") diff --git a/cmdlet/download_media.py b/cmdlet/download_media.py index b1de2dc..cf824b7 100644 --- a/cmdlet/download_media.py +++ b/cmdlet/download_media.py @@ -22,11 +22,13 @@ import sys import tempfile import time import traceback +from contextlib import AbstractContextManager, nullcontext from pathlib import Path -from typing import Any, Dict, Iterator, List, Optional, Sequence +from typing import Any, Dict, Iterator, List, Optional, Sequence, cast from urllib.parse import urlparse from SYS.logger import log, debug +from SYS.pipeline_progress import PipelineProgress from SYS.utils import sha256_file from models import DownloadError, DownloadOptions, DownloadMediaResult, DebugLogger, ProgressBar import pipeline as pipeline_context @@ -112,6 +114,7 @@ try: from yt_dlp.extractor import gen_extractors # type: ignore except Exception as exc: yt_dlp = None # type: ignore + gen_extractors = None # type: ignore YTDLP_IMPORT_ERROR = exc else: YTDLP_IMPORT_ERROR = None @@ -155,6 +158,8 @@ def _format_chapters_note(info: Dict[str, Any]) -> Optional[str]: title_raw = ch.get("title") or ch.get("name") or ch.get("chapter") try: + if start_raw is None: + continue start_s = int(float(start_raw)) except Exception: continue @@ -218,197 +223,122 @@ def _best_subtitle_sidecar(media_path: Path) -> Optional[Path]: if p.suffix.lower() in _SUBTITLE_EXTS: candidates.append(p) - if not candidates: - return None + # Prefer VTT then SRT then others. + preferred_order = [".vtt", ".srt", ".ass", ".ssa", ".lrc"] + for ext in preferred_order: + for p in candidates: + if p.suffix.lower() == ext: + return p - def _rank(path: Path) -> tuple[int, int, float, str]: - name = path.name.lower() - lang_rank = 0 if ".en." 
in name or name.endswith(".en" + path.suffix.lower()) else 1 - ext = path.suffix.lower() - ext_rank_map = {".vtt": 0, ".srt": 1, ".ass": 2, ".ssa": 3, ".lrc": 4} - ext_rank = ext_rank_map.get(ext, 9) - try: - mtime = float(path.stat().st_mtime) - except Exception: - mtime = 0.0 - return (lang_rank, ext_rank, -mtime, name) - - candidates.sort(key=_rank) - return candidates[0] + return candidates[0] if candidates else None except Exception: return None -def _read_text_file(path: Path, *, max_bytes: int = 1_500_000) -> Optional[str]: +def _read_text_file(path: Path) -> Optional[str]: try: - data = path.read_bytes() + return path.read_text(encoding="utf-8", errors="ignore") except Exception: return None - if not data: - return None - if len(data) > max_bytes: - data = data[:max_bytes] - try: - return data.decode("utf-8", errors="replace") - except Exception: - try: - return data.decode(errors="replace") - except Exception: - return None def _ensure_yt_dlp_ready() -> None: - if yt_dlp is not None: - return - detail = str(YTDLP_IMPORT_ERROR or "yt-dlp is not installed") - raise DownloadError(f"yt-dlp module not available: {detail}") + if YTDLP_IMPORT_ERROR is not None: + raise DownloadError(f"yt-dlp import error: {YTDLP_IMPORT_ERROR}") + if yt_dlp is None: + raise DownloadError("yt-dlp is not available") + + +def _get_extractors() -> List[Any]: + global _EXTRACTOR_CACHE + if _EXTRACTOR_CACHE is not None: + return _EXTRACTOR_CACHE + _ensure_yt_dlp_ready() + assert gen_extractors is not None + try: + _EXTRACTOR_CACHE = list(gen_extractors()) + except Exception: + _EXTRACTOR_CACHE = [] + return _EXTRACTOR_CACHE def is_url_supported_by_ytdlp(url: str) -> bool: - if yt_dlp is None: + if not url or not isinstance(url, str): return False - global _EXTRACTOR_CACHE - if _EXTRACTOR_CACHE is None: - try: - _EXTRACTOR_CACHE = [ie for ie in gen_extractors()] # type: ignore[arg-type] - except Exception: - _EXTRACTOR_CACHE = [] - for extractor in _EXTRACTOR_CACHE: - try: - if not extractor.suitable(url): + if YTDLP_IMPORT_ERROR is not None: + return False + try: + parsed = urlparse(url) + if not parsed.scheme or not parsed.netloc: + return False + except Exception: + return False + try: + for ie in _get_extractors(): + try: + if ie.suitable(url) and ie.IE_NAME != "generic": + return True + except Exception: continue - except Exception: - continue - name = getattr(extractor, "IE_NAME", "") - if name.lower() == "generic": - continue - return True + except Exception: + return False return False def list_formats( url: str, + *, no_playlist: bool = False, playlist_items: Optional[str] = None, cookiefile: Optional[str] = None, ) -> Optional[List[Dict[str, Any]]]: + if not is_url_supported_by_ytdlp(url): + return None _ensure_yt_dlp_ready() - try: - assert yt_dlp is not None - ydl_opts: Dict[str, Any] = {"quiet": True, "no_warnings": True, "socket_timeout": 30} - if no_playlist: - ydl_opts["noplaylist"] = True - if playlist_items: - ydl_opts["playlist_items"] = playlist_items - if cookiefile: - ydl_opts["cookiefile"] = cookiefile + assert yt_dlp is not None - debug(f"Fetching format list for: {url}") + ydl_opts: Dict[str, Any] = { + "quiet": True, + "no_warnings": True, + "skip_download": True, + "noprogress": True, + } + if cookiefile: + ydl_opts["cookiefile"] = str(cookiefile) + if no_playlist: + ydl_opts["noplaylist"] = True + if playlist_items: + ydl_opts["playlist_items"] = str(playlist_items) + + try: with yt_dlp.YoutubeDL(ydl_opts) as ydl: # type: ignore[arg-type] info = ydl.extract_info(url, download=False) - 
- if not isinstance(info, dict): - log("No formats available", file=sys.stderr) - return None - - formats = info.get("formats") or [] - - # Some URLs (notably playlist contexts) yield a playlist-shaped payload with - # `entries` rather than a direct video payload. If so, try to pull formats - # from the first concrete entry. - if (not formats) and isinstance(info.get("entries"), list): - try: - for entry in info.get("entries") or []: - if not isinstance(entry, dict): - continue - entry_formats = entry.get("formats") - if isinstance(entry_formats, list) and entry_formats: - formats = entry_formats - break - except Exception: - pass - - if not isinstance(formats, list) or not formats: - log("No formats available", file=sys.stderr) - return None - - result_formats: List[Dict[str, Any]] = [] - for fmt in formats: - if not isinstance(fmt, dict): - continue - result_formats.append( - { - "format_id": fmt.get("format_id", ""), - "format": fmt.get("format", ""), - "ext": fmt.get("ext", ""), - "resolution": fmt.get("resolution", ""), - "width": fmt.get("width"), - "height": fmt.get("height"), - "fps": fmt.get("fps"), - "vcodec": fmt.get("vcodec", "none"), - "acodec": fmt.get("acodec", "none"), - "filesize": fmt.get("filesize"), - "abr": fmt.get("abr"), - "tbr": fmt.get("tbr"), - } - ) - - debug(f"Found {len(result_formats)} available formats") - return result_formats or None - except Exception as e: - log(f"✗ Error fetching formats: {e}", file=sys.stderr) + except Exception: return None - -def _pick_best_audio_format_id(formats: List[Dict[str, Any]]) -> Optional[str]: - audio_only: List[Dict[str, Any]] = [] - for fmt in formats: - if not isinstance(fmt, dict): - continue - format_id = str(fmt.get("format_id") or "").strip() - if not format_id: - continue - vcodec = str(fmt.get("vcodec") or "none").lower() - acodec = str(fmt.get("acodec") or "none").lower() - if vcodec != "none": - continue - if not acodec or acodec == "none": - continue - audio_only.append(fmt) - - if not audio_only: + if not isinstance(info, dict): return None - - def score(f: Dict[str, Any]) -> tuple[float, float]: - tbr = f.get("tbr") - abr = f.get("abr") - bitrate = 0.0 - for candidate in (tbr, abr): - try: - if candidate is not None: - bitrate = max(bitrate, float(candidate)) - except Exception: - pass - size = 0.0 - try: - fs = f.get("filesize") - if fs is not None: - size = float(fs) - except Exception: - pass - return (bitrate, size) - - best = max(audio_only, key=score) - best_id = str(best.get("format_id") or "").strip() - return best_id or None + formats = info.get("formats") + if not isinstance(formats, list): + return None + out: List[Dict[str, Any]] = [] + for f in formats: + if isinstance(f, dict): + out.append(f) + return out -def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sections: List[str], quiet: bool = False) -> tuple[Optional[str], Dict[str, Any]]: +def _download_with_sections_via_cli( + url: str, + ytdl_options: Dict[str, Any], + sections: List[str], + quiet: bool = False, +) -> tuple[Optional[str], Dict[str, Any]]: sections_list = ytdl_options.get("download_sections", []) if not sections_list: return "", {} - session_id = hashlib.md5((url + str(time.time()) + ''.join(random.choices(string.ascii_letters, k=10))).encode()).hexdigest()[:12] + session_id = hashlib.md5((url + str(time.time()) + "".join(random.choices(string.ascii_letters, k=10))).encode()).hexdigest()[:12] first_section_info = None total_sections = len(sections_list) @@ -528,7 +458,6 @@ def 
_download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sect return session_id, first_section_info or {} - def _iter_download_entries(info: Dict[str, Any]) -> Iterator[Dict[str, Any]]: queue: List[Dict[str, Any]] = [info] seen: set[int] = set() @@ -614,6 +543,7 @@ def _extract_sha256(info: Dict[str, Any]) -> Optional[str]: return None + def _progress_callback(status: Dict[str, Any]) -> None: """Simple progress callback using logger.""" event = status.get("status") @@ -814,7 +744,7 @@ def download_media( if ytdl_options.get("download_sections"): # For clip (download_sections), keep pipeline Live UI active and suppress # yt-dlp/ffmpeg CLI spam when running in quiet/pipeline mode. - live_ui, _ = _live_ui_and_pipe_index() + live_ui, _ = PipelineProgress(pipeline_context).ui_and_pipe_index() quiet_sections = bool(opts.quiet) or (live_ui is not None) session_id, first_section_info = _download_with_sections_via_cli( opts.url, @@ -1045,7 +975,7 @@ def download_media( log(f"Unexpected yt-dlp response: {type(info)}", file=sys.stderr) raise DownloadError("Unexpected yt-dlp response type") - info_dict: Dict[str, Any] = info + info_dict: Dict[str, Any] = cast(Dict[str, Any], info) if debug_logger is not None: debug_logger.write_record( "ytdlp-info", @@ -1240,6 +1170,1297 @@ class Download_Media(Cmdlet): config["_quiet_background_output"] = True return self._run_impl(result, args, config) + @staticmethod + def _normalize_urls(parsed: Dict[str, Any]) -> List[str]: + raw_url = parsed.get("url", []) + if isinstance(raw_url, str): + raw_url = [raw_url] + + expanded_urls: List[str] = [] + for u in (raw_url or []): + if u is None: + continue + s = str(u).strip() + if not s: + continue + if "," in s: + parts = [p.strip() for p in s.split(",")] + expanded_urls.extend([p for p in parts if p]) + else: + expanded_urls.append(s) + return expanded_urls + + @staticmethod + def _append_urls_from_piped_result(raw_urls: List[str], result: Any) -> List[str]: + if raw_urls: + return raw_urls + if not result: + return raw_urls + + results_to_check = result if isinstance(result, list) else [result] + for item in results_to_check: + try: + url = get_field(item, "url") or get_field(item, "target") + except Exception: + url = None + if url: + raw_urls.append(url) + return raw_urls + + @staticmethod + def _filter_supported_urls(raw_urls: Sequence[str]) -> tuple[List[str], List[str]]: + supported = [url for url in (raw_urls or []) if is_url_supported_by_ytdlp(url)] + # Preserve original debug semantics: count unique unsupported URLs. + unsupported = list(set(raw_urls or []) - set(supported or [])) + return supported, unsupported + + def _parse_query_keyed_spec(self, query_spec: Optional[str]) -> Dict[str, List[str]]: + if not query_spec: + return {} + try: + return self._parse_keyed_csv_spec(str(query_spec), default_key="hash") + except Exception: + return {} + + @staticmethod + def _extract_hash_override(query_spec: Optional[str], query_keyed: Dict[str, List[str]]) -> Optional[str]: + try: + hash_values = query_keyed.get("hash", []) if isinstance(query_keyed, dict) else [] + hash_candidate = (hash_values[-1] if hash_values else None) + if hash_candidate: + return sh.parse_single_hash_query(f"hash:{hash_candidate}") + + # Backwards-compatible: treat a non-keyed query as a hash query. 
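+            # This fallback only runs when the keyed spec above did not yield a hash value.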
+ return sh.parse_single_hash_query(str(query_spec)) if query_spec else None + except Exception: + return None + + def _parse_clip_ranges_and_apply_items( + self, + *, + clip_spec: Optional[str], + query_keyed: Dict[str, List[str]], + parsed: Dict[str, Any], + query_spec: Optional[str], + ) -> tuple[Optional[List[tuple[int, int]]], bool, List[str]]: + clip_ranges: Optional[List[tuple[int, int]]] = None + clip_values: List[str] = [] + item_values: List[str] = [] + + if clip_spec: + # Support keyed clip syntax: + # -clip "clip:3m4s-3m14s,1h22m-1h33m,item:2-3" + keyed = self._parse_keyed_csv_spec(str(clip_spec), default_key="clip") + clip_values.extend(keyed.get("clip", []) or []) + item_values.extend(keyed.get("item", []) or []) + + # Allow the same keyed spec language inside -query so users can do: + # download-media -query "clip:1m-1m15s,2m1s-2m11s" + if query_keyed: + clip_values.extend(query_keyed.get("clip", []) or []) + item_values.extend(query_keyed.get("item", []) or []) + + if item_values and not parsed.get("item"): + parsed["item"] = ",".join([v for v in item_values if v]) + + if clip_values: + clip_ranges = self._parse_time_ranges(",".join([v for v in clip_values if v])) + if not clip_ranges: + bad_spec = clip_spec or query_spec + log(f"Invalid clip format: {bad_spec}", file=sys.stderr) + return None, True, clip_values + + return clip_ranges, False, clip_values + + @staticmethod + def _init_storage(config: Dict[str, Any]) -> tuple[Optional[Any], bool]: + storage = None + hydrus_available = True + try: + from Store import Store + storage = Store(config=config or {}, suppress_debug=True) + from API.HydrusNetwork import is_hydrus_available + hydrus_available = bool(is_hydrus_available(config or {})) + except Exception: + storage = None + return storage, hydrus_available + + @staticmethod + def _cookiefile_str(ytdlp_tool: YtDlpTool) -> Optional[str]: + try: + cookie_path = ytdlp_tool.resolve_cookiefile() + if cookie_path is not None and cookie_path.is_file(): + return str(cookie_path) + except Exception: + pass + return None + + def _list_formats_cached( + self, + u: str, + *, + playlist_items_value: Optional[str], + formats_cache: Dict[str, Optional[List[Dict[str, Any]]]], + ytdlp_tool: YtDlpTool, + ) -> Optional[List[Dict[str, Any]]]: + key = f"{u}||{playlist_items_value or ''}" + if key in formats_cache: + return formats_cache[key] + fmts = list_formats( + u, + no_playlist=False, + playlist_items=playlist_items_value, + cookiefile=self._cookiefile_str(ytdlp_tool), + ) + formats_cache[key] = fmts + return fmts + + @staticmethod + def _canonicalize_url_for_storage(*, requested_url: str, ytdlp_tool: YtDlpTool, playlist_items: Optional[str]) -> str: + # Prefer yt-dlp's canonical webpage URL (e.g. strips timestamps/redirects). + # Fall back to the requested URL if probing fails. + # Important: when playlist item selection is used, avoid probing (can hang on large playlists). 
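+        # In that case the requested URL is stored verbatim.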
+ if playlist_items: + return str(requested_url) + try: + cf = None + try: + cookie_path = ytdlp_tool.resolve_cookiefile() + if cookie_path is not None and cookie_path.is_file(): + cf = str(cookie_path) + except Exception: + cf = None + pr = probe_url(requested_url, no_playlist=False, timeout_seconds=15, cookiefile=cf) + if isinstance(pr, dict): + for key in ("webpage_url", "original_url", "url", "requested_url"): + value = pr.get(key) + if isinstance(value, str) and value.strip(): + return value.strip() + except Exception: + pass + return str(requested_url) + + def _preflight_url_duplicate( + self, + *, + storage: Any, + hydrus_available: bool, + final_output_dir: Path, + candidate_url: str, + extra_urls: Optional[Sequence[str]] = None, + ) -> bool: + # NOTE: download-media sets _quiet_background_output=True when running in a pipeline to + # reduce background noise. URL de-dup is interactive and must still run in pipelines. + if storage is None: + debug("Preflight URL check skipped: storage unavailable") + return True + + debug(f"Preflight URL check: candidate={candidate_url}") + + try: + from metadata import normalize_urls + except Exception: + normalize_urls = None # type: ignore[assignment] + + needles: List[str] = [] + if normalize_urls is not None: + for raw in [candidate_url, *(list(extra_urls) if extra_urls else [])]: + try: + needles.extend(normalize_urls(raw)) + except Exception: + continue + # Fallback: always have at least one needle + if not needles: + needles = [str(candidate_url)] + + # Deduplicate needles (preserve order) + seen_needles: List[str] = [] + for needle in needles: + if needle and needle not in seen_needles: + seen_needles.append(needle) + needles = seen_needles + + try: + debug(f"Preflight URL needles: {needles}") + except Exception: + pass + + url_matches: List[Dict[str, Any]] = [] + try: + from Store.HydrusNetwork import HydrusNetwork + + # Avoid searching the temp/download directory backend during dedup. + # We only want to warn about duplicates in real stores. + backend_names_all = storage.list_searchable_backends() + backend_names: List[str] = [] + skipped: List[str] = [] + for backend_name in backend_names_all: + try: + backend = storage[backend_name] + except Exception: + continue + + try: + if str(backend_name).strip().lower() == "temp": + skipped.append(backend_name) + continue + except Exception: + pass + + # Heuristic: if a Folder backend points at the configured temp output dir, skip it. 
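+                # (We only want to warn about duplicates in real stores, not the staging directory.)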
+ try: + backend_location = getattr(backend, "_location", None) + if backend_location and final_output_dir: + backend_path = Path(str(backend_location)).expanduser().resolve() + temp_path = Path(str(final_output_dir)).expanduser().resolve() + if backend_path == temp_path: + skipped.append(backend_name) + continue + except Exception: + pass + + backend_names.append(backend_name) + + try: + if skipped: + debug(f"Preflight backends: {backend_names} (skipped temp: {skipped})") + else: + debug(f"Preflight backends: {backend_names}") + except Exception: + pass + + for backend_name in backend_names: + backend = storage[backend_name] + if isinstance(backend, HydrusNetwork) and not hydrus_available: + continue + + backend_hits: List[Dict[str, Any]] = [] + for needle in needles: + try: + backend_hits = backend.search(f"url:{needle}", limit=25) or [] + if backend_hits: + break + except Exception: + continue + if backend_hits: + url_matches.extend([dict(x) if isinstance(x, dict) else {"title": str(x)} for x in backend_hits]) + + if len(url_matches) >= 25: + url_matches = url_matches[:25] + break + except Exception: + url_matches = [] + + if not url_matches: + debug("Preflight URL check: no matches") + return True + + table = ResultTable(f"URL already exists ({len(url_matches)} match(es))") + results_list: List[Dict[str, Any]] = [] + for item in url_matches: + if "title" not in item: + item["title"] = item.get("name") or item.get("target") or item.get("path") or "Result" + + # Keep the full payload for history/inspection, but display a focused table. + # Use shared extractors so Ext/Size/Store/Hash remain consistent everywhere. + try: + from result_table import build_display_row + except Exception: + build_display_row = None # type: ignore + + if callable(build_display_row): + display_row = build_display_row(item, keys=["title", "store", "hash", "ext", "size"]) + else: + display_row = { + "title": item.get("title"), + "store": item.get("store"), + "hash": item.get("hash") or item.get("file_hash") or item.get("sha256"), + "ext": str(item.get("ext") or ""), + "size": item.get("size") or item.get("size_bytes"), + } + table.add_result(display_row) + results_list.append(item) + + pipeline_context.set_current_stage_table(table) + pipeline_context.set_last_result_table(table, results_list) + + suspend = getattr(pipeline_context, "suspend_live_progress", None) + used_suspend = False + + cm: AbstractContextManager[Any] = nullcontext() + if callable(suspend): + try: + maybe_cm = suspend() + if maybe_cm is not None: + cm = maybe_cm # type: ignore[assignment] + used_suspend = True + except Exception: + cm = nullcontext() + used_suspend = False + + with cm: + get_stderr_console().print(table) + setattr(table, "_rendered_by_cmdlet", True) + if not Confirm.ask("Continue anyway?", default=False, console=get_stderr_console()): + if used_suspend: + try: + pipeline_context.request_pipeline_stop(reason="duplicate-url declined", exit_code=0) + except Exception: + pass + return False + return True + + def _preflight_url_duplicates_bulk( + self, + *, + storage: Any, + hydrus_available: bool, + final_output_dir: Path, + urls: Sequence[str], + ) -> bool: + """Preflight URL de-dup for a batch of URLs. + + Purpose: + - Avoid per-item interactive URL checks inside a playlist loop. + - Let the user see ALL duplicates up front, before any downloads start. 
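+
+        Returns True to proceed with the downloads, False when the user declines at the prompt.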
+ """ + if storage is None: + debug("Bulk URL preflight skipped: storage unavailable") + return True + + unique_urls: List[str] = [] + for u in urls or []: + s = str(u or "").strip() + if s and s not in unique_urls: + unique_urls.append(s) + if len(unique_urls) <= 1: + return True + + try: + from metadata import normalize_urls + except Exception: + normalize_urls = None # type: ignore[assignment] + + def _httpish(value: str) -> bool: + try: + return bool(value) and (value.startswith("http://") or value.startswith("https://")) + except Exception: + return False + + url_needles: Dict[str, List[str]] = {} + for u in unique_urls: + needles: List[str] = [] + if normalize_urls is not None: + try: + needles.extend([n for n in (normalize_urls(u) or []) if isinstance(n, str)]) + except Exception: + needles = [] + if not needles: + needles = [u] + # Prefer http(s) needles for store lookups. + filtered: List[str] = [] + for n in needles: + n2 = str(n or "").strip() + if not n2: + continue + if not _httpish(n2): + continue + if n2 not in filtered: + filtered.append(n2) + url_needles[u] = filtered if filtered else [u] + + # Determine backends once (same filtering as per-URL preflight). + backend_names: List[str] = [] + try: + backend_names_all = storage.list_searchable_backends() + except Exception: + backend_names_all = [] + + for backend_name in backend_names_all: + try: + backend = storage[backend_name] + except Exception: + continue + + try: + if str(backend_name).strip().lower() == "temp": + continue + except Exception: + pass + + try: + backend_location = getattr(backend, "_location", None) + if backend_location and final_output_dir: + backend_path = Path(str(backend_location)).expanduser().resolve() + temp_path = Path(str(final_output_dir)).expanduser().resolve() + if backend_path == temp_path: + continue + except Exception: + pass + + backend_names.append(backend_name) + + if not backend_names: + debug("Bulk URL preflight skipped: no searchable backends") + return True + + # Collect matches as display rows (cap to keep output reasonable) + seen_pairs: set[tuple[str, str]] = set() + matched_urls: set[str] = set() + match_rows: List[Dict[str, Any]] = [] + max_rows = 200 + + try: + from Store.HydrusNetwork import HydrusNetwork + except Exception: + HydrusNetwork = None # type: ignore + + for backend_name in backend_names: + if len(match_rows) >= max_rows: + break + try: + backend = storage[backend_name] + except Exception: + continue + + if HydrusNetwork is not None and isinstance(backend, HydrusNetwork): + if not hydrus_available: + continue + + client = getattr(backend, "_client", None) + if client is None: + continue + + for original_url, needles in url_needles.items(): + if len(match_rows) >= max_rows: + break + if (original_url, str(backend_name)) in seen_pairs: + continue + + # Fast-path: ask Hydrus whether it already knows this URL. 
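+                    # Any returned file id or hash counts as a hit; the first hash (if any) is shown in the table.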
+ found_hash: Optional[str] = None + found = False + for needle in (needles or [])[:3]: + if not _httpish(needle): + continue + try: + from API.HydrusNetwork import HydrusRequestSpec + + spec = HydrusRequestSpec( + method="GET", + endpoint="/add_urls/get_url_files", + query={"url": needle}, + ) + response = client._perform_request(spec) # type: ignore[attr-defined] + raw_hashes = None + if isinstance(response, dict): + raw_hashes = response.get("hashes") or response.get("file_hashes") + raw_ids = response.get("file_ids") + has_ids = isinstance(raw_ids, list) and len(raw_ids) > 0 + has_hashes = isinstance(raw_hashes, list) and len(raw_hashes) > 0 + if has_hashes: + try: + found_hash = str(raw_hashes[0]).strip() # type: ignore[index] + except Exception: + found_hash = None + if has_ids or has_hashes: + found = True + break + except Exception: + continue + + if not found: + continue + + seen_pairs.add((original_url, str(backend_name))) + matched_urls.add(original_url) + display_row = { + "title": "(exists)", + "store": str(backend_name), + "hash": found_hash or "", + "url": original_url, + "columns": [ + ("Title", "(exists)"), + ("Store", str(backend_name)), + ("Hash", found_hash or ""), + ("URL", original_url), + ], + } + match_rows.append(display_row) + continue + + # Generic backends: use the existing search() contract. + for original_url, needles in url_needles.items(): + if len(match_rows) >= max_rows: + break + if (original_url, str(backend_name)) in seen_pairs: + continue + + backend_hits: List[Dict[str, Any]] = [] + for needle in (needles or [])[:3]: + try: + backend_hits = backend.search(f"url:{needle}", limit=1) or [] + if backend_hits: + break + except Exception: + continue + + if not backend_hits: + continue + + seen_pairs.add((original_url, str(backend_name))) + matched_urls.add(original_url) + hit = backend_hits[0] + title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)" + file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or "" + + try: + from result_table import build_display_row + except Exception: + build_display_row = None # type: ignore + + extracted = { + "title": str(title), + "store": str(hit.get("store") or backend_name), + "hash": str(file_hash or ""), + "ext": "", + "size": None, + } + if callable(build_display_row): + try: + extracted = build_display_row(hit, keys=["title", "store", "hash", "ext", "size"]) + except Exception: + pass + # Ensure we still prefer the precomputed values for title/store/hash. + extracted["title"] = str(title) + extracted["store"] = str(hit.get("store") or backend_name) + extracted["hash"] = str(file_hash or "") + + ext = extracted.get("ext") + size_val = extracted.get("size") + + display_row = { + "title": str(title), + "store": str(hit.get("store") or backend_name), + "hash": str(file_hash or ""), + "ext": str(ext or ""), + "size": size_val, + "url": original_url, + "columns": [ + ("Title", str(title)), + ("Store", str(hit.get("store") or backend_name)), + ("Hash", str(file_hash or "")), + ("Ext", str(ext or "")), + ("Size", size_val), + ("URL", original_url), + ], + } + match_rows.append(display_row) + + if not match_rows: + debug("Bulk URL preflight: no matches") + return True + + # This table is non-interactive and intentionally wide (we want URL + ext/size). 
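+        # Rows here are informational only and cannot be picked with @N.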
+ table = ResultTable(f"URL already exists ({len(matched_urls)} url(s))", max_columns=10) + table.set_no_choice(True) + try: + table.set_preserve_order(True) + except Exception: + pass + + for row in match_rows: + table.add_result(row) + + # Display as an overlay so we don't clobber the current selectable table/history. + try: + pipeline_context.set_last_result_table_overlay(table, match_rows) + except Exception: + pass + + get_stderr_console().print(table) + setattr(table, "_rendered_by_cmdlet", True) + + if not Confirm.ask("Continue anyway?", default=False, console=get_stderr_console()): + return False + return True + + def _maybe_show_playlist_table(self, *, url: str, ytdlp_tool: YtDlpTool) -> bool: + """Show a normal selectable playlist table when URL yields multiple entries.""" + try: + cf = self._cookiefile_str(ytdlp_tool) + pr = probe_url(url, no_playlist=False, timeout_seconds=15, cookiefile=cf) + except Exception: + pr = None + if not isinstance(pr, dict): + return False + entries = pr.get("entries") + if not isinstance(entries, list) or len(entries) <= 1: + return False + + # Display table (limit rows to keep output reasonable) + max_rows = 200 + display_entries = entries[:max_rows] + + def _entry_to_url(entry: Any) -> Optional[str]: + if not isinstance(entry, dict): + return None + # Prefer explicit absolute URLs when present + for key in ("webpage_url", "original_url", "url"): + v = entry.get(key) + if isinstance(v, str) and v.strip(): + s = v.strip() + try: + if urlparse(s).scheme in {"http", "https"}: + return s + except Exception: + return s + + # Best-effort YouTube fallback from id + entry_id = entry.get("id") + if isinstance(entry_id, str) and entry_id.strip(): + extractor_name = str(pr.get("extractor") or pr.get("extractor_key") or "").lower() + if "youtube" in extractor_name: + return f"https://www.youtube.com/watch?v={entry_id.strip()}" + return None + + table = ResultTable() + safe_url = str(url or "").strip() + table.title = f'download-media -url "{safe_url}"' if safe_url else "download-media" + table.set_source_command("download-media", []) + try: + table.set_preserve_order(True) + except Exception: + pass + + results_list: List[Dict[str, Any]] = [] + for idx, entry in enumerate(display_entries, 1): + title = None + uploader = None + duration = None + entry_url = _entry_to_url(entry) + try: + if isinstance(entry, dict): + title = entry.get("title") + uploader = entry.get("uploader") or pr.get("uploader") + duration = entry.get("duration") + except Exception: + pass + + row: Dict[str, Any] = { + "table": "download-media", + "title": str(title or f"Item {idx}"), + "detail": str(uploader or ""), + "media_kind": "playlist-item", + "playlist_index": idx, + "_selection_args": (["-url", str(entry_url)] if entry_url else ["-url", str(url), "-item", str(idx)]), + "url": entry_url, + "target": entry_url, + "columns": [ + ("#", str(idx)), + ("Title", str(title or "")), + ("Duration", str(duration or "")), + ("Uploader", str(uploader or "")), + ], + } + results_list.append(row) + table.add_result(row) + + pipeline_context.set_current_stage_table(table) + pipeline_context.set_last_result_table(table, results_list) + + get_stderr_console().print(table) + setattr(table, "_rendered_by_cmdlet", True) + return True + + def _maybe_show_format_table_for_single_url( + self, + *, + mode: str, + clip_spec: Any, + clip_values: Sequence[str], + playlist_items: Optional[str], + ytdl_format: Any, + supported_url: Sequence[str], + playlist_selection_handled: bool, + ytdlp_tool: YtDlpTool, + 
formats_cache: Dict[str, Optional[List[Dict[str, Any]]]], + storage: Any, + hydrus_available: bool, + final_output_dir: Path, + args: Sequence[str], + ) -> Optional[int]: + # If no -item, no explicit -format specified, and single URL, show the format table. + # Do NOT stop to show formats when -audio is used (auto-pick) or when -clip is used. + if ( + mode != "audio" + and not clip_spec + and not playlist_items + and not ytdl_format + and len(supported_url) == 1 + and not playlist_selection_handled + ): + url = supported_url[0] + + canonical_url = self._canonicalize_url_for_storage( + requested_url=url, + ytdlp_tool=ytdlp_tool, + playlist_items=playlist_items, + ) + if not self._preflight_url_duplicate( + storage=storage, + hydrus_available=hydrus_available, + final_output_dir=final_output_dir, + candidate_url=canonical_url, + extra_urls=[url], + ): + log(f"Skipping download: {url}", file=sys.stderr) + return 0 + + formats = self._list_formats_cached( + url, + playlist_items_value=None, + formats_cache=formats_cache, + ytdlp_tool=ytdlp_tool, + ) + + if formats and len(formats) > 1: + # Formatlist filtering + # + # Goal: + # - Keep the list useful (hide non-media entries like storyboards) + # - But NEVER filter down so far that the user can't browse/pick formats. + def _is_browseable_format(fmt: Any) -> bool: + if not isinstance(fmt, dict): + return False + format_id = str(fmt.get("format_id") or "").strip() + if not format_id: + return False + ext = str(fmt.get("ext") or "").strip().lower() + if ext in {"mhtml", "json"}: + return False + note = str(fmt.get("format_note") or "").lower() + if "storyboard" in note: + return False + if format_id.lower().startswith("sb"): + return False + vcodec = str(fmt.get("vcodec", "none")) + acodec = str(fmt.get("acodec", "none")) + # Keep anything with at least one stream. + return not (vcodec == "none" and acodec == "none") + + candidate_formats = [f for f in formats if _is_browseable_format(f)] + filtered_formats = candidate_formats if candidate_formats else list(formats) + + debug(f"Formatlist: showing {len(filtered_formats)} formats (raw={len(formats)})") + + # Build the base command that will be replayed with @N selection + base_cmd = f'download-media "{url}"' + remaining_args = [arg for arg in args if arg not in [url] and not arg.startswith('-')] + if remaining_args: + base_cmd += ' ' + ' '.join(remaining_args) + + # Create result table for display + table = ResultTable(title=f"Available formats for {url}", max_columns=10, preserve_order=True) + table.set_table("ytdlp.formatlist") + table.set_source_command("download-media", [url]) + + results_list: List[Dict[str, Any]] = [] + for idx, fmt in enumerate(filtered_formats, 1): + resolution = fmt.get("resolution", "") + ext = fmt.get("ext", "") + vcodec = fmt.get("vcodec", "none") + acodec = fmt.get("acodec", "none") + filesize = fmt.get("filesize") + filesize_approx = fmt.get("filesize_approx") + format_id = fmt.get("format_id", "") + + # If the chosen format is video-only (no audio stream), automatically + # request best audio too so the resulting file has sound. 
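+                    # e.g. a video-only pick "<id>" is rewritten to "<id>+ba" below.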
+ selection_format_id = format_id + try: + if vcodec != "none" and acodec == "none" and format_id: + selection_format_id = f"{format_id}+ba" + except Exception: + selection_format_id = format_id + + size_str = "" + size_prefix = "" + size_bytes = filesize + if not size_bytes: + size_bytes = filesize_approx + if size_bytes: + size_prefix = "~" + try: + if isinstance(size_bytes, (int, float)) and size_bytes > 0: + size_mb = float(size_bytes) / (1024 * 1024) + size_str = f"{size_prefix}{size_mb:.1f}MB" + except Exception: + size_str = "" + + desc_parts: List[str] = [] + if resolution and resolution != "audio only": + desc_parts.append(resolution) + if ext: + desc_parts.append(str(ext).upper()) + if vcodec != "none": + desc_parts.append(f"v:{vcodec}") + if acodec != "none": + desc_parts.append(f"a:{acodec}") + if size_str: + desc_parts.append(size_str) + format_desc = " | ".join(desc_parts) + + format_dict = { + "table": "download-media", + "title": f"Format {format_id}", + "url": url, + "target": url, + "detail": format_desc, + "annotations": [ext, resolution] if resolution else [ext], + "media_kind": "format", + "cmd": base_cmd, + "columns": [ + ("ID", format_id), + ("Resolution", resolution or "N/A"), + ("Ext", ext), + ("Size", size_str or ""), + ("Video", vcodec), + ("Audio", acodec), + ], + "full_metadata": { + "format_id": format_id, + "url": url, + "item_selector": selection_format_id, + }, + "_selection_args": None, + } + + selection_args: List[str] = ["-format", selection_format_id] + try: + if (not clip_spec) and clip_values: + selection_args.extend(["-clip", ",".join([v for v in clip_values if v])]) + except Exception: + pass + format_dict["_selection_args"] = selection_args + + results_list.append(format_dict) + table.add_result(format_dict) + + try: + get_stderr_console().print(table) + setattr(table, "_rendered_by_cmdlet", True) + except Exception: + pass + + pipeline_context.set_current_stage_table(table) + pipeline_context.set_last_result_table(table, results_list) + + log(f"", file=sys.stderr) + return 0 + + return None + + def _download_supported_urls( + self, + *, + supported_url: Sequence[str], + ytdlp_tool: YtDlpTool, + args: Sequence[str], + config: Dict[str, Any], + final_output_dir: Path, + mode: str, + clip_spec: Any, + clip_ranges: Optional[List[tuple[int, int]]], + query_hash_override: Optional[str], + embed_chapters: bool, + write_sub: bool, + quiet_mode: bool, + playlist_items: Optional[str], + ytdl_format: Any, + skip_per_url_preflight: bool, + forced_single_format_id: Optional[str], + forced_single_format_for_batch: bool, + formats_cache: Dict[str, Optional[List[Dict[str, Any]]]], + storage: Any, + hydrus_available: bool, + ) -> int: + downloaded_count = 0 + downloaded_pipe_objects: List[Dict[str, Any]] = [] + pipe_seq = 0 + clip_sections_spec = self._build_clip_sections_spec(clip_ranges) + + if clip_sections_spec: + try: + debug(f"Clip sections spec: {clip_sections_spec}") + except Exception: + pass + + for url in supported_url: + try: + debug(f"Processing: {url}") + + canonical_url = self._canonicalize_url_for_storage( + requested_url=url, + ytdlp_tool=ytdlp_tool, + playlist_items=playlist_items, + ) + + if not skip_per_url_preflight: + if not self._preflight_url_duplicate( + storage=storage, + hydrus_available=hydrus_available, + final_output_dir=final_output_dir, + candidate_url=canonical_url, + extra_urls=[url], + ): + log(f"Skipping download: {url}", file=sys.stderr) + continue + + PipelineProgress(pipeline_context).begin_steps(2) + + actual_format = 
ytdl_format + actual_playlist_items = playlist_items + + if playlist_items and not ytdl_format: + import re + + if re.search(r"[^0-9,-]", playlist_items): + actual_format = playlist_items + actual_playlist_items = None + + if mode == "audio" and not actual_format: + actual_format = "bestaudio" + + if mode == "video" and not actual_format: + configured = (ytdlp_tool.default_format("video") or "").strip() + if configured and configured != "bestvideo+bestaudio/best": + actual_format = configured + + forced_single_applied = False + if forced_single_format_for_batch and forced_single_format_id and not ytdl_format and not actual_playlist_items: + actual_format = forced_single_format_id + forced_single_applied = True + + if ( + actual_format + and isinstance(actual_format, str) + and mode != "audio" + and "+" not in actual_format + and "/" not in actual_format + and "[" not in actual_format + and actual_format not in {"best", "bv", "ba", "b"} + and not forced_single_applied + ): + try: + formats = self._list_formats_cached( + url, + playlist_items_value=actual_playlist_items, + formats_cache=formats_cache, + ytdlp_tool=ytdlp_tool, + ) + if formats: + fmt_match = next( + (f for f in formats if str(f.get("format_id", "")) == actual_format), + None, + ) + if fmt_match: + vcodec = str(fmt_match.get("vcodec", "none")) + acodec = str(fmt_match.get("acodec", "none")) + if vcodec != "none" and acodec == "none": + debug(f"Selected video-only format {actual_format}; using {actual_format}+ba for audio") + actual_format = f"{actual_format}+ba" + except Exception: + pass + + attempted_single_format_fallback = False + while True: + try: + opts = DownloadOptions( + url=url, + mode=mode, + output_dir=final_output_dir, + ytdl_format=actual_format, + cookies_path=ytdlp_tool.resolve_cookiefile(), + clip_sections=clip_sections_spec, + playlist_items=actual_playlist_items, + quiet=quiet_mode, + no_playlist=False, + embed_chapters=embed_chapters, + write_sub=write_sub, + ) + + PipelineProgress(pipeline_context).step("downloading") + debug(f"Starting download with 5-minute timeout...") + result_obj = _download_with_timeout(opts, timeout_seconds=300) + debug(f"Download completed, building pipe object...") + break + except DownloadError as e: + cause = getattr(e, "__cause__", None) + detail = "" + try: + detail = str(cause or "") + except Exception: + detail = "" + + if ("requested format is not available" in (detail or "").lower()) and mode != "audio": + if ( + forced_single_format_for_batch + and forced_single_format_id + and not ytdl_format + and not actual_playlist_items + and not attempted_single_format_fallback + ): + attempted_single_format_fallback = True + actual_format = forced_single_format_id + debug(f"Only one format available (playlist preflight); retrying with: {actual_format}") + continue + + formats = self._list_formats_cached( + url, + playlist_items_value=actual_playlist_items, + formats_cache=formats_cache, + ytdlp_tool=ytdlp_tool, + ) + if ( + (not attempted_single_format_fallback) + and isinstance(formats, list) + and len(formats) == 1 + and isinstance(formats[0], dict) + ): + only = formats[0] + fallback_format = str(only.get("format_id") or "").strip() + selection_format_id = fallback_format + try: + vcodec = str(only.get("vcodec", "none")) + acodec = str(only.get("acodec", "none")) + if vcodec != "none" and acodec == "none" and fallback_format: + selection_format_id = f"{fallback_format}+ba" + except Exception: + selection_format_id = fallback_format + + if selection_format_id: + 
attempted_single_format_fallback = True + actual_format = selection_format_id + debug(f"Only one format available; retrying with: {actual_format}") + continue + + if formats: + formats_to_show = formats + + table = ResultTable(title=f"Available formats for {url}", max_columns=10, preserve_order=True) + table.set_table("ytdlp.formatlist") + table.set_source_command("download-media", [url]) + + results_list: List[Dict[str, Any]] = [] + for idx, fmt in enumerate(formats_to_show, 1): + resolution = fmt.get("resolution", "") + ext = fmt.get("ext", "") + vcodec = fmt.get("vcodec", "none") + acodec = fmt.get("acodec", "none") + filesize = fmt.get("filesize") + filesize_approx = fmt.get("filesize_approx") + format_id = fmt.get("format_id", "") + + selection_format_id = format_id + try: + if vcodec != "none" and acodec == "none" and format_id: + selection_format_id = f"{format_id}+ba" + except Exception: + selection_format_id = format_id + + size_str = "" + size_prefix = "" + size_bytes = filesize + if not size_bytes: + size_bytes = filesize_approx + if size_bytes: + size_prefix = "~" + try: + if isinstance(size_bytes, (int, float)) and size_bytes > 0: + size_mb = float(size_bytes) / (1024 * 1024) + size_str = f"{size_prefix}{size_mb:.1f}MB" + except Exception: + size_str = "" + + desc_parts: List[str] = [] + if resolution and resolution != "audio only": + desc_parts.append(str(resolution)) + if ext: + desc_parts.append(str(ext).upper()) + if vcodec != "none": + desc_parts.append(f"v:{vcodec}") + if acodec != "none": + desc_parts.append(f"a:{acodec}") + if size_str: + desc_parts.append(size_str) + format_desc = " | ".join(desc_parts) + + format_dict: Dict[str, Any] = { + "table": "download-media", + "title": f"Format {format_id}", + "url": url, + "target": url, + "detail": format_desc, + "media_kind": "format", + "columns": [ + ("ID", format_id), + ("Resolution", resolution or "N/A"), + ("Ext", ext), + ("Size", size_str or ""), + ("Video", vcodec), + ("Audio", acodec), + ], + "full_metadata": { + "format_id": format_id, + "url": url, + "item_selector": selection_format_id, + }, + "_selection_args": ["-format", selection_format_id], + } + + results_list.append(format_dict) + table.add_result(format_dict) + + pipeline_context.set_current_stage_table(table) + pipeline_context.set_last_result_table(table, results_list) + + try: + get_stderr_console().print(table) + setattr(table, "_rendered_by_cmdlet", True) + except Exception: + pass + + PipelineProgress(pipeline_context).step("awaiting selection") + + log("Requested format is not available; select a working format with @N", file=sys.stderr) + return 0 + + raise + + results_to_emit: List[Any] = [] + if isinstance(result_obj, list): + results_to_emit = list(result_obj) + else: + paths = getattr(result_obj, "paths", None) + if isinstance(paths, list) and paths: + for p in paths: + try: + p_path = Path(p) + except Exception: + continue + try: + if p_path.suffix.lower() in _SUBTITLE_EXTS: + continue + except Exception: + pass + if not p_path.exists() or p_path.is_dir(): + continue + try: + hv = sha256_file(p_path) + except Exception: + hv = None + results_to_emit.append( + DownloadMediaResult( + path=p_path, + info=getattr(result_obj, "info", {}) or {}, + tag=list(getattr(result_obj, "tag", []) or []), + source_url=getattr(result_obj, "source_url", None) or opts.url, + hash_value=hv, + ) + ) + else: + results_to_emit = [result_obj] + + pipe_objects: List[Dict[str, Any]] = [] + for downloaded in results_to_emit: + po = self._build_pipe_object(downloaded, 
url, opts) + pipe_seq += 1 + try: + po.setdefault("pipe_index", pipe_seq) + except Exception: + pass + + try: + info = downloaded.info if isinstance(getattr(downloaded, "info", None), dict) else {} + except Exception: + info = {} + chapters_text = _format_chapters_note(info) if embed_chapters else None + if chapters_text: + notes = po.get("notes") + if not isinstance(notes, dict): + notes = {} + notes.setdefault("chapters", chapters_text) + po["notes"] = notes + + if write_sub: + try: + media_path = Path(str(po.get("path") or "")) + except Exception: + media_path = None + + if media_path is not None and media_path.exists() and media_path.is_file(): + sub_path = _best_subtitle_sidecar(media_path) + if sub_path is not None: + sub_text = _read_text_file(sub_path) + if sub_text: + notes = po.get("notes") + if not isinstance(notes, dict): + notes = {} + notes["sub"] = sub_text + po["notes"] = notes + try: + sub_path.unlink() + except Exception: + pass + + pipe_objects.append(po) + + try: + if clip_ranges and len(pipe_objects) == len(clip_ranges): + source_hash = query_hash_override or self._find_existing_hash_for_url( + storage, + canonical_url, + hydrus_available=hydrus_available, + ) + self._apply_clip_decorations(pipe_objects, clip_ranges, source_king_hash=source_hash) + except Exception: + pass + + debug(f"Emitting {len(pipe_objects)} result(s) to pipeline...") + + PipelineProgress(pipeline_context).step("finalized") + + stage_ctx = pipeline_context.get_stage_context() + emit_enabled = bool(stage_ctx is not None and not getattr(stage_ctx, "is_last_stage", False)) + for pipe_obj_dict in pipe_objects: + if emit_enabled: + pipeline_context.emit(pipe_obj_dict) + + if pipe_obj_dict.get("url"): + pipe_obj = coerce_to_pipe_object(pipe_obj_dict) + register_url_with_local_library(pipe_obj, config) + + try: + downloaded_pipe_objects.append(pipe_obj_dict) + except Exception: + pass + + downloaded_count += len(pipe_objects) + debug("✓ Downloaded and emitted") + + except DownloadError as e: + log(f"Download failed for {url}: {e}", file=sys.stderr) + except Exception as e: + log(f"Error processing {url}: {e}", file=sys.stderr) + + if downloaded_count > 0: + debug(f"✓ Successfully processed {downloaded_count} URL(s)") + return 0 + + log("No downloads completed", file=sys.stderr) + return 1 + def _run_impl(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: """Main download implementation for yt-dlp-supported url.""" try: @@ -1250,52 +2471,17 @@ class Download_Media(Cmdlet): # Parse arguments parsed = parse_cmdlet_args(args, self) - # Extract options - raw_url = parsed.get("url", []) - if isinstance(raw_url, str): - raw_url = [raw_url] + raw_url = self._normalize_urls(parsed) + raw_url = self._append_urls_from_piped_result(raw_url, result) - # Allow a single quoted argument containing multiple URLs separated by commas. 
- # Example: download-media "https://a,https://b" -audio - expanded_urls: List[str] = [] - for u in (raw_url or []): - if u is None: - continue - s = str(u).strip() - if not s: - continue - if "," in s: - parts = [p.strip() for p in s.split(",")] - expanded_urls.extend([p for p in parts if p]) - else: - expanded_urls.append(s) - if expanded_urls: - raw_url = expanded_urls - - # If no url provided via args, try to extract from piped result - if not raw_url and result: - # Handle single result or list of results - results_to_check = result if isinstance(result, list) else [result] - for item in results_to_check: - # Try to get URL from various possible fields - url = get_field(item, "url") or get_field(item, "target") - if url: - raw_url.append(url) - - # Filter to yt-dlp supported url only - supported_url = [ - url for url in raw_url - if is_url_supported_by_ytdlp(url) - ] + supported_url, unsupported_list = self._filter_supported_urls(raw_url) if not supported_url: log("No yt-dlp-supported url to download", file=sys.stderr) return 1 - # Log unsupported url if any - unsupported = set(raw_url) - set(supported_url) - if unsupported: - debug(f"Skipping {len(unsupported)} unsupported url (use download-file for direct downloads)") + if unsupported_list: + debug(f"Skipping {len(unsupported_list)} unsupported url (use download-file for direct downloads)") # Get output directory final_output_dir = self._resolve_output_dir(parsed, config) @@ -1313,27 +2499,14 @@ class Download_Media(Cmdlet): # -query "hash:" # -query "clip:1m-1m15s,2m1s-2m11s" # -query "hash:,clip:1m-1m15s,item:2-3" - query_keyed: Dict[str, List[str]] = {} - if query_spec: - try: - query_keyed = self._parse_keyed_csv_spec(str(query_spec), default_key="hash") - except Exception: - query_keyed = {} + query_keyed = self._parse_query_keyed_spec(str(query_spec) if query_spec is not None else None) # Optional: allow an explicit hash via -query "hash:". # This is used as the preferred king hash for multi-clip relationships. - query_hash_override: Optional[str] = None - try: - hash_values = query_keyed.get("hash", []) if isinstance(query_keyed, dict) else [] - hash_candidate = (hash_values[-1] if hash_values else None) - if hash_candidate: - # Re-wrap for the shared parser which expects the `hash:` prefix. - query_hash_override = sh.parse_single_hash_query(f"hash:{hash_candidate}") - else: - # Backwards-compatible: treat a non-keyed query as a hash query. - query_hash_override = sh.parse_single_hash_query(str(query_spec)) if query_spec else None - except Exception: - query_hash_override = None + query_hash_override = self._extract_hash_override( + str(query_spec) if query_spec is not None else None, + query_keyed, + ) # Always enable chapters + subtitles so downstream pipes (e.g. mpv) can consume them. 
embed_chapters = True @@ -1341,33 +2514,14 @@ class Download_Media(Cmdlet): mode = "audio" if parsed.get("audio") else "video" - # Parse clip range(s) if specified - clip_ranges: Optional[List[tuple[int, int]]] = None - clip_values: List[str] = [] - item_values: List[str] = [] - - if clip_spec: - # Support keyed clip syntax: - # -clip "clip:3m4s-3m14s,1h22m-1h33m,item:2-3" - keyed = self._parse_keyed_csv_spec(str(clip_spec), default_key="clip") - clip_values.extend(keyed.get("clip", []) or []) - item_values.extend(keyed.get("item", []) or []) - - # Allow the same keyed spec language inside -query so users can do: - # download-media -query "clip:1m-1m15s,2m1s-2m11s" - if query_keyed: - clip_values.extend(query_keyed.get("clip", []) or []) - item_values.extend(query_keyed.get("item", []) or []) - - if item_values and not parsed.get("item"): - parsed["item"] = ",".join([v for v in item_values if v]) - - if clip_values: - clip_ranges = self._parse_time_ranges(",".join([v for v in clip_values if v])) - if not clip_ranges: - bad_spec = clip_spec or query_spec - log(f"Invalid clip format: {bad_spec}", file=sys.stderr) - return 1 + clip_ranges, clip_invalid, clip_values = self._parse_clip_ranges_and_apply_items( + clip_spec=str(clip_spec) if clip_spec is not None else None, + query_keyed=query_keyed, + parsed=parsed, + query_spec=str(query_spec) if query_spec is not None else None, + ) + if clip_invalid: + return 1 if clip_ranges: try: @@ -1377,641 +2531,18 @@ class Download_Media(Cmdlet): quiet_mode = bool(config.get("_quiet_background_output")) if isinstance(config, dict) else False - storage = None - hydrus_available = True - try: - from Store import Store - storage = Store(config=config or {}, suppress_debug=True) - from API.HydrusNetwork import is_hydrus_available - hydrus_available = bool(is_hydrus_available(config or {})) - except Exception: - storage = None - - def _preflight_url_duplicate(candidate_url: str, extra_urls: Optional[Sequence[str]] = None) -> bool: - # NOTE: download-media sets _quiet_background_output=True when running in a pipeline to - # reduce background noise. URL de-dup is interactive and must still run in pipelines. - if storage is None: - debug("Preflight URL check skipped: storage unavailable") - return True - - debug(f"Preflight URL check: candidate={candidate_url}") - - try: - from metadata import normalize_urls - except Exception: - normalize_urls = None # type: ignore[assignment] - - needles: List[str] = [] - if normalize_urls is not None: - for raw in [candidate_url, *(list(extra_urls) if extra_urls else [])]: - try: - needles.extend(normalize_urls(raw)) - except Exception: - continue - # Fallback: always have at least one needle - if not needles: - needles = [str(candidate_url)] - - # Deduplicate needles (preserve order) - seen_needles: List[str] = [] - for needle in needles: - if needle and needle not in seen_needles: - seen_needles.append(needle) - needles = seen_needles - - try: - debug(f"Preflight URL needles: {needles}") - except Exception: - pass - - url_matches: List[Dict[str, Any]] = [] - try: - from Store.HydrusNetwork import HydrusNetwork - - # Avoid searching the temp/download directory backend during dedup. - # We only want to warn about duplicates in real stores. 
- backend_names_all = storage.list_searchable_backends() - backend_names: List[str] = [] - skipped: List[str] = [] - for backend_name in backend_names_all: - try: - backend = storage[backend_name] - except Exception: - continue - - try: - if str(backend_name).strip().lower() == "temp": - skipped.append(backend_name) - continue - except Exception: - pass - - # Heuristic: if a Folder backend points at the configured temp output dir, skip it. - try: - backend_location = getattr(backend, "_location", None) - if backend_location and final_output_dir: - backend_path = Path(str(backend_location)).expanduser().resolve() - temp_path = Path(str(final_output_dir)).expanduser().resolve() - if backend_path == temp_path: - skipped.append(backend_name) - continue - except Exception: - pass - - backend_names.append(backend_name) - - try: - if skipped: - debug(f"Preflight backends: {backend_names} (skipped temp: {skipped})") - else: - debug(f"Preflight backends: {backend_names}") - except Exception: - pass - - for backend_name in backend_names: - backend = storage[backend_name] - if isinstance(backend, HydrusNetwork) and not hydrus_available: - continue - - backend_hits: List[Dict[str, Any]] = [] - for needle in needles: - try: - backend_hits = backend.search(f"url:{needle}", limit=25) or [] - if backend_hits: - break - except Exception: - continue - if backend_hits: - url_matches.extend([dict(x) if isinstance(x, dict) else {"title": str(x)} for x in backend_hits]) - - if len(url_matches) >= 25: - url_matches = url_matches[:25] - break - except Exception: - url_matches = [] - - if not url_matches: - debug("Preflight URL check: no matches") - return True - - table = ResultTable(f"URL already exists ({len(url_matches)} match(es))") - results_list: List[Dict[str, Any]] = [] - for item in url_matches: - if "title" not in item: - item["title"] = item.get("name") or item.get("target") or item.get("path") or "Result" - - # Keep the full payload for history/inspection, but display a focused table. - # Use shared extractors so Ext/Size/Store/Hash remain consistent everywhere. 
- try: - from result_table import build_display_row - except Exception: - build_display_row = None # type: ignore - - if callable(build_display_row): - display_row = build_display_row(item, keys=["title", "store", "hash", "ext", "size"]) - else: - display_row = { - "title": item.get("title"), - "store": item.get("store"), - "hash": item.get("hash") or item.get("file_hash") or item.get("sha256"), - "ext": str(item.get("ext") or ""), - "size": item.get("size") or item.get("size_bytes"), - } - table.add_result(display_row) - results_list.append(item) - - pipeline_context.set_current_stage_table(table) - pipeline_context.set_last_result_table(table, results_list) - - try: - from contextlib import nullcontext - except Exception: - nullcontext = None # type: ignore - - suspend = getattr(pipeline_context, "suspend_live_progress", None) - cm = suspend() if callable(suspend) else (nullcontext() if nullcontext else None) - if cm is None: - get_stderr_console().print(table) - setattr(table, "_rendered_by_cmdlet", True) - if not Confirm.ask("Continue anyway?", default=False, console=get_stderr_console()): - return False - else: - with cm: - get_stderr_console().print(table) - setattr(table, "_rendered_by_cmdlet", True) - if not Confirm.ask("Continue anyway?", default=False, console=get_stderr_console()): - try: - pipeline_context.request_pipeline_stop(reason="duplicate-url declined", exit_code=0) - except Exception: - pass - return False - return True - - def _preflight_url_duplicates_bulk(urls: Sequence[str]) -> bool: - """Preflight URL de-dup for a batch of URLs. - - Purpose: - - Avoid per-item interactive URL checks inside a playlist loop. - - Let the user see ALL duplicates up front, before any downloads start. - """ - if storage is None: - debug("Bulk URL preflight skipped: storage unavailable") - return True - - unique_urls: List[str] = [] - for u in urls or []: - s = str(u or "").strip() - if s and s not in unique_urls: - unique_urls.append(s) - if len(unique_urls) <= 1: - return True - - try: - from metadata import normalize_urls - except Exception: - normalize_urls = None # type: ignore[assignment] - - def _httpish(value: str) -> bool: - try: - return bool(value) and (value.startswith("http://") or value.startswith("https://")) - except Exception: - return False - - url_needles: Dict[str, List[str]] = {} - for u in unique_urls: - needles: List[str] = [] - if normalize_urls is not None: - try: - needles.extend([n for n in (normalize_urls(u) or []) if isinstance(n, str)]) - except Exception: - needles = [] - if not needles: - needles = [u] - # Prefer http(s) needles for store lookups. - filtered: List[str] = [] - for n in needles: - n2 = str(n or "").strip() - if not n2: - continue - if not _httpish(n2): - continue - if n2 not in filtered: - filtered.append(n2) - url_needles[u] = filtered if filtered else [u] - - # Determine backends once (same filtering as per-URL preflight). 
- backend_names: List[str] = [] - try: - backend_names_all = storage.list_searchable_backends() - except Exception: - backend_names_all = [] - - for backend_name in backend_names_all: - try: - backend = storage[backend_name] - except Exception: - continue - - try: - if str(backend_name).strip().lower() == "temp": - continue - except Exception: - pass - - try: - backend_location = getattr(backend, "_location", None) - if backend_location and final_output_dir: - backend_path = Path(str(backend_location)).expanduser().resolve() - temp_path = Path(str(final_output_dir)).expanduser().resolve() - if backend_path == temp_path: - continue - except Exception: - pass - - backend_names.append(backend_name) - - if not backend_names: - debug("Bulk URL preflight skipped: no searchable backends") - return True - - # Collect matches as display rows (cap to keep output reasonable) - seen_pairs: set[tuple[str, str]] = set() - matched_urls: set[str] = set() - match_rows: List[Dict[str, Any]] = [] - max_rows = 200 - - try: - from Store.HydrusNetwork import HydrusNetwork - except Exception: - HydrusNetwork = None # type: ignore - - for backend_name in backend_names: - if len(match_rows) >= max_rows: - break - try: - backend = storage[backend_name] - except Exception: - continue - - if HydrusNetwork is not None and isinstance(backend, HydrusNetwork): - if not hydrus_available: - continue - - client = getattr(backend, "_client", None) - if client is None: - continue - - for original_url, needles in url_needles.items(): - if len(match_rows) >= max_rows: - break - if (original_url, str(backend_name)) in seen_pairs: - continue - - # Fast-path: ask Hydrus whether it already knows this URL. - found_hash: Optional[str] = None - found = False - for needle in (needles or [])[:3]: - if not _httpish(needle): - continue - try: - from API.HydrusNetwork import HydrusRequestSpec - - spec = HydrusRequestSpec( - method="GET", - endpoint="/add_urls/get_url_files", - query={"url": needle}, - ) - response = client._perform_request(spec) # type: ignore[attr-defined] - raw_hashes = None - if isinstance(response, dict): - raw_hashes = response.get("hashes") or response.get("file_hashes") - raw_ids = response.get("file_ids") - has_ids = isinstance(raw_ids, list) and len(raw_ids) > 0 - has_hashes = isinstance(raw_hashes, list) and len(raw_hashes) > 0 - if has_hashes: - try: - found_hash = str(raw_hashes[0]).strip() # type: ignore[index] - except Exception: - found_hash = None - if has_ids or has_hashes: - found = True - break - except Exception: - continue - - if not found: - continue - - seen_pairs.add((original_url, str(backend_name))) - matched_urls.add(original_url) - display_row = { - "title": "(exists)", - "store": str(backend_name), - "hash": found_hash or "", - "url": original_url, - "columns": [ - ("Title", "(exists)"), - ("Store", str(backend_name)), - ("Hash", found_hash or ""), - ("URL", original_url), - ], - } - match_rows.append(display_row) - continue - - # Generic backends: use the existing search() contract. 
- for original_url, needles in url_needles.items(): - if len(match_rows) >= max_rows: - break - if (original_url, str(backend_name)) in seen_pairs: - continue - - backend_hits: List[Dict[str, Any]] = [] - for needle in (needles or [])[:3]: - try: - backend_hits = backend.search(f"url:{needle}", limit=1) or [] - if backend_hits: - break - except Exception: - continue - - if not backend_hits: - continue - - seen_pairs.add((original_url, str(backend_name))) - matched_urls.add(original_url) - hit = backend_hits[0] - title = hit.get("title") or hit.get("name") or hit.get("target") or hit.get("path") or "(exists)" - file_hash = hit.get("hash") or hit.get("file_hash") or hit.get("sha256") or "" - - try: - from result_table import build_display_row - except Exception: - build_display_row = None # type: ignore - - extracted = { - "title": str(title), - "store": str(hit.get("store") or backend_name), - "hash": str(file_hash or ""), - "ext": "", - "size": None, - } - if callable(build_display_row): - try: - extracted = build_display_row(hit, keys=["title", "store", "hash", "ext", "size"]) - except Exception: - pass - # Ensure we still prefer the precomputed values for title/store/hash. - extracted["title"] = str(title) - extracted["store"] = str(hit.get("store") or backend_name) - extracted["hash"] = str(file_hash or "") - - ext = extracted.get("ext") - size_val = extracted.get("size") - - display_row = { - "title": str(title), - "store": str(hit.get("store") or backend_name), - "hash": str(file_hash or ""), - "ext": str(ext or ""), - "size": size_val, - "url": original_url, - "columns": [ - ("Title", str(title)), - ("Store", str(hit.get("store") or backend_name)), - ("Hash", str(file_hash or "")), - ("Ext", str(ext or "")), - ("Size", size_val), - ("URL", original_url), - ], - } - match_rows.append(display_row) - - if not match_rows: - debug("Bulk URL preflight: no matches") - return True - - # This table is non-interactive and intentionally wide (we want URL + ext/size). - table = ResultTable(f"URL already exists ({len(matched_urls)} url(s))", max_columns=10) - table.set_no_choice(True) - try: - table.set_preserve_order(True) - except Exception: - pass - - for row in match_rows: - table.add_result(row) - - # Display as an overlay so we don't clobber the current selectable table/history. - try: - pipeline_context.set_last_result_table_overlay(table, match_rows) - except Exception: - pass - - get_stderr_console().print(table) - setattr(table, "_rendered_by_cmdlet", True) - - if not Confirm.ask("Continue anyway?", default=False, console=get_stderr_console()): - return False - return True - - def _canonicalize_url_for_storage(requested_url: str) -> str: - # Prefer yt-dlp's canonical webpage URL (e.g. strips timestamps/redirects). - # Fall back to the requested URL if probing fails. - # Important: when playlist item selection is used, avoid probing (can hang on large playlists). 
- if playlist_items: - return str(requested_url) - try: - cf = None - try: - cookie_path = ytdlp_tool.resolve_cookiefile() - if cookie_path is not None and cookie_path.is_file(): - cf = str(cookie_path) - except Exception: - cf = None - pr = probe_url(requested_url, no_playlist=False, timeout_seconds=15, cookiefile=cf) - if isinstance(pr, dict): - for key in ("webpage_url", "original_url", "url", "requested_url"): - value = pr.get(key) - if isinstance(value, str) and value.strip(): - return value.strip() - except Exception: - pass - return str(requested_url) + storage, hydrus_available = self._init_storage(config if isinstance(config, dict) else {}) # Check if we need to show format selection playlist_items = str(parsed.get("item")) if parsed.get("item") else None ytdl_format = parsed.get("format") playlist_selection_handled = False - def _parse_at_selection(choice: str, *, max_index: int) -> Optional[List[int]]: - """Parse @ selection syntax (@2, @2-5, @{1,3,5}, @2,5,7) into 1-based indices.""" - raw = str(choice or "").strip() - if not raw: - return None - - if raw.lower() in {"q", "quit", "cancel"}: - return None - - if raw == "@*" or raw == "*": - return list(range(1, max_index + 1)) - - if raw.startswith("@"): - raw = raw[1:].strip() - - if raw.startswith("{") and raw.endswith("}"): - raw = raw[1:-1].strip() - - if not raw: - return None - - indices: set[int] = set() - for part in raw.split(","): - part = part.strip() - if not part: - continue - if "-" in part: - left, right = [p.strip() for p in part.split("-", 1)] - if not left or not right: - return None - try: - start = int(left) - end = int(right) - except ValueError: - return None - if start < 1 or end < 1: - return None - if end < start: - start, end = end, start - for i in range(start, end + 1): - if 1 <= i <= max_index: - indices.add(i) - else: - try: - i = int(part) - except ValueError: - return None - if 1 <= i <= max_index: - indices.add(i) - if not indices: - return None - return sorted(indices) - - def _maybe_show_playlist_table(url: str) -> bool: - """If URL appears to be a playlist/channel/collection, show a normal selectable table. - - This intentionally avoids a special input() prompt so the user can use - the regular REPL prompt with autocomplete and standard @ selection: - download-media -url "" (shows table) - @* | download-media [options] | add-file ... - - Returns True if a playlist table was shown. 
- """ - try: - cf = None - try: - cookie_path = ytdlp_tool.resolve_cookiefile() - if cookie_path is not None and cookie_path.is_file(): - cf = str(cookie_path) - except Exception: - cf = None - pr = probe_url(url, no_playlist=False, timeout_seconds=15, cookiefile=cf) - except Exception: - pr = None - if not isinstance(pr, dict): - return False - entries = pr.get("entries") - if not isinstance(entries, list) or len(entries) <= 1: - return False - - # Display table (limit rows to keep output reasonable) - max_rows = 200 - display_entries = entries[:max_rows] - total = len(entries) - - def _entry_to_url(entry: Any) -> Optional[str]: - if not isinstance(entry, dict): - return None - # Prefer explicit absolute URLs when present - for key in ("webpage_url", "original_url", "url"): - v = entry.get(key) - if isinstance(v, str) and v.strip(): - s = v.strip() - try: - if urlparse(s).scheme in {"http", "https"}: - return s - except Exception: - return s - - # Best-effort YouTube fallback from id - entry_id = entry.get("id") - if isinstance(entry_id, str) and entry_id.strip(): - extractor_name = str(pr.get("extractor") or pr.get("extractor_key") or "").lower() - if "youtube" in extractor_name: - return f"https://www.youtube.com/watch?v={entry_id.strip()}" - return None - - table = ResultTable() - safe_url = str(url or "").strip() - table.title = f'download-media -url "{safe_url}"' if safe_url else "download-media" - # Selection tables should expand '@N' into a runnable command. - # For playlist-item rows we prefer the concrete per-item URL so the - # expanded command targets a single video (not the whole playlist). - table.set_source_command("download-media", []) - try: - table.set_preserve_order(True) - except Exception: - pass - - results_list: List[Dict[str, Any]] = [] - for idx, entry in enumerate(display_entries, 1): - title = None - uploader = None - duration = None - entry_url = _entry_to_url(entry) - try: - if isinstance(entry, dict): - title = entry.get("title") - uploader = entry.get("uploader") or pr.get("uploader") - duration = entry.get("duration") - except Exception: - pass - - row: Dict[str, Any] = { - "table": "download-media", - "title": str(title or f"Item {idx}"), - "detail": str(uploader or ""), - "media_kind": "playlist-item", - "playlist_index": idx, - # Enable '@N' expansion into a concrete command. - # Prefer selecting the resolved per-item URL when available. - "_selection_args": (["-url", str(entry_url)] if entry_url else ["-url", str(url), "-item", str(idx)]), - # Critical for normal @ selection piping: downstream cmdlets - # (including download-media itself) look for url/target. - "url": entry_url, - "target": entry_url, - "columns": [ - ("#", str(idx)), - ("Title", str(title or "")), - ("Duration", str(duration or "")), - ("Uploader", str(uploader or "")), - ], - } - results_list.append(row) - table.add_result(row) - - pipeline_context.set_current_stage_table(table) - pipeline_context.set_last_result_table(table, results_list) - - get_stderr_console().print(table) - setattr(table, "_rendered_by_cmdlet", True) - return True - # Playlist/multi-entry detection: if the URL has multiple items and the user didn't # specify -item or -format, show a normal selectable table and return. 
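# Illustrative sketch (hypothetical helper, not taken from the patch): the
# playlist check behind _maybe_show_playlist_table reduces to "the yt-dlp probe
# returned a dict whose 'entries' list has more than one item".
def _looks_like_playlist(probe_result):
    entries = probe_result.get("entries") if isinstance(probe_result, dict) else None
    return isinstance(entries, list) and len(entries) > 1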
if len(supported_url) == 1 and not playlist_items and not ytdl_format: candidate_url = supported_url[0] - if _maybe_show_playlist_table(candidate_url): + if self._maybe_show_playlist_table(url=candidate_url, ytdlp_tool=ytdlp_tool): playlist_selection_handled = True # Let the user pick items using the normal REPL prompt: # @* | download-media ... @@ -2020,7 +2551,12 @@ class Download_Media(Cmdlet): # Bulk preflight for playlist selections (per-entry URLs): check all URLs once before downloading. skip_per_url_preflight = False if len(supported_url) > 1: - if not _preflight_url_duplicates_bulk(list(supported_url)): + if not self._preflight_url_duplicates_bulk( + storage=storage, + hydrus_available=hydrus_available, + final_output_dir=final_output_dir, + urls=list(supported_url), + ): return 0 skip_per_url_preflight = True @@ -2029,34 +2565,17 @@ class Download_Media(Cmdlet): # and per-item --list-formats calls (e.g. Bandcamp albums). formats_cache: Dict[str, Optional[List[Dict[str, Any]]]] = {} - def _cookiefile_str() -> Optional[str]: - try: - cookie_path = ytdlp_tool.resolve_cookiefile() - if cookie_path is not None and cookie_path.is_file(): - return str(cookie_path) - except Exception: - pass - return None - - def _list_formats_cached(u: str, *, playlist_items_value: Optional[str]) -> Optional[List[Dict[str, Any]]]: - key = f"{u}||{playlist_items_value or ''}" - if key in formats_cache: - return formats_cache[key] - fmts = list_formats( - u, - no_playlist=False, - playlist_items=playlist_items_value, - cookiefile=_cookiefile_str(), - ) - formats_cache[key] = fmts - return fmts - forced_single_format_id: Optional[str] = None forced_single_format_for_batch = False if len(supported_url) > 1 and not playlist_items and not ytdl_format: try: sample_url = str(supported_url[0]) - fmts = _list_formats_cached(sample_url, playlist_items_value=None) + fmts = self._list_formats_cached( + sample_url, + playlist_items_value=None, + formats_cache=formats_cache, + ytdlp_tool=ytdlp_tool, + ) if isinstance(fmts, list) and len(fmts) == 1 and isinstance(fmts[0], dict): only_id = str(fmts[0].get("format_id") or "").strip() if only_id: @@ -2068,585 +2587,47 @@ class Download_Media(Cmdlet): except Exception: forced_single_format_id = None forced_single_format_for_batch = False - - # If no -item, no explicit -format specified, and single URL, show the format table. - # Do NOT stop to show formats when -audio is used (auto-pick) or when -clip is used. 
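# Illustrative sketch of the memoization behind the formats_cache dict used above
# (assumption: the new self._list_formats_cached keeps the removed helper's
# (url, playlist_items) keying; its body is outside this hunk, and the callable
# parameter here stands in for SYS.download.list_formats).
def _list_formats_cached_sketch(url, playlist_items_value, formats_cache, list_formats_fn):
    key = f"{url}||{playlist_items_value or ''}"
    if key not in formats_cache:
        formats_cache[key] = list_formats_fn(url, playlist_items_value)
    return formats_cache[key]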
- if ( - mode != "audio" - and not clip_spec - and not playlist_items - and not ytdl_format - and len(supported_url) == 1 - and not playlist_selection_handled - ): - url = supported_url[0] - canonical_url = _canonicalize_url_for_storage(url) - if not _preflight_url_duplicate(canonical_url, extra_urls=[url]): - log(f"Skipping download: {url}", file=sys.stderr) - return 0 + early_ret = self._maybe_show_format_table_for_single_url( + mode=mode, + clip_spec=clip_spec, + clip_values=clip_values, + playlist_items=playlist_items, + ytdl_format=ytdl_format, + supported_url=supported_url, + playlist_selection_handled=playlist_selection_handled, + ytdlp_tool=ytdlp_tool, + formats_cache=formats_cache, + storage=storage, + hydrus_available=hydrus_available, + final_output_dir=final_output_dir, + args=args, + ) + if early_ret is not None: + return int(early_ret) - formats = _list_formats_cached(url, playlist_items_value=None) - - if formats and len(formats) > 1: - # Formatlist filtering - # - # Goal: - # - Keep the list useful (hide non-media entries like storyboards) - # - But NEVER filter down so far that the user can't browse/pick formats. - # - # The old filtering was too aggressive (e.g. width>=640, one per resolution), - # which often hid most YouTube formats. - def _is_browseable_format(fmt: Any) -> bool: - if not isinstance(fmt, dict): - return False - format_id = str(fmt.get("format_id") or "").strip() - if not format_id: - return False - ext = str(fmt.get("ext") or "").strip().lower() - if ext in {"mhtml", "json"}: - return False - note = str(fmt.get("format_note") or "").lower() - if "storyboard" in note: - return False - if format_id.lower().startswith("sb"): - return False - vcodec = str(fmt.get("vcodec", "none")) - acodec = str(fmt.get("acodec", "none")) - # Keep anything with at least one stream. - return not (vcodec == "none" and acodec == "none") - - candidate_formats = [f for f in formats if _is_browseable_format(f)] - filtered_formats = candidate_formats if candidate_formats else list(formats) - - debug(f"Formatlist: showing {len(filtered_formats)} formats (raw={len(formats)})") - - # Build the base command that will be replayed with @N selection - # Include any additional args from the original command - base_cmd = f'download-media "{url}"' - # Preserve any additional pipeline stages if this is in a pipeline - remaining_args = [arg for arg in args if arg not in [url] and not arg.startswith('-')] - if remaining_args: - base_cmd += ' ' + ' '.join(remaining_args) - - # Create result table for display - # NOTE: ResultTable defaults to max_columns=5; for formatlist we want more columns - # (including Size) so the user can compare formats. - table = ResultTable(title=f"Available formats for {url}", max_columns=10, preserve_order=True) - table.set_table("ytdlp.formatlist") - table.set_source_command("download-media", [url]) - - # Collect results for table - results_list = [] - - # Emit format results for selection - for idx, fmt in enumerate(filtered_formats, 1): - resolution = fmt.get("resolution", "") - ext = fmt.get("ext", "") - vcodec = fmt.get("vcodec", "none") - acodec = fmt.get("acodec", "none") - filesize = fmt.get("filesize") - filesize_approx = fmt.get("filesize_approx") - format_id = fmt.get("format_id", "") - - # If the chosen format is video-only (no audio stream), automatically - # request best audio too so the resulting file has sound. 
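# Sketch of the rule stated in the comment above (yt-dlp selector syntax:
# "<id>+ba" merges bestaudio into a video-only format; helper name is hypothetical).
def _selection_format(format_id, vcodec, acodec):
    if format_id and vcodec != "none" and acodec == "none":
        return f"{format_id}+ba"
    return format_id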
- selection_format_id = format_id - try: - if vcodec != "none" and acodec == "none" and format_id: - selection_format_id = f"{format_id}+ba" - except Exception: - selection_format_id = format_id - - # Format size (prefer exact filesize; fall back to filesize_approx) - size_str = "" - size_prefix = "" - size_bytes = filesize - if not size_bytes: - size_bytes = filesize_approx - if size_bytes: - size_prefix = "~" - try: - if isinstance(size_bytes, (int, float)) and size_bytes > 0: - size_mb = float(size_bytes) / (1024 * 1024) - size_str = f"{size_prefix}{size_mb:.1f}MB" - except Exception: - size_str = "" - - # Build format description - desc_parts = [] - if resolution and resolution != "audio only": - desc_parts.append(resolution) - if ext: - desc_parts.append(ext.upper()) - if vcodec != "none": - desc_parts.append(f"v:{vcodec}") - if acodec != "none": - desc_parts.append(f"a:{acodec}") - if size_str: - desc_parts.append(size_str) - - format_desc = " | ".join(desc_parts) - - # Build format dict for emission and table - format_dict = { - "table": "download-media", - "title": f"Format {format_id}", - "url": url, - "target": url, - "detail": format_desc, - "annotations": [ext, resolution] if resolution else [ext], - "media_kind": "format", - "cmd": base_cmd, - # Put Size early so it's visible even with smaller column caps. - "columns": [ - ("ID", format_id), - ("Resolution", resolution or "N/A"), - ("Ext", ext), - ("Size", size_str or ""), - ("Video", vcodec), - ("Audio", acodec), - ], - "full_metadata": { - "format_id": format_id, - "url": url, - "item_selector": selection_format_id, - }, - "_selection_args": None, - } - - # Preserve clip settings across @N selection. - # Some runners only append row selection args; make sure clip intent - # survives even when it was provided via -query "clip:...". - selection_args: List[str] = ["-format", selection_format_id] - try: - if (not clip_spec) and clip_values: - selection_args.extend(["-clip", ",".join([v for v in clip_values if v])]) - except Exception: - pass - format_dict["_selection_args"] = selection_args - - # Add to results list and table (don't emit - formats should wait for @N selection) - results_list.append(format_dict) - table.add_result(format_dict) - - # Render and display the table - # Some runners (e.g. cmdnat) do not automatically render stage tables. - # Since this branch is explicitly interactive (user must pick @N), always - # print the table here and mark it as already rendered to avoid duplicates - # in runners that also print tables (e.g. CLI.py). - try: - get_stderr_console().print(table) - setattr(table, "_rendered_by_cmdlet", True) - except Exception: - pass - - # Set the result table so it displays and is available for @N selection - pipeline_context.set_current_stage_table(table) - pipeline_context.set_last_result_table(table, results_list) - - log(f"", file=sys.stderr) - return 0 - - # Download each URL - downloaded_count = 0 - downloaded_pipe_objects: List[Dict[str, Any]] = [] - pipe_seq = 0 - clip_sections_spec = self._build_clip_sections_spec(clip_ranges) - - if clip_sections_spec: - try: - debug(f"Clip sections spec: {clip_sections_spec}") - except Exception: - pass - - for url in supported_url: - try: - debug(f"Processing: {url}") - - canonical_url = _canonicalize_url_for_storage(url) - - # Preflight: warn if URL already exists in storage backends. - # For playlist selections we already did a bulk preflight; skip per-item checks. 
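# Sketch of the duplicate lookup both preflights rely on (pattern taken from the
# bulk preflight earlier in this file; helper name is hypothetical): try at most
# three "url:<needle>" searches per source URL and treat the first hit as a duplicate.
def _first_duplicate_hit(backend, needles):
    for needle in (needles or [])[:3]:
        try:
            hits = backend.search(f"url:{needle}", limit=1) or []
        except Exception:
            continue
        if hits:
            return hits[0]
    return None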
- if not skip_per_url_preflight: - if not _preflight_url_duplicate(canonical_url, extra_urls=[url]): - log(f"Skipping download: {url}", file=sys.stderr) - continue - - # Step progress is per-URL download. - # Keep steps meaningful: long-running download + finalize. - # (Fast internal bookkeeping should not be steps.) - _begin_live_steps(2) - - # If playlist_items is specified but looks like a format ID (e.g. from table selection), - # treat it as a format selector instead of playlist items. - # This handles the case where @N selection passes -item - actual_format = ytdl_format - actual_playlist_items = playlist_items - - if playlist_items and not ytdl_format: - # Heuristic: if it contains non-numeric chars (excluding ranges/commas) - # it is likely a format ID (e.g. '140-drc', 'best', '137+140') - import re - if re.search(r"[^0-9,-]", playlist_items): - actual_format = playlist_items - actual_playlist_items = None - - # For -audio, default to yt-dlp's built-in bestaudio selector. - # This should *not* require interactive format picking. - if mode == "audio" and not actual_format: - actual_format = "bestaudio" - - # If no explicit format is provided for video mode, allow a config override. - if mode == "video" and not actual_format: - configured = (ytdlp_tool.default_format("video") or "").strip() - if configured and configured != "bestvideo+bestaudio/best": - actual_format = configured - - # If the batch has exactly one available format, force it. - # This prevents the "Requested format is not available" error loop entirely. - forced_single_applied = False - if forced_single_format_for_batch and forced_single_format_id and not ytdl_format and not actual_playlist_items: - actual_format = forced_single_format_id - forced_single_applied = True - - # If a single format id was chosen and it is video-only, auto-merge best audio. - if ( - actual_format - and isinstance(actual_format, str) - and mode != "audio" - and "+" not in actual_format - and "/" not in actual_format - and "[" not in actual_format - and actual_format not in {"best", "bv", "ba", "b"} - and not forced_single_applied - ): - try: - formats = _list_formats_cached(url, playlist_items_value=actual_playlist_items) - if formats: - fmt_match = next( - (f for f in formats if str(f.get("format_id", "")) == actual_format), - None, - ) - if fmt_match: - vcodec = str(fmt_match.get("vcodec", "none")) - acodec = str(fmt_match.get("acodec", "none")) - if vcodec != "none" and acodec == "none": - debug( - f"Selected video-only format {actual_format}; using {actual_format}+ba for audio" - ) - actual_format = f"{actual_format}+ba" - except Exception: - pass - - attempted_single_format_fallback = False - while True: - try: - opts = DownloadOptions( - url=url, - mode=mode, - output_dir=final_output_dir, - ytdl_format=actual_format, - cookies_path=ytdlp_tool.resolve_cookiefile(), - clip_sections=clip_sections_spec, - playlist_items=actual_playlist_items, - quiet=quiet_mode, - no_playlist=False, - embed_chapters=embed_chapters, - write_sub=write_sub, - ) - - _step("downloading") - # Use timeout wrapper to prevent hanging - debug(f"Starting download with 5-minute timeout...") - result_obj = _download_with_timeout(opts, timeout_seconds=300) - debug(f"Download completed, building pipe object...") - break - except DownloadError as e: - # If yt-dlp failed due to an unavailable default format and there is only - # one available format, auto-retry with that format instead of prompting. 
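# Sketch of the fallback condition described above (hypothetical helper):
# only auto-retry when the format list collapses to exactly one entry.
def _single_format_fallback(formats):
    if isinstance(formats, list) and len(formats) == 1 and isinstance(formats[0], dict):
        return str(formats[0].get("format_id") or "").strip() or None
    return None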
- cause = getattr(e, "__cause__", None) - detail = "" - try: - detail = str(cause or "") - except Exception: - detail = "" - - if ("requested format is not available" in (detail or "").lower()) and mode != "audio": - # If we already discovered there's only one format for the batch, - # retry directly with it instead of re-listing formats per item. - if ( - forced_single_format_for_batch - and forced_single_format_id - and not ytdl_format - and not actual_playlist_items - and not attempted_single_format_fallback - ): - attempted_single_format_fallback = True - actual_format = forced_single_format_id - debug(f"Only one format available (playlist preflight); retrying with: {actual_format}") - continue - - formats = _list_formats_cached(url, playlist_items_value=actual_playlist_items) - if ( - (not attempted_single_format_fallback) - and isinstance(formats, list) - and len(formats) == 1 - and isinstance(formats[0], dict) - ): - only = formats[0] - fallback_format = str(only.get("format_id") or "").strip() - selection_format_id = fallback_format - try: - vcodec = str(only.get("vcodec", "none")) - acodec = str(only.get("acodec", "none")) - if vcodec != "none" and acodec == "none" and fallback_format: - selection_format_id = f"{fallback_format}+ba" - except Exception: - selection_format_id = fallback_format - - if selection_format_id: - attempted_single_format_fallback = True - actual_format = selection_format_id - debug(f"Only one format available; retrying with: {actual_format}") - continue - - # Fall back to interactive selection when there are multiple formats. - if formats: - formats_to_show = formats - - table = ResultTable(title=f"Available formats for {url}", max_columns=10, preserve_order=True) - table.set_table("ytdlp.formatlist") - table.set_source_command("download-media", [url]) - - results_list: List[Dict[str, Any]] = [] - for idx, fmt in enumerate(formats_to_show, 1): - resolution = fmt.get("resolution", "") - ext = fmt.get("ext", "") - vcodec = fmt.get("vcodec", "none") - acodec = fmt.get("acodec", "none") - filesize = fmt.get("filesize") - filesize_approx = fmt.get("filesize_approx") - format_id = fmt.get("format_id", "") - - selection_format_id = format_id - try: - if vcodec != "none" and acodec == "none" and format_id: - selection_format_id = f"{format_id}+ba" - except Exception: - selection_format_id = format_id - - size_str = "" - size_prefix = "" - size_bytes = filesize - if not size_bytes: - size_bytes = filesize_approx - if size_bytes: - size_prefix = "~" - try: - if isinstance(size_bytes, (int, float)) and size_bytes > 0: - size_mb = float(size_bytes) / (1024 * 1024) - size_str = f"{size_prefix}{size_mb:.1f}MB" - except Exception: - size_str = "" - - desc_parts: List[str] = [] - if resolution and resolution != "audio only": - desc_parts.append(str(resolution)) - if ext: - desc_parts.append(str(ext).upper()) - if vcodec != "none": - desc_parts.append(f"v:{vcodec}") - if acodec != "none": - desc_parts.append(f"a:{acodec}") - if size_str: - desc_parts.append(size_str) - format_desc = " | ".join(desc_parts) - - format_dict: Dict[str, Any] = { - "table": "download-media", - "title": f"Format {format_id}", - "url": url, - "target": url, - "detail": format_desc, - "media_kind": "format", - "columns": [ - ("ID", format_id), - ("Resolution", resolution or "N/A"), - ("Ext", ext), - ("Size", size_str or ""), - ("Video", vcodec), - ("Audio", acodec), - ], - "full_metadata": { - "format_id": format_id, - "url": url, - "item_selector": selection_format_id, - }, - "_selection_args": 
["-format", selection_format_id], - } - - results_list.append(format_dict) - table.add_result(format_dict) - - pipeline_context.set_current_stage_table(table) - pipeline_context.set_last_result_table(table, results_list) - - # Always print for interactive selection. - try: - get_stderr_console().print(table) - setattr(table, "_rendered_by_cmdlet", True) - except Exception: - pass - - # Complete the step sequence: we return here and the user must - # re-run with @N selection. - _step("awaiting selection") - - log("Requested format is not available; select a working format with @N", file=sys.stderr) - return 0 - - # Non-format DownloadError: surface and skip this URL. - raise - - # Expand result set: - # - playlists return a list - # - section clips return a single DownloadMediaResult with `paths` populated - results_to_emit: List[Any] = [] - if isinstance(result_obj, list): - results_to_emit = list(result_obj) - else: - paths = getattr(result_obj, "paths", None) - if isinstance(paths, list) and paths: - # Create one DownloadMediaResult per section file - for p in paths: - try: - p_path = Path(p) - except Exception: - continue - # Sidecars (subtitles) should never be piped as standalone items. - # They are handled separately and attached to notes. - try: - if p_path.suffix.lower() in _SUBTITLE_EXTS: - continue - except Exception: - pass - if not p_path.exists() or p_path.is_dir(): - continue - try: - hv = sha256_file(p_path) - except Exception: - hv = None - results_to_emit.append( - DownloadMediaResult( - path=p_path, - info=getattr(result_obj, "info", {}) or {}, - tag=list(getattr(result_obj, "tag", []) or []), - source_url=getattr(result_obj, "source_url", None) or opts.url, - hash_value=hv, - ) - ) - else: - results_to_emit = [result_obj] - - # Build PipeObjects first so we can attach cross-clip relationships. - pipe_objects: List[Dict[str, Any]] = [] - for downloaded in results_to_emit: - po = self._build_pipe_object(downloaded, url, opts) - pipe_seq += 1 - try: - po.setdefault("pipe_index", pipe_seq) - except Exception: - pass - - # Attach chapter timestamps for downstream consumers (e.g., mpv scripts) - # even if container embedding fails. - try: - info = downloaded.info if isinstance(getattr(downloaded, "info", None), dict) else {} - except Exception: - info = {} - chapters_text = _format_chapters_note(info) if embed_chapters else None - if chapters_text: - notes = po.get("notes") - if not isinstance(notes, dict): - notes = {} - notes.setdefault("chapters", chapters_text) - po["notes"] = notes - - if write_sub: - try: - media_path = Path(str(po.get("path") or "")) - except Exception: - media_path = None - - if media_path is not None and media_path.exists() and media_path.is_file(): - sub_path = _best_subtitle_sidecar(media_path) - if sub_path is not None: - sub_text = _read_text_file(sub_path) - if sub_text: - notes = po.get("notes") - if not isinstance(notes, dict): - notes = {} - notes["sub"] = sub_text - po["notes"] = notes - # We keep subtitles as notes; do not leave a sidecar that later stages - # might try to ingest as a file. - try: - sub_path.unlink() - except Exception: - pass - - pipe_objects.append(po) - - # If this is a clip download, decorate titles/tags so the title: tag is clip-based. - # Relationship tags are only added when multiple clips exist. 
- try: - if clip_ranges and len(pipe_objects) == len(clip_ranges): - source_hash = query_hash_override or self._find_existing_hash_for_url(storage, canonical_url, hydrus_available=hydrus_available) - self._apply_clip_decorations(pipe_objects, clip_ranges, source_king_hash=source_hash) - except Exception: - pass - - debug(f"Emitting {len(pipe_objects)} result(s) to pipeline...") - - # Mark complete *before* the first emit, because the pipeline clears the - # status line on emit(). - _step("finalized") - - stage_ctx = pipeline_context.get_stage_context() - emit_enabled = bool(stage_ctx is not None and not getattr(stage_ctx, "is_last_stage", False)) - for pipe_obj_dict in pipe_objects: - # Only emit when there is a downstream stage. - # This keeps `download-media` from producing a result table when run standalone. - if emit_enabled: - pipeline_context.emit(pipe_obj_dict) - - # Automatically register url with local library - if pipe_obj_dict.get("url"): - pipe_obj = coerce_to_pipe_object(pipe_obj_dict) - register_url_with_local_library(pipe_obj, config) - - # Keep a copy so we can optionally run a tail pipeline entered at the playlist prompt. - try: - downloaded_pipe_objects.append(pipe_obj_dict) - except Exception: - pass - - downloaded_count += len(pipe_objects) - debug("✓ Downloaded and emitted") - - except DownloadError as e: - log(f"Download failed for {url}: {e}", file=sys.stderr) - except Exception as e: - log(f"Error processing {url}: {e}", file=sys.stderr) - - if downloaded_count > 0: - debug(f"✓ Successfully processed {downloaded_count} URL(s)") - - return 0 - - log("No downloads completed", file=sys.stderr) - return 1 + return self._download_supported_urls( + supported_url=supported_url, + ytdlp_tool=ytdlp_tool, + args=args, + config=config, + final_output_dir=final_output_dir, + mode=mode, + clip_spec=clip_spec, + clip_ranges=clip_ranges, + query_hash_override=query_hash_override, + embed_chapters=embed_chapters, + write_sub=write_sub, + quiet_mode=quiet_mode, + playlist_items=playlist_items, + ytdl_format=ytdl_format, + skip_per_url_preflight=skip_per_url_preflight, + forced_single_format_id=forced_single_format_id, + forced_single_format_for_batch=forced_single_format_for_batch, + formats_cache=formats_cache, + storage=storage, + hydrus_available=hydrus_available, + ) except Exception as e: log(f"Error in download-media: {e}", file=sys.stderr) diff --git a/cmdlet/get_file.py b/cmdlet/get_file.py index 73aaf03..9dc094a 100644 --- a/cmdlet/get_file.py +++ b/cmdlet/get_file.py @@ -126,7 +126,7 @@ class Get_File(sh.Cmdlet): except Exception as exc: log(f"Error opening browser: {exc}", file=sys.stderr) else: - log(f"Opened in browser: {source_path}", file=sys.stderr) + debug(f"Opened in browser: {source_path}", file=sys.stderr) # Emit result for pipeline ctx.emit({ diff --git a/cmdlet/get_tag.py b/cmdlet/get_tag.py index 9b77758..85cb620 100644 --- a/cmdlet/get_tag.py +++ b/cmdlet/get_tag.py @@ -47,6 +47,210 @@ except ImportError: extract_title = None +def _dedup_tags_preserve_order(tags: List[str]) -> List[str]: + """Deduplicate tags case-insensitively while preserving order.""" + out: List[str] = [] + seen: set[str] = set() + for t in tags or []: + if not isinstance(t, str): + continue + s = t.strip() + if not s: + continue + key = s.lower() + if key in seen: + continue + seen.add(key) + out.append(s) + return out + + +def _extract_subtitle_tags(info: Dict[str, Any]) -> List[str]: + """Extract subtitle availability tags from a yt-dlp info dict. 
+ + Produces multi-valued tags so languages can coexist: + - subs: + - subs_auto: + """ + def _langs(value: Any) -> List[str]: + if not isinstance(value, dict): + return [] + langs: List[str] = [] + for k in value.keys(): + if not isinstance(k, str): + continue + lang = k.strip().lower() + if lang: + langs.append(lang) + return sorted(set(langs)) + + out: List[str] = [] + for lang in _langs(info.get("subtitles")): + out.append(f"subs:{lang}") + for lang in _langs(info.get("automatic_captions")): + out.append(f"subs_auto:{lang}") + return out + + +def _scrape_ytdlp_info(url: str) -> Optional[Dict[str, Any]]: + """Fetch a yt-dlp info dict without downloading media.""" + if not isinstance(url, str) or not url.strip(): + return None + url = url.strip() + + # Prefer the Python module when available (faster, avoids shell quoting issues). + try: + import yt_dlp # type: ignore + opts: Any = { + "quiet": True, + "no_warnings": True, + "skip_download": True, + "noprogress": True, + "socket_timeout": 15, + "retries": 1, + "playlist_items": "1-10", + } + with yt_dlp.YoutubeDL(opts) as ydl: + info = ydl.extract_info(url, download=False) + return info if isinstance(info, dict) else None + except Exception: + pass + + # Fallback to yt-dlp CLI if the module isn't available. + try: + import json as json_module + cmd = [ + "yt-dlp", + "-J", + "--no-warnings", + "--skip-download", + "--playlist-items", + "1-10", + url, + ] + result = subprocess.run(cmd, capture_output=True, text=True, timeout=30) + if result.returncode != 0: + return None + payload = (result.stdout or "").strip() + if not payload: + return None + data = json_module.loads(payload) + return data if isinstance(data, dict) else None + except Exception: + return None + + +def _resolve_candidate_urls_for_item( + result: Any, + backend: Any, + file_hash: str, + config: Dict[str, Any], +) -> List[str]: + """Get candidate URLs from backend and/or piped result.""" + try: + from metadata import normalize_urls + except Exception: + normalize_urls = None # type: ignore[assignment] + + urls: List[str] = [] + # 1) Backend URL association (best source of truth) + try: + backend_urls = backend.get_url(file_hash, config=config) + if backend_urls: + if normalize_urls: + urls.extend(normalize_urls(backend_urls)) + else: + urls.extend([str(u).strip() for u in backend_urls if isinstance(u, str) and str(u).strip()]) + except Exception: + pass + + # 2) Backend metadata url field + try: + meta = backend.get_metadata(file_hash, config=config) + if isinstance(meta, dict) and meta.get("url"): + if normalize_urls: + urls.extend(normalize_urls(meta.get("url"))) + else: + raw = meta.get("url") + if isinstance(raw, list): + urls.extend([str(u).strip() for u in raw if isinstance(u, str) and str(u).strip()]) + elif isinstance(raw, str) and raw.strip(): + urls.append(raw.strip()) + except Exception: + pass + + # 3) Piped result fields + def _get(obj: Any, key: str, default: Any = None) -> Any: + if isinstance(obj, dict): + return obj.get(key, default) + return getattr(obj, key, default) + + for key in ("url", "webpage_url", "source_url", "target"): + val = _get(result, key, None) + if not val: + continue + if normalize_urls: + urls.extend(normalize_urls(val)) + continue + if isinstance(val, str) and val.strip(): + urls.append(val.strip()) + elif isinstance(val, list): + urls.extend([str(u).strip() for u in val if isinstance(u, str) and str(u).strip()]) + + meta_field = _get(result, "metadata", None) + if isinstance(meta_field, dict) and meta_field.get("url"): + val = 
meta_field.get("url") + if normalize_urls: + urls.extend(normalize_urls(val)) + elif isinstance(val, list): + urls.extend([str(u).strip() for u in val if isinstance(u, str) and str(u).strip()]) + elif isinstance(val, str) and val.strip(): + urls.append(val.strip()) + + # Dedup + return _dedup_tags_preserve_order(urls) + + +def _pick_supported_ytdlp_url(urls: List[str]) -> Optional[str]: + """Pick the first URL that looks supported by yt-dlp (best effort).""" + if not urls: + return None + + def _is_hydrus_file_url(u: str) -> bool: + text = str(u or "").strip().lower() + if not text: + return False + # Hydrus-local file URLs are retrievable blobs, not original source pages. + # yt-dlp generally can't extract meaningful metadata from these. + return ("/get_files/file" in text) and ("hash=" in text) + + http_urls: List[str] = [] + for u in urls: + text = str(u or "").strip() + if text.lower().startswith(("http://", "https://")): + http_urls.append(text) + + # Prefer non-Hydrus URLs for yt-dlp scraping. + candidates = [u for u in http_urls if not _is_hydrus_file_url(u)] + if not candidates: + return None + + # Prefer a true support check when the Python module is available. + try: + from SYS.download import is_url_supported_by_ytdlp + for text in candidates: + try: + if is_url_supported_by_ytdlp(text): + return text + except Exception: + continue + except Exception: + pass + + # Fallback: use the first non-Hydrus http(s) URL and let extraction decide. + return candidates[0] if candidates else None + + _scrape_isbn_metadata = _ol_scrape_isbn_metadata # type: ignore[assignment] _scrape_openlibrary_metadata = _ol_scrape_openlibrary_metadata # type: ignore[assignment] @@ -853,7 +1057,12 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: scrape_url = parsed_args.get("scrape") scrape_requested = scrape_flag_present or scrape_url is not None - if scrape_requested and (not scrape_url or str(scrape_url).strip() == ""): + # Convenience: `-scrape` with no value defaults to `ytdlp` (store-backed URL scrape). + if scrape_flag_present and (scrape_url is None or str(scrape_url).strip() == ""): + scrape_url = "ytdlp" + scrape_requested = True + + if scrape_requested and (scrape_url is None or str(scrape_url).strip() == ""): log("-scrape requires a URL or provider name", file=sys.stderr) return 1 @@ -861,6 +1070,123 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: if scrape_requested and scrape_url: import json as json_module + if str(scrape_url).strip().lower() == "ytdlp": + # Scrape metadata from the selected item's URL via yt-dlp (no download), + # then OVERWRITE all existing tags (including title:). + # + # This mode requires a store-backed item (hash + store). + # + # NOTE: We intentionally do not reuse _scrape_url_metadata() here because it + # performs namespace deduplication that would collapse multi-valued tags. + file_hash = normalize_hash(hash_override) or normalize_hash(get_field(result, "hash", None)) + store_name = get_field(result, "store", None) + subject_path = get_field(result, "path", None) or get_field(result, "target", None) or get_field(result, "filename", None) + item_title = get_field(result, "title", None) or get_field(result, "name", None) or get_field(result, "filename", None) + + # Only run overwrite-apply when the item is store-backed. + # If this is a URL-only PipeObject, fall through to provider mode below. 
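# Illustrative predicate equivalent to the store-backed check below
# (hypothetical helper, not part of the patch):
def _is_store_backed(file_hash, store_name):
    if not file_hash or not store_name:
        return False
    if str(file_hash).strip().lower() == "unknown":
        return False
    return str(store_name).strip().upper() not in {"PATH", "URL"}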
+ if file_hash and store_name and str(file_hash).strip().lower() != "unknown" and str(store_name).strip().upper() not in {"PATH", "URL"}: + try: + from Store import Store + storage = Store(config) + backend = storage[str(store_name)] + except Exception as exc: + log(f"Failed to resolve store backend '{store_name}': {exc}", file=sys.stderr) + return 1 + + candidate_urls = _resolve_candidate_urls_for_item(result, backend, file_hash, config) + scrape_target = _pick_supported_ytdlp_url(candidate_urls) + if not scrape_target: + log( + "No yt-dlp-supported source URL found for this item (Hydrus /get_files/file URLs are ignored). ", + file=sys.stderr, + ) + log( + "Add the original page URL to the file (e.g. via add-url), then retry get-tag -scrape.", + file=sys.stderr, + ) + return 1 + + info = _scrape_ytdlp_info(scrape_target) + if not info: + log("yt-dlp could not extract metadata for this URL (unsupported or failed)", file=sys.stderr) + return 1 + + try: + from metadata import extract_ytdlp_tags + except Exception: + extract_ytdlp_tags = None # type: ignore[assignment] + + # Prefer the top-level metadata, but if this is a playlist container, use + # the first entry for per-item fields like subtitles. + info_for_subs = info + entries = info.get("entries") if isinstance(info, dict) else None + if isinstance(entries, list) and entries: + first = entries[0] + if isinstance(first, dict): + info_for_subs = first + + tags: List[str] = [] + if extract_ytdlp_tags: + try: + tags.extend(extract_ytdlp_tags(info)) + except Exception: + pass + + # Subtitle availability tags + try: + tags.extend(_extract_subtitle_tags(info_for_subs if isinstance(info_for_subs, dict) else {})) + except Exception: + pass + + # Ensure we actually have something to apply. + tags = _dedup_tags_preserve_order(tags) + if not tags: + log("No tags extracted from yt-dlp metadata", file=sys.stderr) + return 1 + + # Full overwrite: delete all existing tags, then add the new set. + try: + existing_tags, _src = backend.get_tag(file_hash, config=config) + except Exception: + existing_tags = [] + try: + if existing_tags: + backend.delete_tag(file_hash, list(existing_tags), config=config) + except Exception as exc: + debug(f"[get_tag] ytdlp overwrite: delete_tag failed: {exc}") + try: + backend.add_tag(file_hash, list(tags), config=config) + except Exception as exc: + log(f"Failed to apply yt-dlp tags: {exc}", file=sys.stderr) + return 1 + + # Show updated tags + try: + updated_tags, _src = backend.get_tag(file_hash, config=config) + except Exception: + updated_tags = tags + if not updated_tags: + updated_tags = tags + + _emit_tags_as_table( + tags_list=list(updated_tags), + file_hash=file_hash, + store=str(store_name), + service_name=None, + config=config, + item_title=str(item_title or "ytdlp"), + path=str(subject_path) if subject_path else None, + subject={ + "hash": file_hash, + "store": str(store_name), + "path": str(subject_path) if subject_path else None, + "title": item_title, + "extra": {"applied_provider": "ytdlp", "scrape_url": scrape_target}, + }, + ) + return 0 + if scrape_url.startswith("http://") or scrape_url.startswith("https://"): # URL scraping (existing behavior) title, tags, formats, playlist_items = _scrape_url_metadata(scrape_url) @@ -951,7 +1277,16 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: else: combined_query = f"{title_hint} {artist_hint}" - query_hint = identifier_query or combined_query or title_hint + # yt-dlp isn't a search provider; it requires a URL. 
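# Usage sketch for the new scrape mode (assumed CLI syntax, consistent with the
# arg help text updated below; both forms scrape the item's stored source URL via
# yt-dlp and then overwrite the item's full tag set):
#   @1 | get-tag -scrape
#   @1 | get-tag -scrape ytdlp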
+ url_hint: Optional[str] = None + if provider.name == "ytdlp": + raw_url = get_field(result, "url", None) or get_field(result, "source_url", None) or get_field(result, "target", None) + if isinstance(raw_url, list) and raw_url: + raw_url = raw_url[0] + if isinstance(raw_url, str) and raw_url.strip().startswith(("http://", "https://")): + url_hint = raw_url.strip() + + query_hint = url_hint or identifier_query or combined_query or title_hint if not query_hint: log("No title or identifier available to search for metadata", file=sys.stderr) return 1 @@ -967,6 +1302,27 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: if not items: log("No metadata results found", file=sys.stderr) return 1 + + # For yt-dlp, emit tags directly (there is no meaningful multi-result selection step). + if provider.name == "ytdlp": + try: + tags = [str(t) for t in provider.to_tags(items[0]) if t is not None] + except Exception: + tags = [] + if not tags: + log("No tags extracted from yt-dlp metadata", file=sys.stderr) + return 1 + _emit_tags_as_table( + tags_list=list(tags), + file_hash=None, + store="url", + service_name=None, + config=config, + item_title=str(items[0].get("title") or "ytdlp"), + path=None, + subject={"provider": "ytdlp", "url": str(query_hint)}, + ) + return 0 from result_table import ResultTable table = ResultTable(f"Metadata: {provider.name}") @@ -1040,7 +1396,10 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: return 0 # Apply tags to the store backend (no sidecar writing here). - apply_tags = _filter_scraped_tags([str(t) for t in result_tags if t is not None]) + if str(result_provider).strip().lower() == "ytdlp": + apply_tags = [str(t) for t in result_tags if t is not None] + else: + apply_tags = _filter_scraped_tags([str(t) for t in result_tags if t is not None]) if not apply_tags: log("No applicable scraped tags to apply (title:/artist:/source: are skipped)", file=sys.stderr) return 0 @@ -1167,6 +1526,11 @@ try: except Exception: _SCRAPE_CHOICES = ["itunes", "openlibrary", "googlebooks", "google", "musicbrainz"] +# Special scrape mode: pull tags from an item's URL via yt-dlp (no download) +if "ytdlp" not in _SCRAPE_CHOICES: + _SCRAPE_CHOICES.append("ytdlp") + _SCRAPE_CHOICES = sorted(_SCRAPE_CHOICES) + class Get_Tag(Cmdlet): """Class-based get-tag cmdlet with self-registration.""" @@ -1195,7 +1559,7 @@ class Get_Tag(Cmdlet): CmdletArg( name="-scrape", type="string", - description="Scrape metadata from URL or provider name (returns tags as JSON or table)", + description="Scrape metadata from URL/provider, or use 'ytdlp' to scrape from the item's URL and overwrite tags", required=False, choices=_SCRAPE_CHOICES, ) diff --git a/cmdlet/screen_shot.py b/cmdlet/screen_shot.py index 3eb2a87..99014a3 100644 --- a/cmdlet/screen_shot.py +++ b/cmdlet/screen_shot.py @@ -14,10 +14,11 @@ import httpx from dataclasses import dataclass, field from pathlib import Path from typing import Any, Dict, List, Optional, Sequence, Tuple -from urllib.parse import urlsplit, quote, urljoin +from urllib.parse import urlsplit, quote, urljoin, unquote from SYS.logger import log, debug from API.HTTP import HTTPClient +from SYS.pipeline_progress import PipelineProgress from SYS.utils import ensure_directory, unique_path, unique_preserve_order from . 
import _shared as sh @@ -31,54 +32,6 @@ get_field = sh.get_field parse_cmdlet_args = sh.parse_cmdlet_args import pipeline as pipeline_context - -def _live_ui_and_pipe_index() -> tuple[Optional[Any], int]: - ui = None - try: - ui = pipeline_context.get_live_progress() if hasattr(pipeline_context, "get_live_progress") else None - except Exception: - ui = None - - pipe_idx: int = 0 - try: - stage_ctx = pipeline_context.get_stage_context() if hasattr(pipeline_context, "get_stage_context") else None - maybe_idx = getattr(stage_ctx, "pipe_index", None) if stage_ctx is not None else None - if isinstance(maybe_idx, int): - pipe_idx = int(maybe_idx) - except Exception: - pipe_idx = 0 - - return ui, pipe_idx - - -def _begin_live_steps(total_steps: int) -> None: - """Declare the total number of steps for this cmdlet run (per-pipe).""" - ui, pipe_idx = _live_ui_and_pipe_index() - if ui is None: - return - try: - begin = getattr(ui, "begin_pipe_steps", None) - if callable(begin): - begin(int(pipe_idx), total_steps=int(total_steps)) - except Exception: - return - - -def _step(text: str) -> None: - """Emit a *new* step. - - Each call increments the step counter and advances percent automatically. - """ - ui, pipe_idx = _live_ui_and_pipe_index() - if ui is None: - return - try: - adv = getattr(ui, "advance_pipe_step", None) - if callable(adv): - adv(int(pipe_idx), str(text)) - except Exception: - return - # ============================================================================ # CMDLET Metadata Declaration # ============================================================================ @@ -115,6 +68,10 @@ USER_AGENT = ( DEFAULT_VIEWPORT: dict[str, int] = {"width": 1920, "height": 1080} ARCHIVE_TIMEOUT = 30.0 +# WebP has a hard maximum dimension per side. +# Pillow typically fails with: "encoding error 5: Image size exceeds WebP limit of 16383 pixels" +WEBP_MAX_DIM = 16_383 + # Configurable selectors for specific websites SITE_SELECTORS: Dict[str, List[str]] = { "twitter.com": [ @@ -200,6 +157,80 @@ def _slugify_url(url: str) -> str: return slug[:100] +def _tags_from_url(url: str) -> List[str]: + """Derive simple tags from a URL. + + - site: (strips leading www.) + - title: derived from the last path segment, with extension removed + and separators (-, _, %) normalized to spaces. + """ + + u = str(url or "").strip() + if not u: + return [] + + parsed = None + try: + parsed = urlsplit(u) + host = str(getattr(parsed, "hostname", None) or getattr(parsed, "netloc", "") or "").strip().lower() + except Exception: + parsed = None + host = "" + + if host: + # Drop credentials and port if present. + if "@" in host: + host = host.rsplit("@", 1)[-1] + if ":" in host: + host = host.split(":", 1)[0] + if host.startswith("www."): + host = host[len("www.") :] + + path = "" + if parsed is not None: + try: + path = str(getattr(parsed, "path", "") or "") + except Exception: + path = "" + + last = "" + if path: + try: + last = path.rsplit("/", 1)[-1] + except Exception: + last = "" + + try: + last = unquote(last or "") + except Exception: + last = last or "" + + if last and "." in last: + # Drop a single trailing extension (e.g. .html, .php). 
+ last = last.rsplit(".", 1)[0] + + for sep in ("_", "-", "%"): + if last and sep in last: + last = last.replace(sep, " ") + + title = " ".join(str(last or "").split()).strip().lower() + + tags: List[str] = [] + if host: + tags.append(f"site:{host}") + if title: + tags.append(f"title:{title}") + return tags + + +def _title_from_url(url: str) -> str: + """Return the normalized title derived from a URL's last path segment.""" + for t in _tags_from_url(url): + if str(t).lower().startswith("title:"): + return str(t)[len("title:") :].strip() + return "" + + def _normalise_format(fmt: Optional[str]) -> str: """Normalize output format to valid values.""" if not fmt: @@ -218,6 +249,89 @@ def _format_suffix(fmt: str) -> str: return ".jpg" return f".{fmt}" + +def _convert_to_webp( + src_png: Path, + dst_webp: Path, + *, + quality: int = 90, + method: int = 6, + max_dim: int = WEBP_MAX_DIM, + downscale_if_oversize: bool = True, +) -> bool: + """Convert a PNG screenshot to WebP via Pillow. + + Playwright does not currently support emitting WebP directly. + """ + if not src_png or not Path(src_png).is_file(): + raise ScreenshotError(f"Source image not found: {src_png}") + + dst_webp = Path(dst_webp) + try: + dst_webp.parent.mkdir(parents=True, exist_ok=True) + except Exception: + pass + + try: + from PIL import Image + except Exception as exc: + raise ScreenshotError(f"Pillow is required for webp conversion: {exc}") from exc + + # Write atomically to avoid partial files if conversion is interrupted. + tmp_path = unique_path(dst_webp.with_suffix(".tmp.webp")) + try: + with Image.open(src_png) as im: + did_downscale = False + save_kwargs: Dict[str, Any] = { + "format": "WEBP", + "quality": int(quality), + "method": int(method), + } + + # Preserve alpha when present; Pillow handles it for WEBP. + # Normalize palette images to RGBA to avoid odd palette artifacts. + if im.mode == "P": + im = im.convert("RGBA") + + # WebP enforces a hard max dimension per side (16383px). + # When full-page captures are very tall, downscale proportionally to fit. + try: + w, h = im.size + except Exception: + w, h = 0, 0 + + if downscale_if_oversize and isinstance(max_dim, int) and max_dim > 0 and (w > max_dim or h > max_dim): + scale = 1.0 + try: + scale = min(float(max_dim) / float(w), float(max_dim) / float(h)) + except Exception: + scale = 1.0 + + if scale > 0.0 and scale < 1.0: + new_w = max(1, int(w * scale)) + new_h = max(1, int(h * scale)) + debug( + f"[_convert_to_webp] Image exceeds WebP limit ({w}x{h}); downscaling -> {new_w}x{new_h}" + ) + try: + resample = getattr(getattr(Image, "Resampling", Image), "LANCZOS", None) + if resample is None: + resample = getattr(Image, "LANCZOS", 1) + im = im.resize((new_w, new_h), resample=resample) + did_downscale = True + except Exception as exc: + debug(f"[_convert_to_webp] Downscale failed; attempting direct WEBP save anyway: {exc}") + + im.save(tmp_path, **save_kwargs) + + tmp_path.replace(dst_webp) + return bool(did_downscale) + finally: + try: + tmp_path.unlink(missing_ok=True) + except Exception: + pass + def _matched_site_selectors(url: str) -> List[str]: """Return SITE_SELECTORS for a matched domain; empty if no match. @@ -231,6 +345,16 @@ def _matched_site_selectors(url: str) -> List[str]: return sels +def _selectors_for_url(url: str) -> List[str]: + """Return selectors to try for a URL. + + For now, prefer a minimal behavior: only return known SITE_SELECTORS. + (The cmdlet already falls back to full-page capture when no selectors match.) 
+ """ + + return _matched_site_selectors(url) + + def _platform_preprocess(url: str, page: Any, warnings: List[str], timeout_ms: int = 10_000) -> None: """Best-effort page tweaks for popular platforms before capture.""" try: @@ -366,11 +490,11 @@ def _prepare_output_path(options: ScreenshotOptions) -> Path: return unique_path(path) -def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str]) -> None: +def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str], progress: PipelineProgress) -> None: """Capture screenshot using Playwright.""" debug(f"[_capture] Starting capture for {options.url} -> {destination}") try: - _step("loading launching browser") + progress.step("loading launching browser") tool = options.playwright_tool or PlaywrightTool({}) # Ensure Chromium engine is used for the screen-shot cmdlet (force for consistency) @@ -405,16 +529,16 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str]) try: with tool.open_page(headless=headless) as page: - _step("loading navigating") + progress.step("loading navigating") debug(f"Navigating to {options.url}...") try: tool.goto(page, options.url) debug("Page loaded successfully") - _step("loading page loaded") + progress.step("loading page loaded") except PlaywrightTimeoutError: warnings.append("navigation timeout; capturing current page state") debug("Navigation timeout; proceeding with current state") - _step("loading navigation timeout") + progress.step("loading navigation timeout") # Skip article lookup by default (wait_for_article defaults to False) if options.wait_for_article: @@ -430,9 +554,9 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str]) debug(f"Waiting {options.wait_after_load}s for page stabilization...") time.sleep(min(10.0, max(0.0, options.wait_after_load))) - _step("loading stabilized") + progress.step("loading stabilized") - _step("capturing preparing") + progress.step("capturing preparing") if options.replace_video_posters: debug("Replacing video elements with posters...") page.evaluate( @@ -453,7 +577,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str]) if options.prefer_platform_target and format_name != "pdf": debug(f"[_capture] Target capture enabled") debug("Attempting platform-specific content capture...") - _step("capturing locating target") + progress.step("capturing locating target") try: _platform_preprocess(options.url, page, warnings) except Exception as e: @@ -478,7 +602,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str]) el.scroll_into_view_if_needed(timeout=1000) except Exception: pass - _step("capturing output") + progress.step("capturing output") debug(f"Capturing element to {destination}...") el.screenshot(path=str(destination), type=("jpeg" if format_name == "jpeg" else None)) element_captured = True @@ -489,14 +613,14 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str]) debug(f"Failed to capture element: {exc}") # Fallback to default capture paths if element_captured: - _step("capturing saved") + progress.step("capturing saved") elif format_name == "pdf": debug("Generating PDF...") page.emulate_media(media="print") - _step("capturing output") + progress.step("capturing output") page.pdf(path=str(destination), print_background=True) debug(f"PDF saved to {destination}") - _step("capturing saved") + progress.step("capturing saved") else: debug(f"Capturing full page to {destination}...") screenshot_kwargs: 
Dict[str, Any] = {"path": str(destination)} @@ -504,20 +628,20 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str]) screenshot_kwargs["type"] = "jpeg" screenshot_kwargs["quality"] = 90 if options.full_page: - _step("capturing output") + progress.step("capturing output") page.screenshot(full_page=True, **screenshot_kwargs) else: article = page.query_selector("article") if article is not None: article_kwargs = dict(screenshot_kwargs) article_kwargs.pop("full_page", None) - _step("capturing output") + progress.step("capturing output") article.screenshot(**article_kwargs) else: - _step("capturing output") + progress.step("capturing output") page.screenshot(**screenshot_kwargs) debug(f"Screenshot saved to {destination}") - _step("capturing saved") + progress.step("capturing saved") except Exception as exc: debug(f"[_capture] Exception launching browser/page: {exc}") msg = str(exc).lower() @@ -532,7 +656,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str]) raise ScreenshotError(f"Failed to capture screenshot: {exc}") from exc -def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult: +def _capture_screenshot(options: ScreenshotOptions, progress: PipelineProgress) -> ScreenshotResult: """Capture a screenshot for the given options.""" debug(f"[_capture_screenshot] Preparing capture for {options.url}") requested_format = _normalise_format(options.output_format) @@ -543,8 +667,8 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult: will_convert = requested_format == "webp" will_archive = bool(options.archive and options.url) total_steps = 9 + (1 if will_target else 0) + (1 if will_convert else 0) + (1 if will_archive else 0) - _begin_live_steps(total_steps) - _step("loading starting") + progress.begin_steps(total_steps) + progress.step("loading starting") # Playwright screenshots do not natively support WebP output. # Capture as PNG, then convert via Pillow. 
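# Illustrative sketch of the downscale rule _convert_to_webp applies above
# (hypothetical helper): scale both sides by the same factor so the longer side
# lands at the 16383 px WebP per-side limit.
def _fit_to_webp_limit(w: int, h: int, max_dim: int = 16_383) -> tuple[int, int]:
    if w <= max_dim and h <= max_dim:
        return w, h
    scale = min(max_dim / float(w), max_dim / float(h))
    return max(1, int(w * scale)), max(1, int(h * scale))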
@@ -553,17 +677,22 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult: capture_path = unique_path(destination.with_suffix(".png")) debug(f"[_capture_screenshot] Requested webp; capturing intermediate png -> {capture_path}") options.output_format = "png" - _capture(options, capture_path, warnings) + _capture(options, capture_path, warnings, progress) if requested_format == "webp": - _step("capturing converting to webp") + progress.step("capturing converting to webp") debug(f"[_capture_screenshot] Converting png -> webp: {destination}") try: - _convert_to_webp(capture_path, destination) - try: - capture_path.unlink(missing_ok=True) - except Exception: - pass + did_downscale = _convert_to_webp(capture_path, destination) + if did_downscale: + warnings.append( + f"webp conversion used downscaling to fit {WEBP_MAX_DIM}px limit; keeping original png: {capture_path.name}" + ) + else: + try: + capture_path.unlink(missing_ok=True) + except Exception: + pass except Exception as exc: warnings.append(f"webp conversion failed; keeping png: {exc}") destination = capture_path @@ -572,7 +701,7 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult: url: List[str] = [options.url] if options.url else [] archive_url: List[str] = [] if options.archive and options.url: - _step("capturing archiving") + progress.step("capturing archiving") debug(f"[_capture_screenshot] Archiving enabled for {options.url}") archives, archive_warnings = _archive_url(options.url, options.archive_timeout) archive_url.extend(archives) @@ -580,7 +709,7 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult: if archives: url = unique_preserve_order([*url, *archives]) - _step("capturing finalized") + progress.step("capturing finalized") applied_tag = unique_preserve_order(list(tag for tag in options.tag if tag.strip())) @@ -627,6 +756,8 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: ) return 1 + progress = PipelineProgress(pipeline_context) + # ======================================================================== # ARGUMENT PARSING # ======================================================================== @@ -685,32 +816,6 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: debug(f"[_run] url to process: {[u for u, _ in url_to_process]}") - # If the caller isn't running the shared pipeline Live progress UI (e.g. direct - # cmdlet execution), start a minimal local pipeline progress panel so this cmdlet - # still shows step-level progress. 
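# Minimal sketch of the PipelineProgress surface this file now relies on. The real
# SYS.pipeline_progress module is not part of this diff, so anything beyond the call
# sites used above (begin_steps/step/ensure_local_ui/on_emit/close_local_ui) is an
# assumption; the two methods below simply re-wrap what the removed module-level
# helpers did with begin_pipe_steps/advance_pipe_step, and the local-UI methods would
# wrap the PipelineLiveProgress lifecycle shown in the removed block below.
class PipelineProgressSketch:
    def __init__(self, ctx):
        self._ctx = ctx

    def _ui_and_pipe_index(self):
        ui = getattr(self._ctx, "get_live_progress", lambda: None)()
        stage = getattr(self._ctx, "get_stage_context", lambda: None)()
        idx = getattr(stage, "pipe_index", 0) if stage is not None else 0
        return ui, int(idx or 0)

    def begin_steps(self, total_steps):
        ui, idx = self._ui_and_pipe_index()
        if ui is not None:
            ui.begin_pipe_steps(idx, total_steps=int(total_steps))

    def step(self, text):
        ui, idx = self._ui_and_pipe_index()
        if ui is not None:
            ui.advance_pipe_step(idx, str(text))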
- local_progress_ui = None - try: - existing_ui = pipeline_context.get_live_progress() if hasattr(pipeline_context, "get_live_progress") else None - except Exception: - existing_ui = None - try: - if existing_ui is None and bool(getattr(sys.stderr, "isatty", lambda: False)()): - from models import PipelineLiveProgress - - local_progress_ui = PipelineLiveProgress(["screen-shot"], enabled=True) - local_progress_ui.start() - try: - if hasattr(pipeline_context, "set_live_progress"): - pipeline_context.set_live_progress(local_progress_ui) - except Exception: - pass - try: - local_progress_ui.begin_pipe(0, total_items=len(url_to_process), items_preview=[u for u, _ in url_to_process]) - except Exception: - pass - except Exception: - local_progress_ui = None - # ======================================================================== # OUTPUT DIRECTORY RESOLUTION - Priority chain # ======================================================================== @@ -749,6 +854,18 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: ensure_directory(screenshot_dir) + # If the caller isn't running the shared pipeline Live progress UI (e.g. direct + # cmdlet execution), start a minimal local pipeline progress panel so this cmdlet + # still shows step-level progress. + try: + progress.ensure_local_ui( + label="screen-shot", + total_items=len(url_to_process), + items_preview=[u for u, _ in url_to_process], + ) + except Exception: + pass + # ======================================================================== # PREPARE SCREENSHOT OPTIONS # ======================================================================== @@ -850,7 +967,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: options.target_selectors = auto_selectors debug(f"[screen_shot] Auto selectors matched for url: {auto_selectors}") - screenshot_result = _capture_screenshot(options) + screenshot_result = _capture_screenshot(options, progress) # Log results and warnings debug(f"Screenshot captured to {screenshot_result.path}") @@ -875,15 +992,18 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: capture_date = datetime.now().date().isoformat() upstream_title = _clean_title(_extract_item_title(origin_item)) - display_title = upstream_title or url + url_title = _title_from_url(url) + display_title = upstream_title or url_title or url upstream_tags = _extract_item_tags(origin_item) filtered_upstream_tags = [ t for t in upstream_tags if not str(t).strip().lower().startswith(("type:", "date:")) ] + + url_tags = _tags_from_url(url) merged_tags = unique_preserve_order( - ["type:screenshot", f"date:{capture_date}"] + filtered_upstream_tags + ["type:screenshot", f"date:{capture_date}"] + filtered_upstream_tags + url_tags ) pipe_obj = create_pipe_object_result( @@ -910,11 +1030,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: all_emitted.append(pipe_obj) # If we created a local progress UI, advance it per completed item. 
- if local_progress_ui is not None: - try: - local_progress_ui.on_emit(0, pipe_obj) - except Exception: - pass + progress.on_emit(pipe_obj) except ScreenshotError as exc: log(f"Error taking screenshot of {url}: {exc}", file=sys.stderr) @@ -925,23 +1041,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: traceback.print_exc(file=sys.stderr) exit_code = 1 - try: - if local_progress_ui is not None: - try: - local_progress_ui.finish_pipe(0, force_complete=True) - except Exception: - pass - finally: - if local_progress_ui is not None: - try: - local_progress_ui.stop() - except Exception: - pass - try: - if hasattr(pipeline_context, "set_live_progress"): - pipeline_context.set_live_progress(None) - except Exception: - pass + progress.close_local_ui(force_complete=True) if not all_emitted: log(f"No screenshots were successfully captured", file=sys.stderr) diff --git a/cmdnat/matrix.py b/cmdnat/matrix.py index e973461..5e930e2 100644 --- a/cmdnat/matrix.py +++ b/cmdnat/matrix.py @@ -336,6 +336,18 @@ def _resolve_upload_path(item: Any, config: Dict[str, Any]) -> Optional[str]: def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: # Internal stage: send previously selected items to selected rooms. if any(str(a).lower() == "-send" for a in (args or [])): + # Ensure we don't re-print the rooms picker table on the send stage. + try: + if hasattr(ctx, "set_last_result_table_overlay"): + ctx.set_last_result_table_overlay(None, None, None) + except Exception: + pass + try: + if hasattr(ctx, "set_current_stage_table"): + ctx.set_current_stage_table(None) + except Exception: + pass + rooms = _normalize_to_list(result) room_ids: List[str] = [] for r in rooms: @@ -430,7 +442,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: log("No joined rooms found.", file=sys.stderr) return 0 - table = ResultTable("Matrix Rooms") + table = ResultTable("Matrix Rooms (select with @N)") table.set_table("matrix") table.set_source_command(".matrix", []) @@ -461,12 +473,6 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: ctx.set_last_result_table_overlay(table, room_items) ctx.set_current_stage_table(table) ctx.set_pending_pipeline_tail([[".matrix", "-send"]], ".matrix") - - print() - from rich_display import stdout_console - - stdout_console().print(table) - print("\nSelect room(s) with @N (e.g. @1 or @1-3) to send the selected item(s)") return 0 CMDLET = Cmdlet( diff --git a/readme.md b/readme.md index 3c9c216..7674a36 100644 --- a/readme.md +++ b/readme.md @@ -1,6 +1,6 @@ # Medios-Macina -Medios-Macina is a CLI-first media ingestion and management toolkit focused on reliably downloading, tagging, and storing media (audio, video, images, and text) from a variety of providers and sources. It is designed around a compact, pipeable command language ("cmdlets") so complex workflows can be composed simply and repeatably. +Medios-Macina is a CLI media manager and toolkit focused on downloading, tagging, and media storage (audio, video, images, and text) from a variety of providers and sources. It is designed around a compact, pipeable command language ("cmdlets") so complex workflows can be composed simply and repeatably. ## Highlights ✅ - Flexible pipeline-based CLI: chain cmdlets with `|` and use saved selections with `@N`.