nose
2025-12-22 02:11:53 -08:00
parent d0b821b5dd
commit 16316bb3fd
20 changed files with 4218 additions and 2422 deletions


@@ -147,7 +147,11 @@ class HydrusNetwork:
file_size = file_path.stat().st_size
headers["Content-Type"] = spec.content_type or "application/octet-stream"
headers["Content-Length"] = str(file_size)
# Do not set Content-Length when streaming an iterator body.
# If the file size changes between stat() and read() (or the source is truncated),
# h11 will raise: "Too little data for declared Content-Length".
# Let httpx choose chunked transfer encoding for safety.
headers.pop("Content-Length", None)
logger.debug(f"{self._log_prefix()} Uploading file {file_path.name} ({file_size} bytes)")
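Illustrative sketch (not from this patch) of the streaming behavior the comment relies on: with an iterator body and no Content-Length header, httpx falls back to chunked transfer encoding, so a size change between stat() and read() cannot trip h11's length check. Names below are assumptions.

import httpx

def upload_streaming(url: str, file_path: str) -> httpx.Response:
    def body():
        # Read in chunks so no total size is ever declared up front.
        with open(file_path, "rb") as fh:
            while True:
                chunk = fh.read(64 * 1024)
                if not chunk:
                    break
                yield chunk

    headers = {"Content-Type": "application/octet-stream"}  # deliberately no Content-Length
    with httpx.Client() as client:
        return client.post(url, content=body(), headers=headers)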

CLI.py

@@ -1245,25 +1245,17 @@ class PipelineExecutor:
stages.append(current)
return stages
def execute_tokens(self, tokens: List[str]) -> None:
from cmdlet import REGISTRY
import pipeline as ctx
@staticmethod
def _try_clear_pipeline_stop(ctx: Any) -> None:
try:
try:
if hasattr(ctx, "clear_pipeline_stop"):
ctx.clear_pipeline_stop()
except Exception:
pass
stages = self._split_stages(tokens)
if not stages:
print("Invalid pipeline syntax\n")
return
pending_tail = ctx.get_pending_pipeline_tail() if hasattr(ctx, "get_pending_pipeline_tail") else []
pending_source = ctx.get_pending_pipeline_source() if hasattr(ctx, "get_pending_pipeline_source") else None
if hasattr(ctx, "clear_pipeline_stop"):
ctx.clear_pipeline_stop()
except Exception:
pass
@staticmethod
def _maybe_seed_current_stage_table(ctx: Any) -> None:
try:
if hasattr(ctx, "get_current_stage_table") and not ctx.get_current_stage_table():
display_table = ctx.get_display_table() if hasattr(ctx, "get_display_table") else None
if display_table:
@@ -1272,188 +1264,512 @@ class PipelineExecutor:
last_table = ctx.get_last_result_table() if hasattr(ctx, "get_last_result_table") else None
if last_table:
ctx.set_current_stage_table(last_table)
except Exception:
pass
@staticmethod
def _maybe_apply_pending_pipeline_tail(ctx: Any, stages: List[List[str]]) -> List[List[str]]:
try:
pending_tail = ctx.get_pending_pipeline_tail() if hasattr(ctx, "get_pending_pipeline_tail") else []
pending_source = ctx.get_pending_pipeline_source() if hasattr(ctx, "get_pending_pipeline_source") else None
except Exception:
pending_tail = []
pending_source = None
try:
current_source = (
ctx.get_current_stage_table_source_command() if hasattr(ctx, "get_current_stage_table_source_command") else None
ctx.get_current_stage_table_source_command()
if hasattr(ctx, "get_current_stage_table_source_command")
else None
)
except Exception:
current_source = None
try:
effective_source = current_source or (
ctx.get_last_result_table_source_command() if hasattr(ctx, "get_last_result_table_source_command") else None
ctx.get_last_result_table_source_command()
if hasattr(ctx, "get_last_result_table_source_command")
else None
)
selection_only = len(stages) == 1 and stages[0] and stages[0][0].startswith("@")
if pending_tail and selection_only:
if (pending_source is None) or (effective_source and pending_source == effective_source):
stages.extend(pending_tail)
except Exception:
effective_source = current_source
selection_only = bool(len(stages) == 1 and stages[0] and stages[0][0].startswith("@"))
if pending_tail and selection_only:
if (pending_source is None) or (effective_source and pending_source == effective_source):
stages = list(stages) + list(pending_tail)
try:
if hasattr(ctx, "clear_pending_pipeline_tail"):
ctx.clear_pending_pipeline_tail()
elif hasattr(ctx, "clear_pending_pipeline_tail"):
ctx.clear_pending_pipeline_tail()
config = self._config_loader.load()
if isinstance(config, dict):
# This executor is used by both the REPL and the `pipeline` subcommand.
# Quiet/background mode is helpful for detached/background runners, but
# it suppresses interactive UX (like the pipeline Live progress UI).
config["_quiet_background_output"] = bool(self._toolbar_output is None)
def _resolve_items_for_selection(table_obj, items_list):
return items_list if items_list else []
def _maybe_run_class_selector(selected_items: list, *, stage_is_last: bool) -> bool:
if not stage_is_last:
return False
candidates: list[str] = []
seen: set[str] = set()
def _add(value) -> None:
try:
text = str(value or "").strip().lower()
except Exception:
return
if not text or text in seen:
return
seen.add(text)
candidates.append(text)
try:
current_table = ctx.get_current_stage_table() or ctx.get_last_result_table()
_add(current_table.table if current_table and hasattr(current_table, "table") else None)
except Exception:
pass
for item in selected_items or []:
if isinstance(item, dict):
_add(item.get("provider"))
_add(item.get("store"))
_add(item.get("table"))
else:
_add(getattr(item, "provider", None))
_add(getattr(item, "store", None))
_add(getattr(item, "table", None))
else:
try:
from ProviderCore.registry import get_provider, is_known_provider_name
if hasattr(ctx, "clear_pending_pipeline_tail"):
ctx.clear_pending_pipeline_tail()
except Exception:
get_provider = None # type: ignore
is_known_provider_name = None # type: ignore
pass
return stages
if get_provider is not None:
for key in candidates:
try:
if is_known_provider_name is not None and (not is_known_provider_name(key)):
continue
except Exception:
# If the predicate fails for any reason, fall back to legacy behavior.
pass
try:
provider = get_provider(key, config)
except Exception:
continue
selector = getattr(provider, "selector", None)
if selector is None:
continue
try:
handled = bool(selector(selected_items, ctx=ctx, stage_is_last=True))
except Exception as exc:
print(f"{key} selector failed: {exc}\n")
return True
if handled:
return True
def _apply_quiet_background_flag(self, config: Any) -> Any:
if isinstance(config, dict):
# This executor is used by both the REPL and the `pipeline` subcommand.
# Quiet/background mode is helpful for detached/background runners, but
# it suppresses interactive UX (like the pipeline Live progress UI).
config["_quiet_background_output"] = bool(self._toolbar_output is None)
return config
store_keys: list[str] = []
for item in selected_items or []:
if isinstance(item, dict):
v = item.get("store")
else:
v = getattr(item, "store", None)
name = str(v or "").strip()
if name:
store_keys.append(name)
@staticmethod
def _extract_first_stage_selection_tokens(stages: List[List[str]]) -> tuple[List[List[str]], List[int], bool, bool]:
first_stage_tokens = stages[0] if stages else []
first_stage_selection_indices: List[int] = []
first_stage_had_extra_args = False
first_stage_select_all = False
if store_keys:
if first_stage_tokens:
new_first_stage: List[str] = []
for token in first_stage_tokens:
if token.startswith("@"): # selection
selection = SelectionSyntax.parse(token)
if selection is not None:
first_stage_selection_indices = sorted([i - 1 for i in selection])
continue
if token == "@*":
first_stage_select_all = True
continue
new_first_stage.append(token)
if new_first_stage:
stages = list(stages)
stages[0] = new_first_stage
if first_stage_selection_indices or first_stage_select_all:
first_stage_had_extra_args = True
elif first_stage_selection_indices or first_stage_select_all:
stages = list(stages)
stages.pop(0)
return stages, first_stage_selection_indices, first_stage_had_extra_args, first_stage_select_all
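Illustrative outcomes of the token split above, assuming SelectionSyntax.parse yields 1-based row numbers (the i - 1 conversion implies this):

# ["@2", "-tag", "x"]  -> selection indices [1], remaining first stage ["-tag", "x"]
# ["@1-3"]             -> selection indices [0, 1, 2], first stage removed entirely
# ["@*"]               -> select-all flag set; indices are resolved later from the last result items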
@staticmethod
def _apply_select_all_if_requested(ctx: Any, indices: List[int], select_all: bool) -> List[int]:
if not select_all:
return indices
try:
last_items = ctx.get_last_result_items()
except Exception:
last_items = None
if last_items:
return list(range(len(last_items)))
return indices
@staticmethod
def _maybe_run_class_selector(ctx: Any, config: Any, selected_items: list, *, stage_is_last: bool) -> bool:
if not stage_is_last:
return False
candidates: list[str] = []
seen: set[str] = set()
def _add(value) -> None:
try:
text = str(value or "").strip().lower()
except Exception:
return
if not text or text in seen:
return
seen.add(text)
candidates.append(text)
try:
current_table = ctx.get_current_stage_table() or ctx.get_last_result_table()
_add(current_table.table if current_table and hasattr(current_table, "table") else None)
except Exception:
pass
for item in selected_items or []:
if isinstance(item, dict):
_add(item.get("provider"))
_add(item.get("store"))
_add(item.get("table"))
else:
_add(getattr(item, "provider", None))
_add(getattr(item, "store", None))
_add(getattr(item, "table", None))
try:
from ProviderCore.registry import get_provider, is_known_provider_name
except Exception:
get_provider = None # type: ignore
is_known_provider_name = None # type: ignore
if get_provider is not None:
for key in candidates:
try:
if is_known_provider_name is not None and (not is_known_provider_name(key)):
continue
except Exception:
# If the predicate fails for any reason, fall back to legacy behavior.
pass
try:
provider = get_provider(key, config)
except Exception:
continue
selector = getattr(provider, "selector", None)
if selector is None:
continue
try:
handled = bool(selector(selected_items, ctx=ctx, stage_is_last=True))
except Exception as exc:
print(f"{key} selector failed: {exc}\n")
return True
if handled:
return True
store_keys: list[str] = []
for item in selected_items or []:
if isinstance(item, dict):
v = item.get("store")
else:
v = getattr(item, "store", None)
name = str(v or "").strip()
if name:
store_keys.append(name)
if store_keys:
try:
from Store.registry import Store as StoreRegistry
store_registry = StoreRegistry(config, suppress_debug=True)
_backend_names = list(store_registry.list_backends() or [])
_backend_by_lower = {str(n).lower(): str(n) for n in _backend_names if str(n).strip()}
for name in store_keys:
resolved_name = name
if not store_registry.is_available(resolved_name):
resolved_name = _backend_by_lower.get(str(name).lower(), name)
if not store_registry.is_available(resolved_name):
continue
backend = store_registry[resolved_name]
selector = getattr(backend, "selector", None)
if selector is None:
continue
handled = bool(selector(selected_items, ctx=ctx, stage_is_last=True))
if handled:
return True
except Exception:
pass
return False
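A hypothetical provider-side hook matching what _maybe_run_class_selector calls; real providers may attach selector differently:

class ExampleProvider:
    @staticmethod
    def selector(selected_items, *, ctx, stage_is_last) -> bool:
        # Return True when the selection was fully handled here;
        # _maybe_run_class_selector then skips the default auto-download handling.
        return False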
def _maybe_enable_background_notifier(self, worker_manager: Any, config: Any, pipeline_session: Any) -> None:
if not (pipeline_session and worker_manager and isinstance(config, dict)):
return
session_worker_ids = config.get("_session_worker_ids")
if not session_worker_ids:
return
try:
output_fn = self._toolbar_output
quiet_mode = bool(config.get("_quiet_background_output"))
terminal_only = quiet_mode and not output_fn
kwargs: Dict[str, Any] = {
"session_worker_ids": session_worker_ids,
"only_terminal_updates": terminal_only,
"overlay_mode": bool(output_fn),
}
if output_fn:
kwargs["output"] = output_fn
ensure_background_notifier(worker_manager, **kwargs)
except Exception:
pass
@staticmethod
def _get_raw_stage_texts(ctx: Any) -> List[str]:
raw_stage_texts: List[str] = []
try:
if hasattr(ctx, "get_current_command_stages"):
raw_stage_texts = ctx.get_current_command_stages() or []
except Exception:
raw_stage_texts = []
return raw_stage_texts
def _maybe_apply_initial_selection(
self,
ctx: Any,
config: Any,
stages: List[List[str]],
*,
selection_indices: List[int],
first_stage_had_extra_args: bool,
worker_manager: Any,
pipeline_session: Any,
) -> tuple[bool, Any]:
if not selection_indices:
return True, None
try:
if not ctx.get_current_stage_table_source_command():
display_table = ctx.get_display_table() if hasattr(ctx, "get_display_table") else None
table_for_stage = display_table or ctx.get_last_result_table()
if table_for_stage:
ctx.set_current_stage_table(table_for_stage)
except Exception:
pass
source_cmd = None
source_args_raw = None
try:
source_cmd = ctx.get_current_stage_table_source_command()
source_args_raw = ctx.get_current_stage_table_source_args()
except Exception:
source_cmd = None
source_args_raw = None
if isinstance(source_args_raw, str):
source_args: List[str] = [source_args_raw]
elif isinstance(source_args_raw, list):
source_args = [str(x) for x in source_args_raw if x is not None]
else:
source_args = []
current_table = None
try:
current_table = ctx.get_current_stage_table()
except Exception:
current_table = None
table_type = current_table.table if current_table and hasattr(current_table, "table") else None
command_expanded = False
if table_type in {"youtube", "soulseek"}:
command_expanded = False
elif source_cmd == "search-file" and source_args and "youtube" in source_args:
command_expanded = False
else:
selected_row_args: List[str] = []
skip_pipe_expansion = source_cmd == ".pipe" and len(stages) > 0
if source_cmd and not skip_pipe_expansion:
for idx in selection_indices:
row_args = ctx.get_current_stage_table_row_selection_args(idx)
if row_args:
selected_row_args.extend(row_args)
break
if selected_row_args:
if isinstance(source_cmd, list):
cmd_list: List[str] = [str(x) for x in source_cmd if x is not None]
elif isinstance(source_cmd, str):
cmd_list = [source_cmd]
else:
cmd_list = []
expanded_stage: List[str] = cmd_list + source_args + selected_row_args
if first_stage_had_extra_args and stages:
expanded_stage += stages[0]
stages[0] = expanded_stage
else:
stages.insert(0, expanded_stage)
if pipeline_session and worker_manager:
try:
from Store.registry import Store as StoreRegistry
store_registry = StoreRegistry(config, suppress_debug=True)
_backend_names = list(store_registry.list_backends() or [])
_backend_by_lower = {str(n).lower(): str(n) for n in _backend_names if str(n).strip()}
for name in store_keys:
resolved_name = name
if not store_registry.is_available(resolved_name):
resolved_name = _backend_by_lower.get(str(name).lower(), name)
if not store_registry.is_available(resolved_name):
continue
backend = store_registry[resolved_name]
selector = getattr(backend, "selector", None)
if selector is None:
continue
handled = bool(selector(selected_items, ctx=ctx, stage_is_last=True))
if handled:
return True
worker_manager.log_step(
pipeline_session.worker_id,
f"@N expansion: {source_cmd} + {' '.join(str(x) for x in selected_row_args)}",
)
except Exception:
pass
return False
selection_indices = []
command_expanded = True
first_stage_tokens = stages[0] if stages else []
first_stage_selection_indices: List[int] = []
first_stage_had_extra_args = False
first_stage_select_all = False
if (not command_expanded) and selection_indices:
last_piped_items = None
try:
last_piped_items = ctx.get_last_result_items()
except Exception:
last_piped_items = None
if first_stage_tokens:
new_first_stage: List[str] = []
for token in first_stage_tokens:
if token.startswith("@"): # selection
selection = SelectionSyntax.parse(token)
if selection is not None:
first_stage_selection_indices = sorted([i - 1 for i in selection])
continue
if token == "@*":
first_stage_select_all = True
continue
new_first_stage.append(token)
stage_table = None
try:
stage_table = ctx.get_current_stage_table()
except Exception:
stage_table = None
if not stage_table and hasattr(ctx, "get_display_table"):
try:
stage_table = ctx.get_display_table()
except Exception:
stage_table = None
if not stage_table:
try:
stage_table = ctx.get_last_result_table()
except Exception:
stage_table = None
if new_first_stage:
stages[0] = new_first_stage
if first_stage_selection_indices or first_stage_select_all:
first_stage_had_extra_args = True
elif first_stage_selection_indices or first_stage_select_all:
stages.pop(0)
resolved_items = last_piped_items if last_piped_items else []
if last_piped_items:
filtered = [resolved_items[i] for i in selection_indices if 0 <= i < len(resolved_items)]
if not filtered:
print("No items matched selection in pipeline\n")
return False, None
if first_stage_select_all:
last_items = ctx.get_last_result_items()
if last_items:
first_stage_selection_indices = list(range(len(last_items)))
if PipelineExecutor._maybe_run_class_selector(ctx, config, filtered, stage_is_last=(not stages)):
return False, None
from cmdlet._shared import coerce_to_pipe_object
filtered_pipe_objs = [coerce_to_pipe_object(item) for item in filtered]
piped_result = filtered_pipe_objs if len(filtered_pipe_objs) > 1 else filtered_pipe_objs[0]
if pipeline_session and worker_manager:
try:
selection_parts = [f"@{i+1}" for i in selection_indices]
worker_manager.log_step(
pipeline_session.worker_id,
f"Applied @N selection {' | '.join(selection_parts)}",
)
except Exception:
pass
# Auto-insert downloader stages for provider tables.
try:
current_table = ctx.get_current_stage_table() or ctx.get_last_result_table()
except Exception:
current_table = None
table_type = current_table.table if current_table and hasattr(current_table, "table") else None
if not stages:
if table_type == "youtube":
print("Auto-running YouTube selection via download-media")
stages.append(["download-media"])
elif table_type == "bandcamp":
print("Auto-running Bandcamp selection via download-media")
stages.append(["download-media"])
elif table_type in {"soulseek", "openlibrary", "libgen"}:
print("Auto-piping selection to download-file")
stages.append(["download-file"])
else:
first_cmd = stages[0][0] if stages and stages[0] else None
if table_type == "soulseek" and first_cmd not in (
"download-file",
"download-media",
"download_media",
".pipe",
):
debug("Auto-inserting download-file after Soulseek selection")
stages.insert(0, ["download-file"])
if table_type == "youtube" and first_cmd not in (
"download-media",
"download_media",
"download-file",
".pipe",
):
debug("Auto-inserting download-media after YouTube selection")
stages.insert(0, ["download-media"])
if table_type == "bandcamp" and first_cmd not in (
"download-media",
"download_media",
"download-file",
".pipe",
):
print("Auto-inserting download-media after Bandcamp selection")
stages.insert(0, ["download-media"])
if table_type == "libgen" and first_cmd not in (
"download-file",
"download-media",
"download_media",
".pipe",
):
print("Auto-inserting download-file after Libgen selection")
stages.insert(0, ["download-file"])
return True, piped_result
else:
print("No previous results to select from\n")
return False, None
return True, None
@staticmethod
def _maybe_start_live_progress(config: Any, stages: List[List[str]]) -> tuple[Any, Dict[int, int]]:
progress_ui = None
pipe_index_by_stage: Dict[int, int] = {}
try:
quiet_mode = bool(config.get("_quiet_background_output")) if isinstance(config, dict) else False
except Exception:
quiet_mode = False
try:
import sys as _sys
if (not quiet_mode) and bool(getattr(_sys.stderr, "isatty", lambda: False)()):
from models import PipelineLiveProgress
pipe_stage_indices: List[int] = []
pipe_labels: List[str] = []
for idx, stage_tokens in enumerate(stages):
if not stage_tokens:
continue
name = str(stage_tokens[0]).replace("_", "-").lower()
if name == "@" or name.startswith("@"):
continue
# `.pipe` (MPV) is an interactive launcher; disable pipeline Live progress
# for it because it doesn't meaningfully "complete" (mpv may keep running)
# and Live output interferes with MPV playlist UI.
if name == ".pipe":
continue
# `.matrix` uses a two-phase picker (@N then .matrix -send). Pipeline Live
# progress can linger across those phases and interfere with interactive output.
if name == ".matrix":
continue
pipe_stage_indices.append(idx)
pipe_labels.append(name)
if pipe_labels:
progress_ui = PipelineLiveProgress(pipe_labels, enabled=True)
progress_ui.start()
try:
import pipeline as _pipeline_ctx
if hasattr(_pipeline_ctx, "set_live_progress"):
_pipeline_ctx.set_live_progress(progress_ui)
except Exception:
pass
pipe_index_by_stage = {stage_idx: pipe_idx for pipe_idx, stage_idx in enumerate(pipe_stage_indices)}
except Exception:
progress_ui = None
pipe_index_by_stage = {}
return progress_ui, pipe_index_by_stage
def execute_tokens(self, tokens: List[str]) -> None:
from cmdlet import REGISTRY
import pipeline as ctx
try:
self._try_clear_pipeline_stop(ctx)
stages = self._split_stages(tokens)
if not stages:
print("Invalid pipeline syntax\n")
return
self._maybe_seed_current_stage_table(ctx)
stages = self._maybe_apply_pending_pipeline_tail(ctx, stages)
config = self._config_loader.load()
config = self._apply_quiet_background_flag(config)
stages, first_stage_selection_indices, first_stage_had_extra_args, first_stage_select_all = (
self._extract_first_stage_selection_tokens(stages)
)
first_stage_selection_indices = self._apply_select_all_if_requested(
ctx, first_stage_selection_indices, first_stage_select_all
)
piped_result: Any = None
worker_manager = WorkerManagerRegistry.ensure(config)
pipeline_text = " | ".join(" ".join(stage) for stage in stages)
pipeline_session = WorkerStages.begin_pipeline(worker_manager, pipeline_text=pipeline_text, config=config)
raw_stage_texts: List[str] = []
try:
if hasattr(ctx, "get_current_command_stages"):
raw_stage_texts = ctx.get_current_command_stages() or []
except Exception:
raw_stage_texts = []
if pipeline_session and worker_manager and isinstance(config, dict):
session_worker_ids = config.get("_session_worker_ids")
if session_worker_ids:
try:
output_fn = self._toolbar_output
quiet_mode = bool(config.get("_quiet_background_output"))
terminal_only = quiet_mode and not output_fn
kwargs: Dict[str, Any] = {
"session_worker_ids": session_worker_ids,
"only_terminal_updates": terminal_only,
"overlay_mode": bool(output_fn),
}
if output_fn:
kwargs["output"] = output_fn
ensure_background_notifier(worker_manager, **kwargs)
except Exception:
pass
raw_stage_texts = self._get_raw_stage_texts(ctx)
self._maybe_enable_background_notifier(worker_manager, config, pipeline_session)
pipeline_status = "completed"
pipeline_error = ""
@@ -1462,201 +1778,24 @@ class PipelineExecutor:
pipe_index_by_stage: Dict[int, int] = {}
try:
if first_stage_selection_indices:
if not ctx.get_current_stage_table_source_command():
display_table = ctx.get_display_table() if hasattr(ctx, "get_display_table") else None
table_for_stage = display_table or ctx.get_last_result_table()
if table_for_stage:
ctx.set_current_stage_table(table_for_stage)
source_cmd = ctx.get_current_stage_table_source_command()
source_args_raw = ctx.get_current_stage_table_source_args()
if isinstance(source_args_raw, str):
source_args: List[str] = [source_args_raw]
elif isinstance(source_args_raw, list):
source_args = [str(x) for x in source_args_raw if x is not None]
else:
source_args = []
current_table = ctx.get_current_stage_table()
table_type = current_table.table if current_table and hasattr(current_table, "table") else None
command_expanded = False
if table_type in {"youtube", "soulseek"}:
command_expanded = False
elif source_cmd == "search-file" and source_args and "youtube" in source_args:
command_expanded = False
else:
selected_row_args: List[str] = []
skip_pipe_expansion = source_cmd == ".pipe" and len(stages) > 0
if source_cmd and not skip_pipe_expansion:
for idx in first_stage_selection_indices:
row_args = ctx.get_current_stage_table_row_selection_args(idx)
if row_args:
selected_row_args.extend(row_args)
break
if selected_row_args:
if isinstance(source_cmd, list):
cmd_list: List[str] = [str(x) for x in source_cmd if x is not None]
elif isinstance(source_cmd, str):
cmd_list = [source_cmd]
else:
cmd_list = []
expanded_stage: List[str] = cmd_list + source_args + selected_row_args
if first_stage_had_extra_args and stages:
expanded_stage += stages[0]
stages[0] = expanded_stage
else:
stages.insert(0, expanded_stage)
if pipeline_session and worker_manager:
try:
worker_manager.log_step(
pipeline_session.worker_id,
f"@N expansion: {source_cmd} + {' '.join(str(x) for x in selected_row_args)}",
)
except Exception:
pass
first_stage_selection_indices = []
command_expanded = True
if not command_expanded and first_stage_selection_indices:
last_piped_items = ctx.get_last_result_items()
stage_table = ctx.get_current_stage_table()
if not stage_table and hasattr(ctx, "get_display_table"):
stage_table = ctx.get_display_table()
if not stage_table:
stage_table = ctx.get_last_result_table()
resolved_items = _resolve_items_for_selection(stage_table, last_piped_items)
if last_piped_items:
filtered = [
resolved_items[i]
for i in first_stage_selection_indices
if 0 <= i < len(resolved_items)
]
if not filtered:
print("No items matched selection in pipeline\n")
return
if _maybe_run_class_selector(filtered, stage_is_last=(not stages)):
return
from cmdlet._shared import coerce_to_pipe_object
filtered_pipe_objs = [coerce_to_pipe_object(item) for item in filtered]
piped_result = filtered_pipe_objs if len(filtered_pipe_objs) > 1 else filtered_pipe_objs[0]
if pipeline_session and worker_manager:
try:
selection_parts = [f"@{i+1}" for i in first_stage_selection_indices]
worker_manager.log_step(
pipeline_session.worker_id,
f"Applied @N selection {' | '.join(selection_parts)}",
)
except Exception:
pass
# Auto-insert downloader stages for provider tables.
current_table = ctx.get_current_stage_table() or ctx.get_last_result_table()
table_type = current_table.table if current_table and hasattr(current_table, "table") else None
if not stages:
if table_type == "youtube":
print("Auto-running YouTube selection via download-media")
stages.append(["download-media"])
elif table_type == "bandcamp":
print("Auto-running Bandcamp selection via download-media")
stages.append(["download-media"])
elif table_type in {"soulseek", "openlibrary", "libgen"}:
print("Auto-piping selection to download-file")
stages.append(["download-file"])
else:
first_cmd = stages[0][0] if stages and stages[0] else None
if table_type == "soulseek" and first_cmd not in (
"download-file",
"download-media",
"download_media",
".pipe",
):
debug("Auto-inserting download-file after Soulseek selection")
stages.insert(0, ["download-file"])
if table_type == "youtube" and first_cmd not in (
"download-media",
"download_media",
"download-file",
".pipe",
):
debug("Auto-inserting download-media after YouTube selection")
stages.insert(0, ["download-media"])
if table_type == "bandcamp" and first_cmd not in (
"download-media",
"download_media",
"download-file",
".pipe",
):
print("Auto-inserting download-media after Bandcamp selection")
stages.insert(0, ["download-media"])
if table_type == "libgen" and first_cmd not in (
"download-file",
"download-media",
"download_media",
".pipe",
):
print("Auto-inserting download-file after Libgen selection")
stages.insert(0, ["download-file"])
else:
print("No previous results to select from\n")
return
ok, initial_piped = self._maybe_apply_initial_selection(
ctx,
config,
stages,
selection_indices=first_stage_selection_indices,
first_stage_had_extra_args=first_stage_had_extra_args,
worker_manager=worker_manager,
pipeline_session=pipeline_session,
)
if not ok:
return
if initial_piped is not None:
piped_result = initial_piped
# ------------------------------------------------------------------
# Multi-level pipeline progress (pipes = stages, tasks = items)
# ------------------------------------------------------------------
try:
quiet_mode = bool(config.get("_quiet_background_output")) if isinstance(config, dict) else False
except Exception:
quiet_mode = False
try:
import sys as _sys
if (not quiet_mode) and bool(getattr(_sys.stderr, "isatty", lambda: False)()):
from models import PipelineLiveProgress
pipe_stage_indices: List[int] = []
pipe_labels: List[str] = []
for idx, tokens in enumerate(stages):
if not tokens:
continue
name = str(tokens[0]).replace("_", "-").lower()
if name == "@" or name.startswith("@"):
continue
# `.pipe` (MPV) is an interactive launcher; disable pipeline Live progress
# for it because it doesn't meaningfully "complete" (mpv may keep running)
# and Live output interferes with MPV playlist UI.
if name == ".pipe":
continue
pipe_stage_indices.append(idx)
pipe_labels.append(name)
if pipe_labels:
progress_ui = PipelineLiveProgress(pipe_labels, enabled=True)
progress_ui.start()
try:
import pipeline as _pipeline_ctx
if hasattr(_pipeline_ctx, "set_live_progress"):
_pipeline_ctx.set_live_progress(progress_ui)
except Exception:
pass
pipe_index_by_stage = {stage_idx: pipe_idx for pipe_idx, stage_idx in enumerate(pipe_stage_indices)}
except Exception:
progress_ui = None
pipe_index_by_stage = {}
progress_ui, pipe_index_by_stage = self._maybe_start_live_progress(config, stages)
for stage_index, stage_tokens in enumerate(stages):
if not stage_tokens:
@@ -1707,7 +1846,7 @@ class PipelineExecutor:
if not stage_table:
stage_table = ctx.get_last_result_table()
items_list = ctx.get_last_result_items() or []
resolved_items = _resolve_items_for_selection(stage_table, items_list)
resolved_items = items_list if items_list else []
filtered = [resolved_items[i] for i in selected_indices if 0 <= i < len(resolved_items)]
if not filtered:
print("No items matched selection\n")
@@ -1715,7 +1854,7 @@ class PipelineExecutor:
pipeline_error = "Empty selection"
return
if _maybe_run_class_selector(filtered, stage_is_last=(stage_index + 1 >= len(stages))):
if PipelineExecutor._maybe_run_class_selector(ctx, config, filtered, stage_is_last=(stage_index + 1 >= len(stages))):
return
# Special case: selecting multiple tags from get-tag and piping into delete-tag
@@ -1841,9 +1980,11 @@ class PipelineExecutor:
on_emit = None
if progress_ui is not None and pipe_idx is not None:
def _on_emit(obj: Any, _idx: int = int(pipe_idx)) -> None:
_ui = cast(Any, progress_ui)
def _on_emit(obj: Any, _idx: int = int(pipe_idx), _progress=_ui) -> None:
try:
progress_ui.on_emit(_idx, obj)
_progress.on_emit(_idx, obj)
except Exception:
pass
on_emit = _on_emit


@@ -23,6 +23,15 @@ except ImportError:
class Libgen(Provider):
# Domains that should be routed to this provider when the user supplies a URL.
# (Used by ProviderCore.registry.match_provider_name_for_url)
URL_DOMAINS = (
"libgen.gl",
"libgen.li",
"libgen.is",
"libgen.rs",
"libgen.st",
)
"""Search provider for Library Genesis books."""
def search(


@@ -1,9 +1,11 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Type
from typing import Any, Dict, List, Optional, Type, cast
import requests
import sys
import json
import subprocess
from SYS.logger import log, debug
@@ -13,6 +15,12 @@ except ImportError: # pragma: no cover - optional
musicbrainzngs = None
try: # Optional dependency
import yt_dlp # type: ignore
except ImportError: # pragma: no cover - optional
yt_dlp = None
class MetadataProvider(ABC):
"""Base class for metadata providers (music, movies, books, etc.)."""
@@ -351,6 +359,157 @@ class MusicBrainzMetadataProvider(MetadataProvider):
return tags
class YtdlpMetadataProvider(MetadataProvider):
"""Metadata provider that extracts tags from a supported URL using yt-dlp.
This does NOT download media; it only probes metadata.
"""
@property
def name(self) -> str: # type: ignore[override]
return "ytdlp"
def _extract_info(self, url: str) -> Optional[Dict[str, Any]]:
url = (url or "").strip()
if not url:
return None
# Prefer Python module when available.
if yt_dlp is not None:
try:
opts: Any = {
"quiet": True,
"no_warnings": True,
"skip_download": True,
"noprogress": True,
"socket_timeout": 15,
"retries": 1,
"playlist_items": "1-10",
}
with yt_dlp.YoutubeDL(opts) as ydl: # type: ignore[attr-defined]
info = ydl.extract_info(url, download=False)
return cast(Dict[str, Any], info) if isinstance(info, dict) else None
except Exception:
pass
# Fallback to CLI.
try:
cmd = [
"yt-dlp",
"-J",
"--no-warnings",
"--skip-download",
"--playlist-items",
"1-10",
url,
]
proc = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
if proc.returncode != 0:
return None
payload = (proc.stdout or "").strip()
if not payload:
return None
data = json.loads(payload)
return data if isinstance(data, dict) else None
except Exception:
return None
def search(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
url = (query or "").strip()
if not url.startswith(("http://", "https://")):
return []
info = self._extract_info(url)
if not isinstance(info, dict):
return []
upload_date = str(info.get("upload_date") or "")
release_date = str(info.get("release_date") or "")
year = (release_date or upload_date)[:4] if (release_date or upload_date) else ""
# Provide basic columns for the standard metadata selection table.
# NOTE: This is best-effort; many extractors don't provide artist/album.
artist = (
info.get("artist")
or info.get("uploader")
or info.get("channel")
or ""
)
album = info.get("album") or info.get("playlist_title") or ""
title = info.get("title") or ""
return [
{
"title": title,
"artist": str(artist or ""),
"album": str(album or ""),
"year": str(year or ""),
"provider": self.name,
"url": url,
"raw": info,
}
]
def to_tags(self, item: Dict[str, Any]) -> List[str]:
raw = item.get("raw")
if not isinstance(raw, dict):
return super().to_tags(item)
tags: List[str] = []
try:
from metadata import extract_ytdlp_tags
except Exception:
extract_ytdlp_tags = None # type: ignore[assignment]
if extract_ytdlp_tags:
try:
tags.extend(extract_ytdlp_tags(raw))
except Exception:
pass
# Subtitle availability tags
def _langs(value: Any) -> List[str]:
if not isinstance(value, dict):
return []
out: List[str] = []
for k in value.keys():
if isinstance(k, str) and k.strip():
out.append(k.strip().lower())
return sorted(set(out))
# If this is a playlist container, subtitle/captions are usually per-entry.
info_for_subs: Dict[str, Any] = raw
entries = raw.get("entries")
if isinstance(entries, list) and entries:
first = entries[0]
if isinstance(first, dict):
info_for_subs = first
for lang in _langs(info_for_subs.get("subtitles")):
tags.append(f"subs:{lang}")
for lang in _langs(info_for_subs.get("automatic_captions")):
tags.append(f"subs_auto:{lang}")
# Always include source tag for parity with other providers.
tags.append(f"source:{self.name}")
# Dedup case-insensitively, preserve order.
seen = set()
out: List[str] = []
for t in tags:
if not isinstance(t, str):
continue
s = t.strip()
if not s:
continue
k = s.lower()
if k in seen:
continue
seen.add(k)
out.append(s)
return out
# Registry ---------------------------------------------------------------
_METADATA_PROVIDERS: Dict[str, Type[MetadataProvider]] = {
@@ -359,6 +518,7 @@ _METADATA_PROVIDERS: Dict[str, Type[MetadataProvider]] = {
"googlebooks": GoogleBooksMetadataProvider,
"google": GoogleBooksMetadataProvider,
"musicbrainz": MusicBrainzMetadataProvider,
"ytdlp": YtdlpMetadataProvider,
}
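Hypothetical lookup of the newly registered provider, assuming providers accept an optional config dict as list_metadata_providers below does:

provider = _METADATA_PROVIDERS["ytdlp"](None)
rows = provider.search("https://www.youtube.com/watch?v=dQw4w9WgXcQ", limit=1)
tags = provider.to_tags(rows[0]) if rows else []
# tags end with "source:ytdlp" and may include subs:<lang> / subs_auto:<lang> markers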
@@ -370,7 +530,7 @@ def list_metadata_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str
availability: Dict[str, bool] = {}
for name, cls in _METADATA_PROVIDERS.items():
try:
provider = cls(config)
_ = cls(config)
# Basic availability check: perform lightweight validation if defined
availability[name] = True
except Exception:


@@ -11,7 +11,8 @@ import sys
import tempfile
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, Callable, Dict, List, Optional, Tuple
from urllib.parse import urlparse
import requests
@@ -183,7 +184,44 @@ def _resolve_archive_id(session: requests.Session, edition_id: str, ia_candidate
return ""
def _archive_id_from_url(url: str) -> str:
"""Best-effort extraction of an Archive.org item identifier from a URL."""
u = str(url or "").strip()
if not u:
return ""
try:
p = urlparse(u)
host = (p.hostname or "").lower().strip()
if not host.endswith("archive.org"):
return ""
parts = [x for x in (p.path or "").split("/") if x]
except Exception:
return ""
# Common patterns:
# - /details/<id>/...
# - /borrow/<id>
# - /download/<id>/...
if len(parts) >= 2 and parts[0].lower() in {"details", "borrow", "download", "stream"}:
return str(parts[1]).strip()
# Sometimes the identifier is the first segment.
if len(parts) >= 1:
first = str(parts[0]).strip()
if first and first.lower() not in {"account", "services", "search", "advancedsearch.php"}:
return first
return ""
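Illustrative inputs and outputs for the helper above, given its parsing rules:

# "https://archive.org/details/someitem/page/n5"    -> "someitem"
# "https://archive.org/download/someitem/file.pdf"  -> "someitem"
# "https://openlibrary.org/books/OL1M"              -> ""  (host is not archive.org)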
class OpenLibrary(Provider):
# Domains that should be routed to this provider when the user supplies a URL.
# (Used by ProviderCore.registry.match_provider_name_for_url)
URL_DOMAINS = (
"openlibrary.org",
"archive.org",
)
"""Search provider for OpenLibrary books + Archive.org direct/borrow download."""
def __init__(self, config: Optional[Dict[str, Any]] = None):
@@ -311,6 +349,60 @@ class OpenLibrary(Provider):
pass
raise RuntimeError("Something went wrong when trying to return the book")
@staticmethod
def _archive_logout(session: requests.Session) -> None:
"""Best-effort logout from archive.org.
Archive sessions are cookie-based; returning the loan is the critical step.
Logout is attempted for cleanliness but failures should not abort the workflow.
"""
if session is None:
return
for url in (
"https://archive.org/account/logout",
"https://archive.org/account/logout.php",
):
try:
resp = session.get(url, timeout=15, allow_redirects=True)
code = int(getattr(resp, "status_code", 0) or 0)
if code and code < 500:
return
except Exception:
continue
@staticmethod
def _archive_is_lendable(book_id: str) -> tuple[bool, str]:
"""Heuristic lendable check using Archive.org item metadata.
Some lendable items do not map cleanly to an OpenLibrary edition id.
In practice, Archive metadata collections often include markers like:
- inlibrary
- printdisabled
"""
ident = str(book_id or "").strip()
if not ident:
return False, "no-archive-id"
try:
resp = requests.get(f"https://archive.org/metadata/{ident}", timeout=8)
resp.raise_for_status()
data = resp.json() if resp is not None else {}
meta = data.get("metadata", {}) if isinstance(data, dict) else {}
collection = meta.get("collection") if isinstance(meta, dict) else None
values: List[str] = []
if isinstance(collection, list):
values = [str(x).strip().lower() for x in collection if str(x).strip()]
elif isinstance(collection, str):
values = [collection.strip().lower()]
if any(v in {"inlibrary", "printdisabled", "lendinglibrary"} for v in values):
return True, "archive-collection"
return False, "archive-not-lendable"
except Exception:
return False, "archive-metadata-error"
@staticmethod
def _archive_get_book_infos(session: requests.Session, url: str) -> Tuple[str, List[str], Dict[str, Any]]:
"""Extract page links from Archive.org book reader."""
@@ -430,6 +522,7 @@ class OpenLibrary(Provider):
links: List[str],
scale: int,
book_id: str,
progress_callback: Optional[Callable[[int, int], None]] = None,
) -> List[str]:
links_scaled = [f"{link}&rotate=0&scale={scale}" for link in links]
pages = len(links_scaled)
@@ -448,7 +541,20 @@ class OpenLibrary(Provider):
pages=pages,
)
)
if tqdm:
if progress_callback is not None:
done = 0
total = len(tasks)
for fut in futures.as_completed(tasks):
try:
_ = fut.result()
except Exception:
pass
done += 1
try:
progress_callback(done, total)
except Exception:
pass
elif tqdm:
for _ in tqdm(futures.as_completed(tasks), total=len(tasks)): # type: ignore
pass
else:
@@ -904,15 +1010,20 @@ class OpenLibrary(Provider):
return results
def download(self, result: SearchResult, output_dir: Path) -> Optional[Path]:
def download(
self,
result: SearchResult,
output_dir: Path,
progress_callback: Optional[Callable[[str, int, Optional[int], str], None]] = None,
) -> Optional[Path]:
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
meta = result.full_metadata or {}
edition_id = str(meta.get("openlibrary_id") or "").strip()
if not edition_id:
log("[openlibrary] Missing openlibrary_id; cannot download", file=sys.stderr)
return None
# Accept direct Archive.org URLs too (details/borrow/download) even when no OL edition id is known.
archive_id = str(meta.get("archive_id") or "").strip()
ia_ids = meta.get("ia") or []
if isinstance(ia_ids, str):
@@ -921,12 +1032,23 @@ class OpenLibrary(Provider):
ia_ids = []
ia_candidates = [str(x) for x in ia_ids if x]
archive_id = _resolve_archive_id(self._session, edition_id, ia_candidates)
if not archive_id:
archive_id = _first_str(ia_candidates) or ""
if not archive_id and edition_id:
archive_id = _resolve_archive_id(self._session, edition_id, ia_candidates)
if not archive_id:
# Try to extract identifier from the SearchResult path (URL).
archive_id = _archive_id_from_url(str(getattr(result, "path", "") or ""))
if not archive_id:
log("[openlibrary] No archive identifier available; cannot download", file=sys.stderr)
return None
safe_title = sanitize_filename(result.title)
if not safe_title or "http" in safe_title.lower():
safe_title = sanitize_filename(archive_id) or "archive"
# 1) Direct download if available.
try:
@@ -935,8 +1057,22 @@ class OpenLibrary(Provider):
can_direct, pdf_url = False, ""
if can_direct and pdf_url:
try:
if progress_callback is not None:
progress_callback("step", 0, None, "direct download")
except Exception:
pass
out_path = unique_path(output_dir / f"{safe_title}.pdf")
ok = download_file(pdf_url, out_path, session=self._session)
ok = download_file(
pdf_url,
out_path,
session=self._session,
progress_callback=(
(lambda downloaded, total, label: progress_callback("bytes", downloaded, total, label))
if progress_callback is not None
else None
),
)
if ok:
return out_path
log("[openlibrary] Direct download failed", file=sys.stderr)
@@ -949,65 +1085,131 @@ class OpenLibrary(Provider):
log("[openlibrary] Archive credentials missing; cannot borrow", file=sys.stderr)
return None
lendable, reason = _check_lendable(self._session, edition_id)
lendable = True
reason = ""
if edition_id:
lendable, reason = _check_lendable(self._session, edition_id)
if not lendable:
# OpenLibrary API can be a false-negative; fall back to Archive metadata.
lendable2, reason2 = self._archive_is_lendable(archive_id)
if lendable2:
lendable, reason = True, reason2
else:
lendable, reason = self._archive_is_lendable(archive_id)
if not lendable:
log(f"[openlibrary] Not lendable: {reason}", file=sys.stderr)
return None
session = self._archive_login(email, password)
loaned = False
try:
session = self._archive_loan(session, archive_id, verbose=False)
except self.BookNotAvailableError:
log("[openlibrary] Book not available to borrow", file=sys.stderr)
return None
except Exception:
log("[openlibrary] Borrow failed", file=sys.stderr)
return None
urls = [f"https://archive.org/borrow/{archive_id}", f"https://archive.org/details/{archive_id}"]
title = safe_title
links: Optional[List[str]] = None
last_exc: Optional[Exception] = None
for u in urls:
try:
title_raw, links, _metadata = self._archive_get_book_infos(session, u)
if title_raw:
title = sanitize_filename(title_raw)
break
except Exception as exc:
last_exc = exc
continue
if not links:
log(f"[openlibrary] Failed to extract pages: {last_exc}", file=sys.stderr)
return None
temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=str(output_dir))
try:
images = self._archive_download(session=session, n_threads=10, directory=temp_dir, links=links, scale=3, book_id=archive_id)
pdf_bytes = _image_paths_to_pdf_bytes(images)
if not pdf_bytes:
# Keep images folder for manual conversion.
log("[openlibrary] PDF conversion failed; keeping images folder", file=sys.stderr)
return Path(temp_dir)
pdf_path = unique_path(output_dir / f"{title}.pdf")
with open(pdf_path, "wb") as f:
f.write(pdf_bytes)
try:
shutil.rmtree(temp_dir)
if progress_callback is not None:
progress_callback("step", 0, None, "login")
except Exception:
pass
return pdf_path
except Exception:
try:
shutil.rmtree(temp_dir)
session = self._archive_loan(session, archive_id, verbose=False)
loaned = True
except self.BookNotAvailableError:
log("[openlibrary] Book not available to borrow", file=sys.stderr)
return None
except Exception:
log("[openlibrary] Borrow failed", file=sys.stderr)
return None
try:
if progress_callback is not None:
progress_callback("step", 0, None, "borrow")
except Exception:
pass
urls = [f"https://archive.org/borrow/{archive_id}", f"https://archive.org/details/{archive_id}"]
title = safe_title
links: Optional[List[str]] = None
last_exc: Optional[Exception] = None
for u in urls:
try:
title_raw, links, _metadata = self._archive_get_book_infos(session, u)
if title_raw:
title = sanitize_filename(title_raw)
break
except Exception as exc:
last_exc = exc
continue
if not links:
log(f"[openlibrary] Failed to extract pages: {last_exc}", file=sys.stderr)
return None
try:
if progress_callback is not None:
progress_callback("step", 0, None, "download pages")
except Exception:
pass
temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=str(output_dir))
try:
images = self._archive_download(
session=session,
n_threads=10,
directory=temp_dir,
links=links,
scale=3,
book_id=archive_id,
progress_callback=(
(lambda done, total: progress_callback("pages", done, total, "pages"))
if progress_callback is not None
else None
),
)
pdf_bytes = _image_paths_to_pdf_bytes(images)
if not pdf_bytes:
# Keep images folder for manual conversion.
log("[openlibrary] PDF conversion failed; keeping images folder", file=sys.stderr)
return Path(temp_dir)
try:
if progress_callback is not None:
progress_callback("step", 0, None, "stitch pdf")
except Exception:
pass
pdf_path = unique_path(output_dir / f"{title}.pdf")
with open(pdf_path, "wb") as f:
f.write(pdf_bytes)
try:
shutil.rmtree(temp_dir)
except Exception:
pass
return pdf_path
except Exception:
try:
shutil.rmtree(temp_dir)
except Exception:
pass
raise
finally:
# Always return the loan after a successful borrow, even if download/stitch fails.
if loaned:
try:
if progress_callback is not None:
progress_callback("step", 0, None, "return book")
except Exception:
pass
try:
self._archive_return_loan(session, archive_id)
except Exception as exc:
log(f"[openlibrary] Warning: failed to return loan: {exc}", file=sys.stderr)
try:
self._archive_logout(session)
except Exception:
pass
raise
except Exception as exc:
log(f"[openlibrary] Borrow workflow error: {exc}", file=sys.stderr)
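A sketch of a consumer for the progress protocol used by OpenLibrary.download above, where the first argument is a phase kind ("step", "bytes", or "pages"); the print formatting is an assumption:

from typing import Optional

def on_progress(kind: str, done: int, total: Optional[int], label: str) -> None:
    if kind == "step":
        print(f"[openlibrary] step: {label}")
    else:  # "bytes" or "pages"
        suffix = f"/{total}" if total else ""
        print(f"[openlibrary] {label}: {done}{suffix}")

# provider.download(result, output_dir, progress_callback=on_progress)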


@@ -1,7 +1,7 @@
from __future__ import annotations
from pathlib import Path
from typing import Optional
from typing import Callable, Optional
import sys
import requests
@@ -22,13 +22,20 @@ def sanitize_filename(name: str, *, max_len: int = 150) -> str:
return cleaned[:max_len]
def download_file(url: str, output_path: Path, *, session: Optional[requests.Session] = None, timeout_s: float = 30.0) -> bool:
def download_file(
url: str,
output_path: Path,
*,
session: Optional[requests.Session] = None,
timeout_s: float = 30.0,
progress_callback: Optional[Callable[[int, Optional[int], str], None]] = None,
) -> bool:
output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
s = session or requests.Session()
bar = ProgressBar()
bar = ProgressBar() if progress_callback is None else None
downloaded = 0
total = None
@@ -41,9 +48,14 @@ def download_file(url: str, output_path: Path, *, session: Optional[requests.Ses
except Exception:
total = None
label = str(output_path.name or "download")
# Render once immediately so fast downloads still show something.
try:
bar.update(downloaded=0, total=total, label=str(output_path.name or "download"), file=sys.stderr)
if progress_callback is not None:
progress_callback(0, total, label)
elif bar is not None:
bar.update(downloaded=0, total=total, label=label, file=sys.stderr)
except Exception:
pass
@@ -53,18 +65,23 @@ def download_file(url: str, output_path: Path, *, session: Optional[requests.Ses
f.write(chunk)
downloaded += len(chunk)
try:
bar.update(downloaded=downloaded, total=total, label=str(output_path.name or "download"), file=sys.stderr)
if progress_callback is not None:
progress_callback(downloaded, total, label)
elif bar is not None:
bar.update(downloaded=downloaded, total=total, label=label, file=sys.stderr)
except Exception:
pass
try:
bar.finish()
if bar is not None:
bar.finish()
except Exception:
pass
return output_path.exists() and output_path.stat().st_size > 0
except Exception:
try:
bar.finish()
if bar is not None:
bar.finish()
except Exception:
pass
try:

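Minimal sketch of the new download_file hook (downloaded bytes, optional total, label); wiring it into a UI is left as an assumption:

def report(downloaded: int, total, label: str) -> None:
    if total:
        print(f"{label}: {downloaded * 100 // total}%")
    else:
        print(f"{label}: {downloaded} bytes")

# download_file(url, Path("out.pdf"), progress_callback=report)  # suppresses the internal ProgressBar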

@@ -6,8 +6,9 @@ This module is the single source of truth for provider discovery.
from __future__ import annotations
from typing import Any, Dict, Optional, Type
from typing import Any, Dict, Optional, Sequence, Type
import sys
from urllib.parse import urlparse
from SYS.logger import log
@@ -141,6 +142,45 @@ def list_file_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str, bo
return availability
def match_provider_name_for_url(url: str) -> Optional[str]:
"""Return a registered provider name that claims the URL's domain.
Providers can declare domains via a class attribute `URL_DOMAINS` (sequence of strings).
This matcher is intentionally cheap (no provider instantiation, no network).
"""
try:
parsed = urlparse(str(url))
host = (parsed.hostname or "").strip().lower()
except Exception:
host = ""
if not host:
return None
for name, provider_class in _PROVIDERS.items():
domains = getattr(provider_class, "URL_DOMAINS", None)
if not isinstance(domains, (list, tuple)):
continue
for d in domains:
dom = str(d or "").strip().lower()
if not dom:
continue
if host == dom or host.endswith("." + dom):
return name
return None
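A hypothetical provider declaring URL_DOMAINS, showing how the matcher above routes URLs (registration details omitted):

class ExampleBooks(Provider):
    URL_DOMAINS = ("examplebooks.org",)

# Once registered in _PROVIDERS under "examplebooks":
#   match_provider_name_for_url("https://cdn.examplebooks.org/item/42") -> "examplebooks"
#   match_provider_name_for_url("https://example.net/item/42")          -> None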
def get_provider_for_url(url: str, config: Optional[Dict[str, Any]] = None) -> Optional[Provider]:
"""Instantiate and return the matching provider for a URL, if any."""
name = match_provider_name_for_url(url)
if not name:
return None
return get_provider(name, config)
__all__ = [
"SearchResult",
"Provider",
@@ -152,5 +192,7 @@ __all__ = [
"list_search_providers",
"get_file_provider",
"list_file_providers",
"match_provider_name_for_url",
"get_provider_for_url",
"download_soulseek_file",
]


@@ -584,10 +584,15 @@ def _download_direct_file(
filename = filename.split("?")[0]
# Try to get real filename from Content-Disposition header (HEAD request)
content_type = ""
try:
with HTTPClient(timeout=10.0) as client:
response = client._request("HEAD", url, follow_redirects=True)
content_disposition = response.headers.get("content-disposition", "")
try:
content_type = str(response.headers.get("content-type", "") or "").strip().lower()
except Exception:
content_type = ""
if content_disposition:
# Extract filename from Content-Disposition header
# Format: attachment; filename="filename.pdf" or filename=filename.pdf
@@ -620,9 +625,36 @@ def _download_direct_file(
else:
filename = suggested
# Final fallback if we still don't have a good filename
if not filename or "." not in filename:
filename = "downloaded_file.bin"
# If we still don't have an extension, try to infer one from Content-Type.
# Never fall back to a generic `.bin` extension.
try:
has_ext = bool(filename and Path(str(filename)).suffix)
except Exception:
has_ext = False
if filename and (not has_ext):
ct = (content_type or "").split(";")[0].strip().lower()
ext_by_ct = {
"application/pdf": ".pdf",
"application/epub+zip": ".epub",
"application/x-mobipocket-ebook": ".mobi",
"image/jpeg": ".jpg",
"image/png": ".png",
"image/webp": ".webp",
"image/gif": ".gif",
"text/plain": ".txt",
"application/zip": ".zip",
}
if ct in ext_by_ct:
filename = f"{filename}{ext_by_ct[ct]}"
elif ct.startswith("text/html"):
# Guardrail: HTML landing pages should not be downloaded as opaque files.
raise DownloadError("URL appears to be an HTML page, not a direct file")
# Final guardrail: if filename is empty, refuse rather than inventing `download.bin`.
if not filename or not str(filename).strip():
raise DownloadError("Could not determine filename for URL (no Content-Disposition and no path filename)")
file_path = _unique_path(output_dir / filename)
progress_bar = ProgressBar()
@@ -684,9 +716,15 @@ def _download_direct_file(
# For direct file downloads, create minimal info dict without filename as title
# This prevents creating duplicate title: tags when filename gets auto-generated
# We'll add title back later only if we couldn't extract meaningful tags
ext = ""
try:
ext = Path(str(filename)).suffix.lstrip(".")
except Exception:
ext = ""
info = {
"id": filename.rsplit(".", 1)[0],
"ext": filename.rsplit(".", 1)[1] if "." in filename else "bin",
"id": str(filename).rsplit(".", 1)[0] if "." in str(filename) else str(filename),
"ext": ext,
"webpage_url": url,
}
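The Content-Disposition parsing mentioned above, as a simplified standalone sketch; the helper name and regex are assumptions, and RFC 5987 filename*= values are not handled:

import re

def filename_from_content_disposition(value: str) -> str:
    # Handles: attachment; filename="report.pdf"  and  attachment; filename=report.pdf
    match = re.search(r'filename="?([^";]+)"?', value or "", flags=re.IGNORECASE)
    return match.group(1).strip() if match else ""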

SYS/pipeline_progress.py (new file)

@@ -0,0 +1,218 @@
from __future__ import annotations
import sys
from contextlib import contextmanager
from typing import Any, Iterator, Optional, Sequence, Tuple
class PipelineProgress:
"""Small adapter around PipelineLiveProgress.
This centralizes the boilerplate used across cmdlets:
- locating the active Live UI (if any)
- resolving the current pipe_index from stage context
- step-based progress (begin_pipe_steps/advance_pipe_step)
- optional pipe percent/status updates
- optional byte transfer bars
- optional local Live panel when a cmdlet runs standalone
The class is intentionally defensive: all UI operations are best-effort.
"""
def __init__(self, pipeline_module: Any):
self._ctx = pipeline_module
self._local_ui: Optional[Any] = None
self._local_attached: bool = False
def ui_and_pipe_index(self) -> Tuple[Optional[Any], int]:
ui = None
try:
ui = self._ctx.get_live_progress() if hasattr(self._ctx, "get_live_progress") else None
except Exception:
ui = None
pipe_idx: int = 0
try:
stage_ctx = self._ctx.get_stage_context() if hasattr(self._ctx, "get_stage_context") else None
maybe_idx = getattr(stage_ctx, "pipe_index", None) if stage_ctx is not None else None
if isinstance(maybe_idx, int):
pipe_idx = int(maybe_idx)
except Exception:
pipe_idx = 0
return ui, pipe_idx
def begin_steps(self, total_steps: int) -> None:
ui, pipe_idx = self.ui_and_pipe_index()
if ui is None:
return
try:
begin = getattr(ui, "begin_pipe_steps", None)
if callable(begin):
begin(int(pipe_idx), total_steps=int(total_steps))
except Exception:
return
def step(self, text: str) -> None:
ui, pipe_idx = self.ui_and_pipe_index()
if ui is None:
return
try:
adv = getattr(ui, "advance_pipe_step", None)
if callable(adv):
adv(int(pipe_idx), str(text))
except Exception:
return
def set_percent(self, percent: int) -> None:
ui, pipe_idx = self.ui_and_pipe_index()
if ui is None:
return
try:
set_pct = getattr(ui, "set_pipe_percent", None)
if callable(set_pct):
set_pct(int(pipe_idx), int(percent))
except Exception:
return
def set_status(self, text: str) -> None:
ui, pipe_idx = self.ui_and_pipe_index()
if ui is None:
return
try:
setter = getattr(ui, "set_pipe_status_text", None)
if callable(setter):
setter(int(pipe_idx), str(text))
except Exception:
return
def clear_status(self) -> None:
ui, pipe_idx = self.ui_and_pipe_index()
if ui is None:
return
try:
clr = getattr(ui, "clear_pipe_status_text", None)
if callable(clr):
clr(int(pipe_idx))
except Exception:
return
def begin_transfer(self, *, label: str, total: Optional[int] = None) -> None:
ui, _ = self.ui_and_pipe_index()
if ui is None:
return
try:
fn = getattr(ui, "begin_transfer", None)
if callable(fn):
fn(label=str(label or "transfer"), total=total)
except Exception:
return
def update_transfer(self, *, label: str, completed: Optional[int], total: Optional[int] = None) -> None:
ui, _ = self.ui_and_pipe_index()
if ui is None:
return
try:
fn = getattr(ui, "update_transfer", None)
if callable(fn):
fn(label=str(label or "transfer"), completed=completed, total=total)
except Exception:
return
def finish_transfer(self, *, label: str) -> None:
ui, _ = self.ui_and_pipe_index()
if ui is None:
return
try:
fn = getattr(ui, "finish_transfer", None)
if callable(fn):
fn(label=str(label or "transfer"))
except Exception:
return
def on_emit(self, emitted: Any) -> None:
"""Advance local pipe progress after pipeline_context.emit().
The shared PipelineExecutor wires on_emit automatically for pipelines.
Standalone cmdlet runs do not, so cmdlets call this explicitly.
"""
if self._local_ui is None:
return
try:
self._local_ui.on_emit(0, emitted)
except Exception:
return
def ensure_local_ui(self, *, label: str, total_items: int, items_preview: Optional[Sequence[Any]] = None) -> bool:
"""Start a local PipelineLiveProgress panel if no shared UI exists."""
try:
existing = self._ctx.get_live_progress() if hasattr(self._ctx, "get_live_progress") else None
except Exception:
existing = None
if existing is not None:
return False
if not bool(getattr(sys.stderr, "isatty", lambda: False)()):
return False
try:
from models import PipelineLiveProgress
ui = PipelineLiveProgress([str(label or "pipeline")], enabled=True)
ui.start()
try:
if hasattr(self._ctx, "set_live_progress"):
self._ctx.set_live_progress(ui)
self._local_attached = True
except Exception:
self._local_attached = False
try:
ui.begin_pipe(0, total_items=max(1, int(total_items)), items_preview=list(items_preview or []))
except Exception:
pass
self._local_ui = ui
return True
except Exception:
self._local_ui = None
self._local_attached = False
return False
def close_local_ui(self, *, force_complete: bool = True) -> None:
if self._local_ui is None:
return
try:
try:
self._local_ui.finish_pipe(0, force_complete=bool(force_complete))
except Exception:
pass
try:
self._local_ui.stop()
except Exception:
pass
finally:
self._local_ui = None
try:
if self._local_attached and hasattr(self._ctx, "set_live_progress"):
self._ctx.set_live_progress(None)
except Exception:
pass
self._local_attached = False
@contextmanager
def local_ui_if_needed(
self,
*,
label: str,
total_items: int,
items_preview: Optional[Sequence[Any]] = None,
) -> Iterator["PipelineProgress"]:
created = self.ensure_local_ui(label=label, total_items=total_items, items_preview=items_preview)
try:
yield self
finally:
if created:
self.close_local_ui(force_complete=True)
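# Example (hedged sketch): how a standalone cmdlet might drive PipelineProgress.
# `pipeline` and SYS.pipeline_progress are the project modules referenced above; the
# function name and the labels below are illustrative, not part of the project API.
def _example_standalone_progress(url: str) -> None:
    import pipeline as pipeline_context
    from SYS.pipeline_progress import PipelineProgress

    progress = PipelineProgress(pipeline_context)
    # Starts a local Live panel only when no shared pipeline UI is already active.
    with progress.local_ui_if_needed(label="demo", total_items=1, items_preview=[url]):
        progress.begin_steps(3)
        progress.step("resolving source")
        progress.begin_transfer(label="demo.bin", total=1024)
        progress.update_transfer(label="demo.bin", completed=1024, total=1024)
        progress.finish_transfer(label="demo.bin")
        progress.step("writing destination")
        progress.on_emit({"url": url})  # standalone runs advance per-item progress explicitly
        progress.step("finalized")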

View File

@@ -1585,9 +1585,46 @@ def coerce_to_pipe_object(value: Any, default_path: Optional[str] = None) -> mod
"warnings", "path", "relationships", "is_temp", "action", "parent_hash",
}
# Convert ResultItem to dict to preserve all attributes
# Convert common object-like results into a dict so we can preserve fields like
# hash/store/url when they come from result tables (e.g., get-url emits UrlItem).
#
# Priority:
# 1) explicit to_dict()
# 2) best-effort attribute extraction for known PipeObject-ish fields
if hasattr(value, 'to_dict'):
value = value.to_dict()
elif not isinstance(value, dict):
try:
obj_map: Dict[str, Any] = {}
for k in (
"hash",
"store",
"provider",
"prov",
"tag",
"title",
"url",
"source_url",
"duration",
"duration_seconds",
"metadata",
"full_metadata",
"warnings",
"path",
"target",
"relationships",
"is_temp",
"action",
"parent_hash",
"extra",
"media_kind",
):
if hasattr(value, k):
obj_map[k] = getattr(value, k)
if obj_map:
value = obj_map
except Exception:
pass
if isinstance(value, dict):
# Extract hash and store (canonical identifiers)
@@ -1695,8 +1732,19 @@ def coerce_to_pipe_object(value: Any, default_path: Optional[str] = None) -> mod
# Fallback: build from path argument or bare value
hash_val = "unknown"
path_val = default_path or getattr(value, "path", None)
url_val: Optional[str] = None
title_val = None
# If the raw value is a string, treat it as either a URL or a file path.
# This is important for @-selection results that are plain URL strings.
if isinstance(value, str):
s = value.strip()
if s.lower().startswith(("http://", "https://")):
url_val = s
path_val = None
else:
path_val = s
if path_val and path_val != "unknown":
try:
from SYS.utils import sha256_file
@@ -1708,8 +1756,9 @@ def coerce_to_pipe_object(value: Any, default_path: Optional[str] = None) -> mod
except Exception:
pass
# When coming from path argument, store should be "PATH" (file path, not a backend)
store_val = "PATH"
# When coming from a raw URL string, mark it explicitly as URL.
# Otherwise treat it as a local path.
store_val = "URL" if url_val else "PATH"
pipe_obj = models.PipeObject(
hash=hash_val,
@@ -1717,6 +1766,8 @@ def coerce_to_pipe_object(value: Any, default_path: Optional[str] = None) -> mod
provider=None,
path=str(path_val) if path_val and path_val != "unknown" else None,
title=title_val,
url=url_val,
source_url=url_val,
tag=[],
extra={},
)
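# Example (hedged sketch) of the bare-string classification used in the fallback above.
# The helper name is illustrative; only the startswith() rule mirrors the real code.
def _example_classify_raw_value(value: str) -> tuple:
    """Return (url, path, store) for a bare string result."""
    s = value.strip()
    if s.lower().startswith(("http://", "https://")):
        return s, None, "URL"
    return None, s, "PATH"

assert _example_classify_raw_value("https://example.com/a.jpg") == ("https://example.com/a.jpg", None, "URL")
assert _example_classify_raw_value(r"C:\media\a.jpg") == (None, r"C:\media\a.jpg", "PATH")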

View File

@@ -12,6 +12,7 @@ import models
import pipeline as ctx
from API import HydrusNetwork as hydrus_wrapper
from SYS.logger import log, debug
from SYS.pipeline_progress import PipelineProgress
from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS
from Store import Store
from . import _shared as sh
@@ -73,6 +74,7 @@ class Add_File(Cmdlet):
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"""Main execution entry point."""
parsed = parse_cmdlet_args(args, self)
progress = PipelineProgress(ctx)
path_arg = parsed.get("path")
location = parsed.get("store")
@@ -80,6 +82,35 @@ class Add_File(Cmdlet):
provider_room = parsed.get("room")
delete_after = parsed.get("delete", False)
# Convenience: when piping a file into add-file, allow `-path <existing dir>`
# to act as the destination export directory.
# Example: screen-shot "https://..." | add-file -path "C:\Users\Admin\Desktop"
if path_arg and not location and not provider_name:
try:
candidate_dir = Path(str(path_arg))
if candidate_dir.exists() and candidate_dir.is_dir():
piped_items = result if isinstance(result, list) else [result]
has_local_source = False
for it in piped_items:
try:
po = coerce_to_pipe_object(it, None)
src = str(getattr(po, "path", "") or "").strip()
if not src:
continue
if src.lower().startswith(("http://", "https://", "magnet:", "torrent:")):
continue
if Path(src).is_file():
has_local_source = True
break
except Exception:
continue
if has_local_source:
debug(f"[add-file] Treating -path directory as destination: {candidate_dir}")
location = str(candidate_dir)
path_arg = None
except Exception:
pass
stage_ctx = ctx.get_stage_context()
is_last_stage = (stage_ctx is None) or bool(getattr(stage_ctx, "is_last_stage", False))
@@ -93,7 +124,7 @@ class Add_File(Cmdlet):
is_storage_backend_location = False
# Decide which items to process.
# - If user provided -path, treat this invocation as single-item.
# - If user provided -path (and it was not reinterpreted as destination), treat this invocation as single-item.
# - Otherwise, if piped input is a list, ingest each item.
if path_arg:
items_to_process: List[Any] = [result]
@@ -102,6 +133,17 @@ class Add_File(Cmdlet):
else:
items_to_process = [result]
# Minimal step-based progress for single-item runs.
# Many add-file flows don't emit intermediate items, so without steps the pipe can look "stuck".
use_steps = False
steps_started = False
step2_done = False
try:
ui, _ = progress.ui_and_pipe_index()
use_steps = (ui is not None) and (len(items_to_process) == 1)
except Exception:
use_steps = False
debug(f"[add-file] INPUT result type={type(result).__name__}")
if isinstance(result, list):
debug(f"[add-file] INPUT result is list with {len(result)} items")
@@ -235,6 +277,14 @@ class Add_File(Cmdlet):
failures += 1
continue
is_url_target = isinstance(media_path_or_url, str) and str(media_path_or_url).lower().startswith(
("http://", "https://", "magnet:", "torrent:")
)
if use_steps and (not steps_started) and (not is_url_target):
progress.begin_steps(3)
progress.step("resolving source")
steps_started = True
# Update pipe_obj with resolved path
pipe_obj.path = str(media_path_or_url)
@@ -300,13 +350,34 @@ class Add_File(Cmdlet):
pass
temp_dir_to_cleanup = Path(tempfile.mkdtemp(prefix="medios_openlibrary_"))
# Wire OpenLibrary download progress into pipeline Live UI (no tqdm spam).
def _ol_progress(kind: str, completed: int, total: Optional[int], label: str) -> None:
try:
if kind == "pages" and total:
progress.set_status(f"downloading pages {completed}/{total}")
progress.set_percent(int(round((completed / max(1, total)) * 100.0)))
elif kind == "bytes" and total:
progress.set_status(f"downloading {label} {completed}/{total} bytes")
progress.set_percent(int(round((completed / max(1, total)) * 100.0)))
else:
progress.set_status("downloading")
except Exception:
return
try:
progress.set_percent(0)
progress.set_status("downloading openlibrary")
except Exception:
pass
sr = SearchResult(
table="openlibrary",
title=str(getattr(pipe_obj, "title", None) or "Unknown"),
path=str(media_path_or_url),
full_metadata=full_metadata if isinstance(full_metadata, dict) else {},
)
downloaded = provider.download(sr, temp_dir_to_cleanup)
downloaded = provider.download(sr, temp_dir_to_cleanup, progress_callback=_ol_progress)
if downloaded is None:
log("[add-file] OpenLibrary download failed", file=sys.stderr)
failures += 1
@@ -325,6 +396,13 @@ class Add_File(Cmdlet):
pipe_obj.path = str(downloaded_path)
delete_after_item = True
try:
progress.set_percent(100)
progress.set_status("downloaded")
except Exception:
pass
# For non-provider URLs, or if still a URL after provider attempt, delegate to download-media.
if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(
("http://", "https://", "magnet:", "torrent:")
@@ -562,6 +640,10 @@ class Add_File(Cmdlet):
failures += 1
continue
if use_steps and steps_started and (not step2_done):
progress.step("writing destination")
step2_done = True
if code == 0:
successes += 1
else:
@@ -619,6 +701,9 @@ class Add_File(Cmdlet):
except Exception:
pass
if use_steps and steps_started:
progress.step("finalized")
if successes > 0:
return 0
return 1
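# Example (hedged sketch) of the "-path as destination" heuristic above: an existing
# directory passed via -path, combined with at least one local piped source file, is
# treated as the export destination. The helper name and paths are illustrative.
from pathlib import Path
import tempfile

def _example_is_destination_dir(path_arg: str, piped_paths: list) -> bool:
    candidate = Path(path_arg)
    if not (candidate.exists() and candidate.is_dir()):
        return False
    for src in piped_paths:
        s = str(src).strip()
        if not s or s.lower().startswith(("http://", "https://", "magnet:", "torrent:")):
            continue
        if Path(s).is_file():
            return True
    return False

with tempfile.TemporaryDirectory() as d:
    src = Path(d) / "shot.webp"
    src.write_bytes(b"stub")
    assert _example_is_destination_dir(d, [str(src)]) is True          # dir + local source
    assert _example_is_destination_dir(str(src), [str(src)]) is False  # file, not a directory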

View File

@@ -34,6 +34,19 @@ class Add_Url(sh.Cmdlet):
"""Add URL to file via hash+store backend."""
parsed = sh.parse_cmdlet_args(args, self)
# Compatibility/piping fix:
# `SharedArgs.QUERY` is positional in the shared parser, so `add-url <url>`
# (and `@N | add-url <url>`) can mistakenly parse the URL into `query`.
# If `url` is missing and `query` looks like an http(s) URL, treat it as `url`.
try:
if (not parsed.get("url")) and isinstance(parsed.get("query"), str):
q = str(parsed.get("query") or "").strip()
if q.startswith(("http://", "https://")):
parsed["url"] = q
parsed.pop("query", None)
except Exception:
pass
query_hash = sh.parse_single_hash_query(parsed.get("query"))
if parsed.get("query") and not query_hash:
log("Error: -query must be of the form hash:<sha256>")

View File

@@ -29,7 +29,7 @@ class Delete_Url(Cmdlet):
arg=[
SharedArgs.QUERY,
SharedArgs.STORE,
CmdletArg("url", required=True, description="URL to remove"),
CmdletArg("url", required=False, description="URL to remove (optional when piping url rows)"),
],
detail=[
"- Removes URL association from file identified by hash+store",
@@ -69,22 +69,24 @@ class Delete_Url(Cmdlet):
log("Error: No store name provided")
return 1
if not url_arg:
log("Error: No URL provided")
return 1
# Normalize hash (single-item mode)
if not results and file_hash:
file_hash = normalize_hash(file_hash)
if not file_hash:
log("Error: Invalid hash format")
return 1
# Parse url (comma-separated)
urls = [u.strip() for u in str(url_arg).split(',') if u.strip()]
if not urls:
log("Error: No valid url provided")
return 1
from metadata import normalize_urls
def _urls_from_arg(raw: Any) -> List[str]:
if raw is None:
return []
# Support comma-separated input for backwards compatibility
if isinstance(raw, str) and "," in raw:
return [u.strip() for u in raw.split(",") if u.strip()]
return [u.strip() for u in normalize_urls(raw) if str(u).strip()]
urls_from_cli = _urls_from_arg(url_arg)
# Get backend and delete url
try:
@@ -145,7 +147,17 @@ class Delete_Url(Cmdlet):
)
continue
batch.setdefault(store_text, []).append((normalized, list(urls)))
# Determine which URLs to delete.
# - If user passed an explicit <url>, apply it to all items.
# - Otherwise, when piping url rows from get-url, delete the url(s) from each item.
item_urls = list(urls_from_cli)
if not item_urls:
item_urls = [u.strip() for u in normalize_urls(get_field(item, "url") or get_field(item, "source_url")) if str(u).strip()]
if not item_urls:
ctx.print_if_visible("[delete-url] Warning: Item has no url field; skipping", file=sys.stderr)
continue
batch.setdefault(store_text, []).append((normalized, item_urls))
for store_text, pairs in batch.items():
try:
@@ -168,24 +180,39 @@ class Delete_Url(Cmdlet):
for h, ulist in bulk_pairs:
backend.delete_url(h, ulist, config=config)
deleted_count = 0
for _h, ulist in bulk_pairs:
deleted_count += len(ulist or [])
ctx.print_if_visible(
f"✓ delete-url: {len(urls)} url(s) for {len(bulk_pairs)} item(s) in '{store_text}'",
f"✓ delete-url: {deleted_count} url(s) for {len(bulk_pairs)} item(s) in '{store_text}'",
file=sys.stderr,
)
for item in pass_through:
existing = get_field(item, "url")
_set_item_url(item, _remove_urls(existing, list(urls)))
# In batch mode we removed the union of requested urls for the file.
# Using urls_from_cli (if present) matches the user's explicit intent; otherwise
# remove the piped url row(s).
remove_set = urls_from_cli
if not remove_set:
remove_set = [u.strip() for u in normalize_urls(get_field(item, "url") or get_field(item, "source_url")) if str(u).strip()]
_set_item_url(item, _remove_urls(existing, list(remove_set)))
ctx.emit(item)
return 0
# Single-item mode
if not urls_from_cli:
urls_from_cli = [u.strip() for u in normalize_urls(get_field(result, "url") or get_field(result, "source_url")) if str(u).strip()]
if not urls_from_cli:
log("Error: No URL provided")
return 1
backend = storage[str(store_name)]
backend.delete_url(str(file_hash), urls, config=config)
ctx.print_if_visible(f"✓ delete-url: {len(urls)} url(s) removed", file=sys.stderr)
backend.delete_url(str(file_hash), list(urls_from_cli), config=config)
ctx.print_if_visible(f"✓ delete-url: {len(urls_from_cli)} url(s) removed", file=sys.stderr)
if result is not None:
existing = get_field(result, "url")
_set_item_url(result, _remove_urls(existing, list(urls)))
_set_item_url(result, _remove_urls(existing, list(urls_from_cli)))
ctx.emit(result)
return 0
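# Example (hedged sketch) of the url-argument handling above. The real code delegates to
# metadata.normalize_urls; a trivial stand-in is used here so the example is self-contained.
def _example_normalize_urls(raw):
    if raw is None:
        return []
    if isinstance(raw, str):
        return [raw.strip()] if raw.strip() else []
    return [str(u).strip() for u in raw if str(u).strip()]

def _example_urls_from_arg(raw):
    if raw is None:
        return []
    # Comma-separated input is kept for backwards compatibility.
    if isinstance(raw, str) and "," in raw:
        return [u.strip() for u in raw.split(",") if u.strip()]
    return [u for u in _example_normalize_urls(raw) if u]

assert _example_urls_from_arg("https://a.test, https://b.test") == ["https://a.test", "https://b.test"]
assert _example_urls_from_arg(["https://a.test"]) == ["https://a.test"]
assert _example_urls_from_arg(None) == []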

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -126,7 +126,7 @@ class Get_File(sh.Cmdlet):
except Exception as exc:
log(f"Error opening browser: {exc}", file=sys.stderr)
else:
log(f"Opened in browser: {source_path}", file=sys.stderr)
debug(f"Opened in browser: {source_path}", file=sys.stderr)
# Emit result for pipeline
ctx.emit({

View File

@@ -47,6 +47,210 @@ except ImportError:
extract_title = None
def _dedup_tags_preserve_order(tags: List[str]) -> List[str]:
"""Deduplicate tags case-insensitively while preserving order."""
out: List[str] = []
seen: set[str] = set()
for t in tags or []:
if not isinstance(t, str):
continue
s = t.strip()
if not s:
continue
key = s.lower()
if key in seen:
continue
seen.add(key)
out.append(s)
return out
def _extract_subtitle_tags(info: Dict[str, Any]) -> List[str]:
"""Extract subtitle availability tags from a yt-dlp info dict.
Produces multi-valued tags so languages can coexist:
- subs:<lang>
- subs_auto:<lang>
"""
def _langs(value: Any) -> List[str]:
if not isinstance(value, dict):
return []
langs: List[str] = []
for k in value.keys():
if not isinstance(k, str):
continue
lang = k.strip().lower()
if lang:
langs.append(lang)
return sorted(set(langs))
out: List[str] = []
for lang in _langs(info.get("subtitles")):
out.append(f"subs:{lang}")
for lang in _langs(info.get("automatic_captions")):
out.append(f"subs_auto:{lang}")
return out
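# Example (hedged): expected behaviour of the two helpers above, using a minimal fake
# yt-dlp info dict (the subtitles/automatic_captions keys match yt-dlp's real output).
assert _dedup_tags_preserve_order(["Rock", " rock ", "", "pop"]) == ["Rock", "pop"]
assert _extract_subtitle_tags(
    {"subtitles": {"en": [], "DE": []}, "automatic_captions": {"en": []}}
) == ["subs:de", "subs:en", "subs_auto:en"]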
def _scrape_ytdlp_info(url: str) -> Optional[Dict[str, Any]]:
"""Fetch a yt-dlp info dict without downloading media."""
if not isinstance(url, str) or not url.strip():
return None
url = url.strip()
# Prefer the Python module when available (faster, avoids shell quoting issues).
try:
import yt_dlp # type: ignore
opts: Any = {
"quiet": True,
"no_warnings": True,
"skip_download": True,
"noprogress": True,
"socket_timeout": 15,
"retries": 1,
"playlist_items": "1-10",
}
with yt_dlp.YoutubeDL(opts) as ydl:
info = ydl.extract_info(url, download=False)
return info if isinstance(info, dict) else None
except Exception:
pass
# Fallback to yt-dlp CLI if the module isn't available.
try:
import json as json_module
cmd = [
"yt-dlp",
"-J",
"--no-warnings",
"--skip-download",
"--playlist-items",
"1-10",
url,
]
result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
if result.returncode != 0:
return None
payload = (result.stdout or "").strip()
if not payload:
return None
data = json_module.loads(payload)
return data if isinstance(data, dict) else None
except Exception:
return None
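# Example (hedged usage) of the helper above: metadata-only extraction, then reading a
# couple of common yt-dlp fields. Requires yt-dlp and network access; the URL is a
# placeholder and the function name is illustrative.
def _example_print_ytdlp_title(url: str = "https://www.youtube.com/watch?v=placeholder") -> None:
    info = _scrape_ytdlp_info(url)
    if not info:
        return
    entries = info.get("entries")
    first = entries[0] if isinstance(entries, list) and entries and isinstance(entries[0], dict) else info
    print(first.get("title"), first.get("duration"))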
def _resolve_candidate_urls_for_item(
result: Any,
backend: Any,
file_hash: str,
config: Dict[str, Any],
) -> List[str]:
"""Get candidate URLs from backend and/or piped result."""
try:
from metadata import normalize_urls
except Exception:
normalize_urls = None # type: ignore[assignment]
urls: List[str] = []
# 1) Backend URL association (best source of truth)
try:
backend_urls = backend.get_url(file_hash, config=config)
if backend_urls:
if normalize_urls:
urls.extend(normalize_urls(backend_urls))
else:
urls.extend([str(u).strip() for u in backend_urls if isinstance(u, str) and str(u).strip()])
except Exception:
pass
# 2) Backend metadata url field
try:
meta = backend.get_metadata(file_hash, config=config)
if isinstance(meta, dict) and meta.get("url"):
if normalize_urls:
urls.extend(normalize_urls(meta.get("url")))
else:
raw = meta.get("url")
if isinstance(raw, list):
urls.extend([str(u).strip() for u in raw if isinstance(u, str) and str(u).strip()])
elif isinstance(raw, str) and raw.strip():
urls.append(raw.strip())
except Exception:
pass
# 3) Piped result fields
def _get(obj: Any, key: str, default: Any = None) -> Any:
if isinstance(obj, dict):
return obj.get(key, default)
return getattr(obj, key, default)
for key in ("url", "webpage_url", "source_url", "target"):
val = _get(result, key, None)
if not val:
continue
if normalize_urls:
urls.extend(normalize_urls(val))
continue
if isinstance(val, str) and val.strip():
urls.append(val.strip())
elif isinstance(val, list):
urls.extend([str(u).strip() for u in val if isinstance(u, str) and str(u).strip()])
meta_field = _get(result, "metadata", None)
if isinstance(meta_field, dict) and meta_field.get("url"):
val = meta_field.get("url")
if normalize_urls:
urls.extend(normalize_urls(val))
elif isinstance(val, list):
urls.extend([str(u).strip() for u in val if isinstance(u, str) and str(u).strip()])
elif isinstance(val, str) and val.strip():
urls.append(val.strip())
# Dedup
return _dedup_tags_preserve_order(urls)
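# Example (hedged sketch): where candidate URLs come from, shown with a minimal fake
# backend. The class is illustrative; only the get_url/get_metadata method names mirror
# the backend interface used above, and exact output can differ slightly once
# metadata.normalize_urls applies its own normalisation.
class _ExampleBackend:
    def get_url(self, file_hash, config=None):
        return ["https://example.com/page"]

    def get_metadata(self, file_hash, config=None):
        return {"url": "https://example.com/page"}

# _resolve_candidate_urls_for_item({"source_url": "https://mirror.example.org/page"},
#                                  _ExampleBackend(), "abc123", {}) would return the
# deduplicated, order-preserving list:
#   ["https://example.com/page", "https://mirror.example.org/page"]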
def _pick_supported_ytdlp_url(urls: List[str]) -> Optional[str]:
"""Pick the first URL that looks supported by yt-dlp (best effort)."""
if not urls:
return None
def _is_hydrus_file_url(u: str) -> bool:
text = str(u or "").strip().lower()
if not text:
return False
# Hydrus-local file URLs are retrievable blobs, not original source pages.
# yt-dlp generally can't extract meaningful metadata from these.
return ("/get_files/file" in text) and ("hash=" in text)
http_urls: List[str] = []
for u in urls:
text = str(u or "").strip()
if text.lower().startswith(("http://", "https://")):
http_urls.append(text)
# Prefer non-Hydrus URLs for yt-dlp scraping.
candidates = [u for u in http_urls if not _is_hydrus_file_url(u)]
if not candidates:
return None
# Prefer a true support check when the Python module is available.
try:
from SYS.download import is_url_supported_by_ytdlp
for text in candidates:
try:
if is_url_supported_by_ytdlp(text):
return text
except Exception:
continue
except Exception:
pass
# Fallback: use the first non-Hydrus http(s) URL and let extraction decide.
return candidates[0] if candidates else None
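# Example (hedged): Hydrus /get_files/file blob URLs are ignored so the original page URL
# wins. The local port below is illustrative.
#
#   candidates = [
#       "http://127.0.0.1:45869/get_files/file?hash=abc123",
#       "https://www.youtube.com/watch?v=placeholder",
#   ]
#   _pick_supported_ytdlp_url(candidates)
#   # -> "https://www.youtube.com/watch?v=placeholder" (the only non-Hydrus http(s) URL)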
_scrape_isbn_metadata = _ol_scrape_isbn_metadata # type: ignore[assignment]
_scrape_openlibrary_metadata = _ol_scrape_openlibrary_metadata # type: ignore[assignment]
@@ -853,7 +1057,12 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
scrape_url = parsed_args.get("scrape")
scrape_requested = scrape_flag_present or scrape_url is not None
if scrape_requested and (not scrape_url or str(scrape_url).strip() == ""):
# Convenience: `-scrape` with no value defaults to `ytdlp` (store-backed URL scrape).
if scrape_flag_present and (scrape_url is None or str(scrape_url).strip() == ""):
scrape_url = "ytdlp"
scrape_requested = True
if scrape_requested and (scrape_url is None or str(scrape_url).strip() == ""):
log("-scrape requires a URL or provider name", file=sys.stderr)
return 1
@@ -861,6 +1070,123 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
if scrape_requested and scrape_url:
import json as json_module
if str(scrape_url).strip().lower() == "ytdlp":
# Scrape metadata from the selected item's URL via yt-dlp (no download),
# then OVERWRITE all existing tags (including title:).
#
# This mode requires a store-backed item (hash + store).
#
# NOTE: We intentionally do not reuse _scrape_url_metadata() here because it
# performs namespace deduplication that would collapse multi-valued tags.
file_hash = normalize_hash(hash_override) or normalize_hash(get_field(result, "hash", None))
store_name = get_field(result, "store", None)
subject_path = get_field(result, "path", None) or get_field(result, "target", None) or get_field(result, "filename", None)
item_title = get_field(result, "title", None) or get_field(result, "name", None) or get_field(result, "filename", None)
# Only run overwrite-apply when the item is store-backed.
# If this is a URL-only PipeObject, fall through to provider mode below.
if file_hash and store_name and str(file_hash).strip().lower() != "unknown" and str(store_name).strip().upper() not in {"PATH", "URL"}:
try:
from Store import Store
storage = Store(config)
backend = storage[str(store_name)]
except Exception as exc:
log(f"Failed to resolve store backend '{store_name}': {exc}", file=sys.stderr)
return 1
candidate_urls = _resolve_candidate_urls_for_item(result, backend, file_hash, config)
scrape_target = _pick_supported_ytdlp_url(candidate_urls)
if not scrape_target:
log(
"No yt-dlp-supported source URL found for this item (Hydrus /get_files/file URLs are ignored). ",
file=sys.stderr,
)
log(
"Add the original page URL to the file (e.g. via add-url), then retry get-tag -scrape.",
file=sys.stderr,
)
return 1
info = _scrape_ytdlp_info(scrape_target)
if not info:
log("yt-dlp could not extract metadata for this URL (unsupported or failed)", file=sys.stderr)
return 1
try:
from metadata import extract_ytdlp_tags
except Exception:
extract_ytdlp_tags = None # type: ignore[assignment]
# Prefer the top-level metadata, but if this is a playlist container, use
# the first entry for per-item fields like subtitles.
info_for_subs = info
entries = info.get("entries") if isinstance(info, dict) else None
if isinstance(entries, list) and entries:
first = entries[0]
if isinstance(first, dict):
info_for_subs = first
tags: List[str] = []
if extract_ytdlp_tags:
try:
tags.extend(extract_ytdlp_tags(info))
except Exception:
pass
# Subtitle availability tags
try:
tags.extend(_extract_subtitle_tags(info_for_subs if isinstance(info_for_subs, dict) else {}))
except Exception:
pass
# Ensure we actually have something to apply.
tags = _dedup_tags_preserve_order(tags)
if not tags:
log("No tags extracted from yt-dlp metadata", file=sys.stderr)
return 1
# Full overwrite: delete all existing tags, then add the new set.
try:
existing_tags, _src = backend.get_tag(file_hash, config=config)
except Exception:
existing_tags = []
try:
if existing_tags:
backend.delete_tag(file_hash, list(existing_tags), config=config)
except Exception as exc:
debug(f"[get_tag] ytdlp overwrite: delete_tag failed: {exc}")
try:
backend.add_tag(file_hash, list(tags), config=config)
except Exception as exc:
log(f"Failed to apply yt-dlp tags: {exc}", file=sys.stderr)
return 1
# Show updated tags
try:
updated_tags, _src = backend.get_tag(file_hash, config=config)
except Exception:
updated_tags = tags
if not updated_tags:
updated_tags = tags
_emit_tags_as_table(
tags_list=list(updated_tags),
file_hash=file_hash,
store=str(store_name),
service_name=None,
config=config,
item_title=str(item_title or "ytdlp"),
path=str(subject_path) if subject_path else None,
subject={
"hash": file_hash,
"store": str(store_name),
"path": str(subject_path) if subject_path else None,
"title": item_title,
"extra": {"applied_provider": "ytdlp", "scrape_url": scrape_target},
},
)
return 0
if scrape_url.startswith("http://") or scrape_url.startswith("https://"):
# URL scraping (existing behavior)
title, tags, formats, playlist_items = _scrape_url_metadata(scrape_url)
@@ -951,7 +1277,16 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
else:
combined_query = f"{title_hint} {artist_hint}"
query_hint = identifier_query or combined_query or title_hint
# yt-dlp isn't a search provider; it requires a URL.
url_hint: Optional[str] = None
if provider.name == "ytdlp":
raw_url = get_field(result, "url", None) or get_field(result, "source_url", None) or get_field(result, "target", None)
if isinstance(raw_url, list) and raw_url:
raw_url = raw_url[0]
if isinstance(raw_url, str) and raw_url.strip().startswith(("http://", "https://")):
url_hint = raw_url.strip()
query_hint = url_hint or identifier_query or combined_query or title_hint
if not query_hint:
log("No title or identifier available to search for metadata", file=sys.stderr)
return 1
@@ -967,6 +1302,27 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
if not items:
log("No metadata results found", file=sys.stderr)
return 1
# For yt-dlp, emit tags directly (there is no meaningful multi-result selection step).
if provider.name == "ytdlp":
try:
tags = [str(t) for t in provider.to_tags(items[0]) if t is not None]
except Exception:
tags = []
if not tags:
log("No tags extracted from yt-dlp metadata", file=sys.stderr)
return 1
_emit_tags_as_table(
tags_list=list(tags),
file_hash=None,
store="url",
service_name=None,
config=config,
item_title=str(items[0].get("title") or "ytdlp"),
path=None,
subject={"provider": "ytdlp", "url": str(query_hint)},
)
return 0
from result_table import ResultTable
table = ResultTable(f"Metadata: {provider.name}")
@@ -1040,7 +1396,10 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
return 0
# Apply tags to the store backend (no sidecar writing here).
apply_tags = _filter_scraped_tags([str(t) for t in result_tags if t is not None])
if str(result_provider).strip().lower() == "ytdlp":
apply_tags = [str(t) for t in result_tags if t is not None]
else:
apply_tags = _filter_scraped_tags([str(t) for t in result_tags if t is not None])
if not apply_tags:
log("No applicable scraped tags to apply (title:/artist:/source: are skipped)", file=sys.stderr)
return 0
@@ -1167,6 +1526,11 @@ try:
except Exception:
_SCRAPE_CHOICES = ["itunes", "openlibrary", "googlebooks", "google", "musicbrainz"]
# Special scrape mode: pull tags from an item's URL via yt-dlp (no download)
if "ytdlp" not in _SCRAPE_CHOICES:
_SCRAPE_CHOICES.append("ytdlp")
_SCRAPE_CHOICES = sorted(_SCRAPE_CHOICES)
class Get_Tag(Cmdlet):
"""Class-based get-tag cmdlet with self-registration."""
@@ -1195,7 +1559,7 @@ class Get_Tag(Cmdlet):
CmdletArg(
name="-scrape",
type="string",
description="Scrape metadata from URL or provider name (returns tags as JSON or table)",
description="Scrape metadata from URL/provider, or use 'ytdlp' to scrape from the item's URL and overwrite tags",
required=False,
choices=_SCRAPE_CHOICES,
)

View File

@@ -14,10 +14,11 @@ import httpx
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple
from urllib.parse import urlsplit, quote, urljoin
from urllib.parse import urlsplit, quote, urljoin, unquote
from SYS.logger import log, debug
from API.HTTP import HTTPClient
from SYS.pipeline_progress import PipelineProgress
from SYS.utils import ensure_directory, unique_path, unique_preserve_order
from . import _shared as sh
@@ -31,54 +32,6 @@ get_field = sh.get_field
parse_cmdlet_args = sh.parse_cmdlet_args
import pipeline as pipeline_context
def _live_ui_and_pipe_index() -> tuple[Optional[Any], int]:
ui = None
try:
ui = pipeline_context.get_live_progress() if hasattr(pipeline_context, "get_live_progress") else None
except Exception:
ui = None
pipe_idx: int = 0
try:
stage_ctx = pipeline_context.get_stage_context() if hasattr(pipeline_context, "get_stage_context") else None
maybe_idx = getattr(stage_ctx, "pipe_index", None) if stage_ctx is not None else None
if isinstance(maybe_idx, int):
pipe_idx = int(maybe_idx)
except Exception:
pipe_idx = 0
return ui, pipe_idx
def _begin_live_steps(total_steps: int) -> None:
"""Declare the total number of steps for this cmdlet run (per-pipe)."""
ui, pipe_idx = _live_ui_and_pipe_index()
if ui is None:
return
try:
begin = getattr(ui, "begin_pipe_steps", None)
if callable(begin):
begin(int(pipe_idx), total_steps=int(total_steps))
except Exception:
return
def _step(text: str) -> None:
"""Emit a *new* step.
Each call increments the step counter and advances percent automatically.
"""
ui, pipe_idx = _live_ui_and_pipe_index()
if ui is None:
return
try:
adv = getattr(ui, "advance_pipe_step", None)
if callable(adv):
adv(int(pipe_idx), str(text))
except Exception:
return
# ============================================================================
# CMDLET Metadata Declaration
# ============================================================================
@@ -115,6 +68,10 @@ USER_AGENT = (
DEFAULT_VIEWPORT: dict[str, int] = {"width": 1920, "height": 1080}
ARCHIVE_TIMEOUT = 30.0
# WebP has a hard maximum dimension per side.
# Pillow typically fails with: "encoding error 5: Image size exceeds WebP limit of 16383 pixels"
WEBP_MAX_DIM = 16_383
# Configurable selectors for specific websites
SITE_SELECTORS: Dict[str, List[str]] = {
"twitter.com": [
@@ -200,6 +157,80 @@ def _slugify_url(url: str) -> str:
return slug[:100]
def _tags_from_url(url: str) -> List[str]:
"""Derive simple tags from a URL.
- site:<domain> (strips leading www.)
- title:<slug> derived from the last path segment, with extension removed
and separators (-, _, %) normalized to spaces.
"""
u = str(url or "").strip()
if not u:
return []
parsed = None
try:
parsed = urlsplit(u)
host = str(getattr(parsed, "hostname", None) or getattr(parsed, "netloc", "") or "").strip().lower()
except Exception:
parsed = None
host = ""
if host:
# Drop credentials and port if present.
if "@" in host:
host = host.rsplit("@", 1)[-1]
if ":" in host:
host = host.split(":", 1)[0]
if host.startswith("www."):
host = host[len("www.") :]
path = ""
if parsed is not None:
try:
path = str(getattr(parsed, "path", "") or "")
except Exception:
path = ""
last = ""
if path:
try:
last = path.rsplit("/", 1)[-1]
except Exception:
last = ""
try:
last = unquote(last or "")
except Exception:
last = last or ""
if last and "." in last:
# Drop a single trailing extension (e.g. .html, .php).
last = last.rsplit(".", 1)[0]
for sep in ("_", "-", "%"):
if last and sep in last:
last = last.replace(sep, " ")
title = " ".join(str(last or "").split()).strip().lower()
tags: List[str] = []
if host:
tags.append(f"site:{host}")
if title:
tags.append(f"title:{title}")
return tags
def _title_from_url(url: str) -> str:
"""Return the normalized title derived from a URL's last path segment."""
for t in _tags_from_url(url):
if str(t).lower().startswith("title:"):
return str(t)[len("title:") :].strip()
return ""
def _normalise_format(fmt: Optional[str]) -> str:
"""Normalize output format to valid values."""
if not fmt:
@@ -218,6 +249,89 @@ def _format_suffix(fmt: str) -> str:
return ".jpg"
return f".{fmt}"
def _convert_to_webp(
src_png: Path,
dst_webp: Path,
*,
quality: int = 90,
method: int = 6,
max_dim: int = WEBP_MAX_DIM,
downscale_if_oversize: bool = True,
) -> bool:
"""Convert a PNG screenshot to WebP via Pillow.
Playwright does not currently support emitting WebP directly.
"""
if not src_png or not Path(src_png).is_file():
raise ScreenshotError(f"Source image not found: {src_png}")
dst_webp = Path(dst_webp)
try:
dst_webp.parent.mkdir(parents=True, exist_ok=True)
except Exception:
pass
try:
from PIL import Image
except Exception as exc:
raise ScreenshotError(f"Pillow is required for webp conversion: {exc}") from exc
# Write atomically to avoid partial files if conversion is interrupted.
tmp_path = unique_path(dst_webp.with_suffix(".tmp.webp"))
try:
with Image.open(src_png) as im:
did_downscale = False
save_kwargs: Dict[str, Any] = {
"format": "WEBP",
"quality": int(quality),
"method": int(method),
}
# Preserve alpha when present; Pillow handles it for WEBP.
# Normalize palette images to RGBA to avoid odd palette artifacts.
if im.mode == "P":
im = im.convert("RGBA")
# WebP enforces a hard max dimension per side (16383px).
# When full-page captures are very tall, downscale proportionally to fit.
try:
w, h = im.size
except Exception:
w, h = 0, 0
if downscale_if_oversize and isinstance(max_dim, int) and max_dim > 0 and (w > max_dim or h > max_dim):
scale = 1.0
try:
scale = min(float(max_dim) / float(w), float(max_dim) / float(h))
except Exception:
scale = 1.0
if scale > 0.0 and scale < 1.0:
new_w = max(1, int(w * scale))
new_h = max(1, int(h * scale))
debug(
f"[_convert_to_webp] Image exceeds WebP limit ({w}x{h}); downscaling -> {new_w}x{new_h}"
)
try:
resample = getattr(getattr(Image, "Resampling", Image), "LANCZOS", None)
if resample is None:
resample = getattr(Image, "LANCZOS", 1)
im = im.resize((new_w, new_h), resample=resample)
did_downscale = True
except Exception as exc:
debug(f"[_convert_to_webp] Downscale failed; attempting direct WEBP save anyway: {exc}")
im.save(tmp_path, **save_kwargs)
tmp_path.replace(dst_webp)
return bool(did_downscale)
finally:
try:
tmp_path.unlink(missing_ok=True)
except Exception:
pass
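# Example (hedged sketch) of the downscale math above: a very tall full-page capture is
# scaled proportionally so both sides fit inside the WebP per-side limit.
_w, _h, _max = 1920, 40_000, WEBP_MAX_DIM
_scale = min(_max / _w, _max / _h)
_new_w, _new_h = max(1, int(_w * _scale)), max(1, int(_h * _scale))
assert _new_w <= _max and _new_h <= _max
assert abs((_new_w / _new_h) - (_w / _h)) < 0.01  # aspect ratio is preserved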
def _matched_site_selectors(url: str) -> List[str]:
"""Return SITE_SELECTORS for a matched domain; empty if no match.
@@ -231,6 +345,16 @@ def _matched_site_selectors(url: str) -> List[str]:
return sels
def _selectors_for_url(url: str) -> List[str]:
"""Return selectors to try for a URL.
For now, prefer a minimal behavior: only return known SITE_SELECTORS.
(The cmdlet already falls back to full-page capture when no selectors match.)
"""
return _matched_site_selectors(url)
def _platform_preprocess(url: str, page: Any, warnings: List[str], timeout_ms: int = 10_000) -> None:
"""Best-effort page tweaks for popular platforms before capture."""
try:
@@ -366,11 +490,11 @@ def _prepare_output_path(options: ScreenshotOptions) -> Path:
return unique_path(path)
def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str]) -> None:
def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str], progress: PipelineProgress) -> None:
"""Capture screenshot using Playwright."""
debug(f"[_capture] Starting capture for {options.url} -> {destination}")
try:
_step("loading launching browser")
progress.step("loading launching browser")
tool = options.playwright_tool or PlaywrightTool({})
# Ensure Chromium engine is used for the screen-shot cmdlet (force for consistency)
@@ -405,16 +529,16 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
try:
with tool.open_page(headless=headless) as page:
_step("loading navigating")
progress.step("loading navigating")
debug(f"Navigating to {options.url}...")
try:
tool.goto(page, options.url)
debug("Page loaded successfully")
_step("loading page loaded")
progress.step("loading page loaded")
except PlaywrightTimeoutError:
warnings.append("navigation timeout; capturing current page state")
debug("Navigation timeout; proceeding with current state")
_step("loading navigation timeout")
progress.step("loading navigation timeout")
# Skip article lookup by default (wait_for_article defaults to False)
if options.wait_for_article:
@@ -430,9 +554,9 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
debug(f"Waiting {options.wait_after_load}s for page stabilization...")
time.sleep(min(10.0, max(0.0, options.wait_after_load)))
_step("loading stabilized")
progress.step("loading stabilized")
_step("capturing preparing")
progress.step("capturing preparing")
if options.replace_video_posters:
debug("Replacing video elements with posters...")
page.evaluate(
@@ -453,7 +577,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
if options.prefer_platform_target and format_name != "pdf":
debug(f"[_capture] Target capture enabled")
debug("Attempting platform-specific content capture...")
_step("capturing locating target")
progress.step("capturing locating target")
try:
_platform_preprocess(options.url, page, warnings)
except Exception as e:
@@ -478,7 +602,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
el.scroll_into_view_if_needed(timeout=1000)
except Exception:
pass
_step("capturing output")
progress.step("capturing output")
debug(f"Capturing element to {destination}...")
el.screenshot(path=str(destination), type=("jpeg" if format_name == "jpeg" else None))
element_captured = True
@@ -489,14 +613,14 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
debug(f"Failed to capture element: {exc}")
# Fallback to default capture paths
if element_captured:
_step("capturing saved")
progress.step("capturing saved")
elif format_name == "pdf":
debug("Generating PDF...")
page.emulate_media(media="print")
_step("capturing output")
progress.step("capturing output")
page.pdf(path=str(destination), print_background=True)
debug(f"PDF saved to {destination}")
_step("capturing saved")
progress.step("capturing saved")
else:
debug(f"Capturing full page to {destination}...")
screenshot_kwargs: Dict[str, Any] = {"path": str(destination)}
@@ -504,20 +628,20 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
screenshot_kwargs["type"] = "jpeg"
screenshot_kwargs["quality"] = 90
if options.full_page:
_step("capturing output")
progress.step("capturing output")
page.screenshot(full_page=True, **screenshot_kwargs)
else:
article = page.query_selector("article")
if article is not None:
article_kwargs = dict(screenshot_kwargs)
article_kwargs.pop("full_page", None)
_step("capturing output")
progress.step("capturing output")
article.screenshot(**article_kwargs)
else:
_step("capturing output")
progress.step("capturing output")
page.screenshot(**screenshot_kwargs)
debug(f"Screenshot saved to {destination}")
_step("capturing saved")
progress.step("capturing saved")
except Exception as exc:
debug(f"[_capture] Exception launching browser/page: {exc}")
msg = str(exc).lower()
@@ -532,7 +656,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
raise ScreenshotError(f"Failed to capture screenshot: {exc}") from exc
def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
def _capture_screenshot(options: ScreenshotOptions, progress: PipelineProgress) -> ScreenshotResult:
"""Capture a screenshot for the given options."""
debug(f"[_capture_screenshot] Preparing capture for {options.url}")
requested_format = _normalise_format(options.output_format)
@@ -543,8 +667,8 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
will_convert = requested_format == "webp"
will_archive = bool(options.archive and options.url)
total_steps = 9 + (1 if will_target else 0) + (1 if will_convert else 0) + (1 if will_archive else 0)
_begin_live_steps(total_steps)
_step("loading starting")
progress.begin_steps(total_steps)
progress.step("loading starting")
# Playwright screenshots do not natively support WebP output.
# Capture as PNG, then convert via Pillow.
@@ -553,17 +677,22 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
capture_path = unique_path(destination.with_suffix(".png"))
debug(f"[_capture_screenshot] Requested webp; capturing intermediate png -> {capture_path}")
options.output_format = "png"
_capture(options, capture_path, warnings)
_capture(options, capture_path, warnings, progress)
if requested_format == "webp":
_step("capturing converting to webp")
progress.step("capturing converting to webp")
debug(f"[_capture_screenshot] Converting png -> webp: {destination}")
try:
_convert_to_webp(capture_path, destination)
try:
capture_path.unlink(missing_ok=True)
except Exception:
pass
did_downscale = _convert_to_webp(capture_path, destination)
if did_downscale:
warnings.append(
f"webp conversion used downscaling to fit {WEBP_MAX_DIM}px limit; keeping original png: {capture_path.name}"
)
else:
try:
capture_path.unlink(missing_ok=True)
except Exception:
pass
except Exception as exc:
warnings.append(f"webp conversion failed; keeping png: {exc}")
destination = capture_path
@@ -572,7 +701,7 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
url: List[str] = [options.url] if options.url else []
archive_url: List[str] = []
if options.archive and options.url:
_step("capturing archiving")
progress.step("capturing archiving")
debug(f"[_capture_screenshot] Archiving enabled for {options.url}")
archives, archive_warnings = _archive_url(options.url, options.archive_timeout)
archive_url.extend(archives)
@@ -580,7 +709,7 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
if archives:
url = unique_preserve_order([*url, *archives])
_step("capturing finalized")
progress.step("capturing finalized")
applied_tag = unique_preserve_order(list(tag for tag in options.tag if tag.strip()))
@@ -627,6 +756,8 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
)
return 1
progress = PipelineProgress(pipeline_context)
# ========================================================================
# ARGUMENT PARSING
# ========================================================================
@@ -685,32 +816,6 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
debug(f"[_run] url to process: {[u for u, _ in url_to_process]}")
# If the caller isn't running the shared pipeline Live progress UI (e.g. direct
# cmdlet execution), start a minimal local pipeline progress panel so this cmdlet
# still shows step-level progress.
local_progress_ui = None
try:
existing_ui = pipeline_context.get_live_progress() if hasattr(pipeline_context, "get_live_progress") else None
except Exception:
existing_ui = None
try:
if existing_ui is None and bool(getattr(sys.stderr, "isatty", lambda: False)()):
from models import PipelineLiveProgress
local_progress_ui = PipelineLiveProgress(["screen-shot"], enabled=True)
local_progress_ui.start()
try:
if hasattr(pipeline_context, "set_live_progress"):
pipeline_context.set_live_progress(local_progress_ui)
except Exception:
pass
try:
local_progress_ui.begin_pipe(0, total_items=len(url_to_process), items_preview=[u for u, _ in url_to_process])
except Exception:
pass
except Exception:
local_progress_ui = None
# ========================================================================
# OUTPUT DIRECTORY RESOLUTION - Priority chain
# ========================================================================
@@ -749,6 +854,18 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
ensure_directory(screenshot_dir)
# If the caller isn't running the shared pipeline Live progress UI (e.g. direct
# cmdlet execution), start a minimal local pipeline progress panel so this cmdlet
# still shows step-level progress.
try:
progress.ensure_local_ui(
label="screen-shot",
total_items=len(url_to_process),
items_preview=[u for u, _ in url_to_process],
)
except Exception:
pass
# ========================================================================
# PREPARE SCREENSHOT OPTIONS
# ========================================================================
@@ -850,7 +967,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
options.target_selectors = auto_selectors
debug(f"[screen_shot] Auto selectors matched for url: {auto_selectors}")
screenshot_result = _capture_screenshot(options)
screenshot_result = _capture_screenshot(options, progress)
# Log results and warnings
debug(f"Screenshot captured to {screenshot_result.path}")
@@ -875,15 +992,18 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
capture_date = datetime.now().date().isoformat()
upstream_title = _clean_title(_extract_item_title(origin_item))
display_title = upstream_title or url
url_title = _title_from_url(url)
display_title = upstream_title or url_title or url
upstream_tags = _extract_item_tags(origin_item)
filtered_upstream_tags = [
t for t in upstream_tags
if not str(t).strip().lower().startswith(("type:", "date:"))
]
url_tags = _tags_from_url(url)
merged_tags = unique_preserve_order(
["type:screenshot", f"date:{capture_date}"] + filtered_upstream_tags
["type:screenshot", f"date:{capture_date}"] + filtered_upstream_tags + url_tags
)
pipe_obj = create_pipe_object_result(
@@ -910,11 +1030,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
all_emitted.append(pipe_obj)
# If we created a local progress UI, advance it per completed item.
if local_progress_ui is not None:
try:
local_progress_ui.on_emit(0, pipe_obj)
except Exception:
pass
progress.on_emit(pipe_obj)
except ScreenshotError as exc:
log(f"Error taking screenshot of {url}: {exc}", file=sys.stderr)
@@ -925,23 +1041,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
traceback.print_exc(file=sys.stderr)
exit_code = 1
try:
if local_progress_ui is not None:
try:
local_progress_ui.finish_pipe(0, force_complete=True)
except Exception:
pass
finally:
if local_progress_ui is not None:
try:
local_progress_ui.stop()
except Exception:
pass
try:
if hasattr(pipeline_context, "set_live_progress"):
pipeline_context.set_live_progress(None)
except Exception:
pass
progress.close_local_ui(force_complete=True)
if not all_emitted:
log(f"No screenshots were successfully captured", file=sys.stderr)

View File

@@ -336,6 +336,18 @@ def _resolve_upload_path(item: Any, config: Dict[str, Any]) -> Optional[str]:
def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
# Internal stage: send previously selected items to selected rooms.
if any(str(a).lower() == "-send" for a in (args or [])):
# Ensure we don't re-print the rooms picker table on the send stage.
try:
if hasattr(ctx, "set_last_result_table_overlay"):
ctx.set_last_result_table_overlay(None, None, None)
except Exception:
pass
try:
if hasattr(ctx, "set_current_stage_table"):
ctx.set_current_stage_table(None)
except Exception:
pass
rooms = _normalize_to_list(result)
room_ids: List[str] = []
for r in rooms:
@@ -430,7 +442,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
log("No joined rooms found.", file=sys.stderr)
return 0
table = ResultTable("Matrix Rooms")
table = ResultTable("Matrix Rooms (select with @N)")
table.set_table("matrix")
table.set_source_command(".matrix", [])
@@ -461,12 +473,6 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
ctx.set_last_result_table_overlay(table, room_items)
ctx.set_current_stage_table(table)
ctx.set_pending_pipeline_tail([[".matrix", "-send"]], ".matrix")
print()
from rich_display import stdout_console
stdout_console().print(table)
print("\nSelect room(s) with @N (e.g. @1 or @1-3) to send the selected item(s)")
return 0
CMDLET = Cmdlet(

View File

@@ -1,6 +1,6 @@
# Medios-Macina
Medios-Macina is a CLI-first media ingestion and management toolkit focused on reliably downloading, tagging, and storing media (audio, video, images, and text) from a variety of providers and sources. It is designed around a compact, pipeable command language ("cmdlets") so complex workflows can be composed simply and repeatably.
Medios-Macina is a CLI media manager and toolkit focused on downloading, tagging, and storing media (audio, video, images, and text) from a variety of providers and sources. It is designed around a compact, pipeable command language ("cmdlets") so complex workflows can be composed simply and repeatably.
## Highlights ✅
- Flexible pipeline-based CLI: chain cmdlets with `|` and use saved selections with `@N`.