This commit is contained in:
nose
2025-12-21 16:59:37 -08:00
parent 11a13edb84
commit d0b821b5dd
7 changed files with 508 additions and 136 deletions

View File

@@ -643,7 +643,7 @@ class Add_File(Cmdlet):
# Run search-store under a temporary stage context so its ctx.emit() calls
# don't interfere with the outer add-file pipeline stage.
prev_ctx = ctx.get_stage_context()
temp_ctx = ctx.PipelineStageContext(stage_index=0, total_stages=1, worker_id=getattr(prev_ctx, "worker_id", None))
temp_ctx = ctx.PipelineStageContext(stage_index=0, total_stages=1, pipe_index=0, worker_id=getattr(prev_ctx, "worker_id", None))
ctx.set_stage_context(temp_ctx)
try:
code = search_store_cmdlet.run(None, args, config)
@@ -1472,7 +1472,7 @@ class Add_File(Cmdlet):
# Run search-store under a temporary stage context so its ctx.emit() calls
# don't interfere with the outer add-file pipeline stage.
prev_ctx = ctx.get_stage_context()
temp_ctx = ctx.PipelineStageContext(stage_index=0, total_stages=1, worker_id=getattr(prev_ctx, "worker_id", None))
temp_ctx = ctx.PipelineStageContext(stage_index=0, total_stages=1, pipe_index=0, worker_id=getattr(prev_ctx, "worker_id", None))
ctx.set_stage_context(temp_ctx)
try:
code = search_store_cmdlet.run(None, args, config)

View File

@@ -48,6 +48,64 @@ coerce_to_pipe_object = sh.coerce_to_pipe_object
get_field = sh.get_field
def _live_ui_and_pipe_index() -> tuple[Optional[Any], int]:
ui = None
try:
ui = pipeline_context.get_live_progress() if hasattr(pipeline_context, "get_live_progress") else None
except Exception:
ui = None
pipe_idx: int = 0
try:
stage_ctx = pipeline_context.get_stage_context() if hasattr(pipeline_context, "get_stage_context") else None
maybe_idx = getattr(stage_ctx, "pipe_index", None) if stage_ctx is not None else None
if isinstance(maybe_idx, int):
pipe_idx = int(maybe_idx)
except Exception:
pipe_idx = 0
return ui, pipe_idx
def _begin_live_steps(total_steps: int) -> None:
"""Declare the total number of steps for the current pipe."""
ui, pipe_idx = _live_ui_and_pipe_index()
if ui is None:
return
try:
begin = getattr(ui, "begin_pipe_steps", None)
if callable(begin):
begin(int(pipe_idx), total_steps=int(total_steps))
except Exception:
return
def _step(text: str) -> None:
"""Emit a *new* step (increments i/N and advances percent automatically)."""
ui, pipe_idx = _live_ui_and_pipe_index()
if ui is None:
return
try:
adv = getattr(ui, "advance_pipe_step", None)
if callable(adv):
adv(int(pipe_idx), str(text))
except Exception:
return
def _set_pipe_percent(percent: int) -> None:
"""Best-effort percent update without changing step text."""
ui, pipe_idx = _live_ui_and_pipe_index()
if ui is None:
return
try:
set_pct = getattr(ui, "set_pipe_percent", None)
if callable(set_pct):
set_pct(int(pipe_idx), int(percent))
except Exception:
return
# Minimal inlined helpers from helper/download.py (is_url_supported_by_ytdlp, list_formats)
try:
import yt_dlp # type: ignore
@@ -353,7 +411,17 @@ def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sect
session_id = hashlib.md5((url + str(time.time()) + ''.join(random.choices(string.ascii_letters, k=10))).encode()).hexdigest()[:12]
first_section_info = None
total_sections = len(sections_list)
for section_idx, section in enumerate(sections_list, 1):
# While step 1/2 is "downloading", keep the pipe bar moving for multi-section clips.
# Map sections onto 50..99 so step 2/2 can still jump to 100.
try:
if total_sections > 0:
pct = 50 + int(((section_idx - 1) / max(1, total_sections)) * 49)
_set_pipe_percent(pct)
except Exception:
pass
base_outtmpl = ytdl_options.get("outtmpl", "%(title)s.%(ext)s")
output_dir_path = Path(base_outtmpl).parent
filename_tmpl = f"{session_id}_{section_idx}"
@@ -385,6 +453,17 @@ def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sect
debug(f"Error extracting metadata: {e}")
cmd = ["yt-dlp"]
if quiet:
cmd.append("--quiet")
cmd.append("--no-warnings")
cmd.append("--no-progress")
# Keep ffmpeg/merger output from taking over the terminal.
cmd.extend(["--postprocessor-args", "ffmpeg:-hide_banner -loglevel error"])
if ytdl_options.get("ffmpeg_location"):
try:
cmd.extend(["--ffmpeg-location", str(ytdl_options["ffmpeg_location"])])
except Exception:
pass
if ytdl_options.get("format"):
cmd.extend(["-f", ytdl_options["format"]])
if ytdl_options.get("merge_output_format"):
@@ -413,7 +492,7 @@ def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sect
cmd.append("--write-auto-sub")
cmd.extend(["--sub-format", "vtt"])
if ytdl_options.get("force_keyframes_at_cuts"):
cmd.extend(["--force-keyframes-at-cuts"]) if ytdl_options.get("force_keyframes_at_cuts") else None
cmd.append("--force-keyframes-at-cuts")
cmd.extend(["-o", section_outtmpl])
if ytdl_options.get("cookiefile"):
cookies_path = ytdl_options["cookiefile"].replace("\\", "/")
@@ -428,10 +507,23 @@ def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sect
if not quiet:
debug(f"Running yt-dlp for section: {section}")
try:
subprocess.run(cmd, check=True)
if quiet:
subprocess.run(cmd, check=True, capture_output=True, text=True)
else:
subprocess.run(cmd, check=True)
except subprocess.CalledProcessError as exc:
stderr_text = (exc.stderr or "")
tail = "\n".join(stderr_text.splitlines()[-12:]).strip()
details = f"\n{tail}" if tail else ""
raise DownloadError(f"yt-dlp failed for section {section} (exit {exc.returncode}){details}") from exc
except Exception as exc:
if not quiet:
debug(f"yt-dlp error for section {section}: {exc}")
raise DownloadError(f"yt-dlp failed for section {section}: {exc}") from exc
# Mark near-complete before returning so the runner can finalize cleanly.
try:
_set_pipe_percent(99)
except Exception:
pass
return session_id, first_section_info or {}
@@ -720,30 +812,16 @@ def download_media(
session_id = None
first_section_info = {}
if ytdl_options.get("download_sections"):
# The CLI path emits yt-dlp's own progress output; pause the pipeline Live UI
# so those progress bars remain visible instead of being clobbered.
try:
from contextlib import nullcontext
except Exception:
nullcontext = None # type: ignore
suspend = getattr(pipeline_context, "suspend_live_progress", None)
cm = suspend() if callable(suspend) else (nullcontext() if nullcontext else None)
if cm is None:
session_id, first_section_info = _download_with_sections_via_cli(
opts.url,
ytdl_options,
ytdl_options.get("download_sections", []),
quiet=opts.quiet,
)
else:
with cm:
session_id, first_section_info = _download_with_sections_via_cli(
opts.url,
ytdl_options,
ytdl_options.get("download_sections", []),
quiet=opts.quiet,
)
# For clip (download_sections), keep pipeline Live UI active and suppress
# yt-dlp/ffmpeg CLI spam when running in quiet/pipeline mode.
live_ui, _ = _live_ui_and_pipe_index()
quiet_sections = bool(opts.quiet) or (live_ui is not None)
session_id, first_section_info = _download_with_sections_via_cli(
opts.url,
ytdl_options,
ytdl_options.get("download_sections", []),
quiet=quiet_sections,
)
info = None
else:
with yt_dlp.YoutubeDL(ytdl_options) as ydl: # type: ignore[arg-type]
@@ -2168,7 +2246,6 @@ class Download_Media(Cmdlet):
pipeline_context.set_last_result_table(table, results_list)
log(f"", file=sys.stderr)
log(f"Use: @N to select and download format", file=sys.stderr)
return 0
# Download each URL
@@ -2196,6 +2273,11 @@ class Download_Media(Cmdlet):
log(f"Skipping download: {url}", file=sys.stderr)
continue
# Step progress is per-URL download.
# Keep steps meaningful: long-running download + finalize.
# (Fast internal bookkeeping should not be steps.)
_begin_live_steps(2)
# If playlist_items is specified but looks like a format ID (e.g. from table selection),
# treat it as a format selector instead of playlist items.
# This handles the case where @N selection passes -item <format_id>
@@ -2274,6 +2356,7 @@ class Download_Media(Cmdlet):
write_sub=write_sub,
)
_step("downloading")
# Use timeout wrapper to prevent hanging
debug(f"Starting download with 5-minute timeout...")
result_obj = _download_with_timeout(opts, timeout_seconds=300)
@@ -2416,6 +2499,10 @@ class Download_Media(Cmdlet):
except Exception:
pass
# Complete the step sequence: we return here and the user must
# re-run with @N selection.
_step("awaiting selection")
log("Requested format is not available; select a working format with @N", file=sys.stderr)
return 0
@@ -2522,6 +2609,10 @@ class Download_Media(Cmdlet):
debug(f"Emitting {len(pipe_objects)} result(s) to pipeline...")
# Mark complete *before* the first emit, because the pipeline clears the
# status line on emit().
_step("finalized")
stage_ctx = pipeline_context.get_stage_context()
emit_enabled = bool(stage_ctx is not None and not getattr(stage_ctx, "is_last_stage", False))
for pipe_obj_dict in pipe_objects:

View File

@@ -32,20 +32,52 @@ parse_cmdlet_args = sh.parse_cmdlet_args
import pipeline as pipeline_context
def _set_live_step(text: str) -> None:
"""Best-effort update to the pipeline Live progress title (if enabled)."""
def _live_ui_and_pipe_index() -> tuple[Optional[Any], int]:
ui = None
try:
ui = pipeline_context.get_live_progress() if hasattr(pipeline_context, "get_live_progress") else None
except Exception:
ui = None
pipe_idx: int = 0
try:
stage_ctx = pipeline_context.get_stage_context() if hasattr(pipeline_context, "get_stage_context") else None
maybe_idx = getattr(stage_ctx, "pipe_index", None) if stage_ctx is not None else None
if isinstance(maybe_idx, int):
pipe_idx = int(maybe_idx)
except Exception:
pipe_idx = 0
return ui, pipe_idx
def _begin_live_steps(total_steps: int) -> None:
"""Declare the total number of steps for this cmdlet run (per-pipe)."""
ui, pipe_idx = _live_ui_and_pipe_index()
if ui is None:
return
try:
setter = getattr(ui, "set_active_subtask_text", None)
if callable(setter):
setter(str(text or "").strip())
begin = getattr(ui, "begin_pipe_steps", None)
if callable(begin):
begin(int(pipe_idx), total_steps=int(total_steps))
except Exception:
pass
return
def _step(text: str) -> None:
"""Emit a *new* step.
Each call increments the step counter and advances percent automatically.
"""
ui, pipe_idx = _live_ui_and_pipe_index()
if ui is None:
return
try:
adv = getattr(ui, "advance_pipe_step", None)
if callable(adv):
adv(int(pipe_idx), str(text))
except Exception:
return
# ============================================================================
# CMDLET Metadata Declaration
@@ -186,28 +218,6 @@ def _format_suffix(fmt: str) -> str:
return ".jpg"
return f".{fmt}"
def _convert_to_webp(source_path: Path, dest_path: Path) -> None:
"""Convert an image file to WebP using Pillow."""
from PIL import Image
with Image.open(source_path) as img:
# Keep a sensible default: good quality + small size.
img.save(dest_path, format="WEBP", quality=100, method=6)
def _selectors_for_url(url: str) -> List[str]:
"""Return a list of likely content selectors for known platforms."""
u = url.lower()
sels: List[str] = []
for domain, selectors in SITE_SELECTORS.items():
if domain in u:
sels.extend(selectors)
return sels or ["article"]
def _matched_site_selectors(url: str) -> List[str]:
"""Return SITE_SELECTORS for a matched domain; empty if no match.
@@ -223,46 +233,47 @@ def _matched_site_selectors(url: str) -> List[str]:
def _platform_preprocess(url: str, page: Any, warnings: List[str], timeout_ms: int = 10_000) -> None:
"""Best-effort page tweaks for popular platforms before capture."""
u = url.lower()
try:
u = str(url or "").lower()
def _try_click_texts(texts: List[str], passes: int = 2, per_timeout: int = 700) -> int:
clicks = 0
for _ in range(max(1, passes)):
for t in texts:
try:
page.locator(f"text=/{t}/i").first.click(timeout=per_timeout)
clicks += 1
except PlaywrightTimeoutError:
pass
except Exception:
pass
time.sleep(0.1)
return clicks
def _try_click_buttons(names: List[str], passes: int = 2, per_timeout: int = 700) -> int:
clicks = 0
for _ in range(max(1, int(passes))):
for name in names:
try:
locator = page.get_by_role("button", name=name)
locator.first.click(timeout=int(per_timeout))
clicks += 1
except Exception:
pass
return clicks
# Dismiss common cookie/consent prompts
_try_click_texts(["accept", "i agree", "agree", "got it", "allow all", "consent"])
# Dismiss common cookie / consent prompts.
_try_click_buttons([
"Accept all",
"Accept",
"I agree",
"Agree",
"Allow all",
"OK",
])
# Platform-specific expansions
if "reddit.com" in u:
_try_click_texts(["see more", "read more", "show more", "more"])
if ("twitter.com" in u) or ("x.com" in u):
_try_click_texts(["show more", "more"])
if "instagram.com" in u:
_try_click_texts(["more", "see more"])
if "tiktok.com" in u:
_try_click_texts(["more", "see more"])
if ("facebook.com" in u) or ("fb.watch" in u):
_try_click_texts(["see more", "show more", "more"])
if "rumble.com" in u:
_try_click_texts(["accept", "agree", "close"])
# Some sites need small nudges (best-effort).
if "reddit.com" in u:
_try_click_buttons(["Accept all", "Accept"])
if ("twitter.com" in u) or ("x.com" in u):
_try_click_buttons(["Accept all", "Accept"])
if "instagram.com" in u:
_try_click_buttons(["Allow all", "Accept all", "Accept"])
except Exception as exc:
debug(f"[_platform_preprocess] skipped: {exc}")
return
def _submit_wayback(url: str, timeout: float) -> Optional[str]:
"""Submit URL to Internet Archive Wayback Machine."""
encoded = quote(url, safe="/:?=&")
with HTTPClient() as client:
with HTTPClient(headers={"User-Agent": USER_AGENT}) as client:
response = client.get(f"https://web.archive.org/save/{encoded}")
response.raise_for_status()
content_location = response.headers.get("Content-Location")
if content_location:
return urljoin("https://web.archive.org", content_location)
@@ -359,10 +370,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
"""Capture screenshot using Playwright."""
debug(f"[_capture] Starting capture for {options.url} -> {destination}")
try:
# Two-phase Live progress:
# 1) load + stabilize (ends right after the wait_after_load sleep)
# 2) capture + save (and any post-processing)
_set_live_step("screen-shot: loading")
_step("loading launching browser")
tool = options.playwright_tool or PlaywrightTool({})
# Ensure Chromium engine is used for the screen-shot cmdlet (force for consistency)
@@ -397,13 +405,16 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
try:
with tool.open_page(headless=headless) as page:
_step("loading navigating")
debug(f"Navigating to {options.url}...")
try:
tool.goto(page, options.url)
debug("Page loaded successfully")
_step("loading page loaded")
except PlaywrightTimeoutError:
warnings.append("navigation timeout; capturing current page state")
debug("Navigation timeout; proceeding with current state")
_step("loading navigation timeout")
# Skip article lookup by default (wait_for_article defaults to False)
if options.wait_for_article:
@@ -419,8 +430,9 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
debug(f"Waiting {options.wait_after_load}s for page stabilization...")
time.sleep(min(10.0, max(0.0, options.wait_after_load)))
# Phase 2 begins here (per request).
_set_live_step("screen-shot: capturing")
_step("loading stabilized")
_step("capturing preparing")
if options.replace_video_posters:
debug("Replacing video elements with posters...")
page.evaluate(
@@ -441,6 +453,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
if options.prefer_platform_target and format_name != "pdf":
debug(f"[_capture] Target capture enabled")
debug("Attempting platform-specific content capture...")
_step("capturing locating target")
try:
_platform_preprocess(options.url, page, warnings)
except Exception as e:
@@ -465,6 +478,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
el.scroll_into_view_if_needed(timeout=1000)
except Exception:
pass
_step("capturing output")
debug(f"Capturing element to {destination}...")
el.screenshot(path=str(destination), type=("jpeg" if format_name == "jpeg" else None))
element_captured = True
@@ -475,12 +489,14 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
debug(f"Failed to capture element: {exc}")
# Fallback to default capture paths
if element_captured:
pass
_step("capturing saved")
elif format_name == "pdf":
debug("Generating PDF...")
page.emulate_media(media="print")
_step("capturing output")
page.pdf(path=str(destination), print_background=True)
debug(f"PDF saved to {destination}")
_step("capturing saved")
else:
debug(f"Capturing full page to {destination}...")
screenshot_kwargs: Dict[str, Any] = {"path": str(destination)}
@@ -488,16 +504,20 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
screenshot_kwargs["type"] = "jpeg"
screenshot_kwargs["quality"] = 90
if options.full_page:
_step("capturing output")
page.screenshot(full_page=True, **screenshot_kwargs)
else:
article = page.query_selector("article")
if article is not None:
article_kwargs = dict(screenshot_kwargs)
article_kwargs.pop("full_page", None)
_step("capturing output")
article.screenshot(**article_kwargs)
else:
_step("capturing output")
page.screenshot(**screenshot_kwargs)
debug(f"Screenshot saved to {destination}")
_step("capturing saved")
except Exception as exc:
debug(f"[_capture] Exception launching browser/page: {exc}")
msg = str(exc).lower()
@@ -519,6 +539,13 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
destination = _prepare_output_path(options)
warnings: List[str] = []
will_target = bool(options.prefer_platform_target) and requested_format != "pdf"
will_convert = requested_format == "webp"
will_archive = bool(options.archive and options.url)
total_steps = 9 + (1 if will_target else 0) + (1 if will_convert else 0) + (1 if will_archive else 0)
_begin_live_steps(total_steps)
_step("loading starting")
# Playwright screenshots do not natively support WebP output.
# Capture as PNG, then convert via Pillow.
capture_path = destination
@@ -529,6 +556,7 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
_capture(options, capture_path, warnings)
if requested_format == "webp":
_step("capturing converting to webp")
debug(f"[_capture_screenshot] Converting png -> webp: {destination}")
try:
_convert_to_webp(capture_path, destination)
@@ -544,7 +572,7 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
url: List[str] = [options.url] if options.url else []
archive_url: List[str] = []
if options.archive and options.url:
_set_live_step("screen-shot: archiving")
_step("capturing archiving")
debug(f"[_capture_screenshot] Archiving enabled for {options.url}")
archives, archive_warnings = _archive_url(options.url, options.archive_timeout)
archive_url.extend(archives)
@@ -552,6 +580,8 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
if archives:
url = unique_preserve_order([*url, *archives])
_step("capturing finalized")
applied_tag = unique_preserve_order(list(tag for tag in options.tag if tag.strip()))
return ScreenshotResult(
@@ -768,7 +798,6 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
continue
try:
_set_live_step("screen-shot: starting")
# Create screenshot with provided options
# Force the Playwright engine to Chromium for the screen-shot cmdlet
# (this ensures consistent rendering and supports PDF output requirements).