sdfsdf

2025-12-21 16:59:37 -08:00
parent 11a13edb84
commit d0b821b5dd
7 changed files with 508 additions and 136 deletions
@@ -643,7 +643,7 @@ class Add_File(Cmdlet):
            # Run search-store under a temporary stage context so its ctx.emit() calls
            # don't interfere with the outer add-file pipeline stage.
            prev_ctx = ctx.get_stage_context()
-            temp_ctx = ctx.PipelineStageContext(stage_index=0, total_stages=1, worker_id=getattr(prev_ctx, "worker_id", None))
+            temp_ctx = ctx.PipelineStageContext(stage_index=0, total_stages=1, pipe_index=0, worker_id=getattr(prev_ctx, "worker_id", None))
            ctx.set_stage_context(temp_ctx)
            try:
                code = search_store_cmdlet.run(None, args, config)
@@ -1472,7 +1472,7 @@ class Add_File(Cmdlet):
            # Run search-store under a temporary stage context so its ctx.emit() calls
            # don't interfere with the outer add-file pipeline stage.
            prev_ctx = ctx.get_stage_context()
-            temp_ctx = ctx.PipelineStageContext(stage_index=0, total_stages=1, worker_id=getattr(prev_ctx, "worker_id", None))
+            temp_ctx = ctx.PipelineStageContext(stage_index=0, total_stages=1, pipe_index=0, worker_id=getattr(prev_ctx, "worker_id", None))
            ctx.set_stage_context(temp_ctx)
            try:
                code = search_store_cmdlet.run(None, args, config)
@@ -48,6 +48,64 @@ coerce_to_pipe_object = sh.coerce_to_pipe_object
 get_field = sh.get_field


+def _live_ui_and_pipe_index() -> tuple[Optional[Any], int]:
+    """Return ``(live_ui, pipe_index)``; each part independently falls back to ``None`` / ``0``."""
+    try:
+        get_ui = getattr(pipeline_context, "get_live_progress", None)
+        live_ui = get_ui() if callable(get_ui) else None
+    except Exception:
+        live_ui = None
+    # Stay on pipe 0 unless the stage context exposes an integer ``pipe_index``.
+    index = 0
+    try:
+        get_ctx = getattr(pipeline_context, "get_stage_context", None)
+        candidate = getattr(get_ctx() if callable(get_ctx) else None, "pipe_index", None)
+        if isinstance(candidate, int):
+            index = int(candidate)
+    except Exception:
+        index = 0
+    return live_ui, index
+
+
+def _begin_live_steps(total_steps: int) -> None:
+    """Announce how many steps the current pipe will report to the live UI."""
+    live_ui, index = _live_ui_and_pipe_index()
+    try:
+        # Without a live UI (or without ``begin_pipe_steps``) there is nothing to call.
+        declare = getattr(live_ui, "begin_pipe_steps", None) if live_ui is not None else None
+        if callable(declare):
+            declare(int(index), total_steps=int(total_steps))
+    except Exception:
+        # Best-effort: a failing progress display is ignored.
+        pass
+
+
+def _step(text: str) -> None:
+    """Start a *new* step labelled ``text`` on the current pipe (bumps i/N and percent)."""
+    live_ui, index = _live_ui_and_pipe_index()
+    try:
+        # Without a live UI (or without ``advance_pipe_step``) there is nothing to call.
+        advance = getattr(live_ui, "advance_pipe_step", None) if live_ui is not None else None
+        if callable(advance):
+            advance(int(index), str(text))
+    except Exception:
+        # Best-effort: a failing progress display is ignored.
+        pass
+
+
+def _set_pipe_percent(percent: int) -> None:
+    """Push a percent value to the current pipe's bar, leaving the step text alone."""
+    live_ui, index = _live_ui_and_pipe_index()
+    try:
+        # Without a live UI (or without ``set_pipe_percent``) there is nothing to call.
+        update = getattr(live_ui, "set_pipe_percent", None) if live_ui is not None else None
+        if callable(update):
+            update(int(index), int(percent))
+    except Exception:
+        # Best-effort: a failing progress display is ignored.
+        pass
+
+
 # Minimal inlined helpers from helper/download.py (is_url_supported_by_ytdlp, list_formats)
 try:
    import yt_dlp  # type: ignore
@@ -353,7 +411,17 @@ def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sect
    session_id = hashlib.md5((url + str(time.time()) + ''.join(random.choices(string.ascii_letters, k=10))).encode()).hexdigest()[:12]
    first_section_info = None

+    total_sections = len(sections_list)
    for section_idx, section in enumerate(sections_list, 1):
+        # While step 1/2 is "downloading", keep the pipe bar moving for multi-section clips.
+        # Map sections onto 50..99 so step 2/2 can still jump to 100.
+        try:
+            if total_sections > 0:
+                pct = 50 + int(((section_idx - 1) / max(1, total_sections)) * 49)
+                _set_pipe_percent(pct)
+        except Exception:
+            pass
+
        base_outtmpl = ytdl_options.get("outtmpl", "%(title)s.%(ext)s")
        output_dir_path = Path(base_outtmpl).parent
        filename_tmpl = f"{session_id}_{section_idx}"
@@ -385,6 +453,17 @@ def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sect
                    debug(f"Error extracting metadata: {e}")

        cmd = ["yt-dlp"]
+        if quiet:
+            cmd.append("--quiet")
+            cmd.append("--no-warnings")
+            cmd.append("--no-progress")
+            # Keep ffmpeg/merger output from taking over the terminal.
+            cmd.extend(["--postprocessor-args", "ffmpeg:-hide_banner -loglevel error"])
+        if ytdl_options.get("ffmpeg_location"):
+            try:
+                cmd.extend(["--ffmpeg-location", str(ytdl_options["ffmpeg_location"])])
+            except Exception:
+                pass
        if ytdl_options.get("format"):
            cmd.extend(["-f", ytdl_options["format"]])
        if ytdl_options.get("merge_output_format"):
@@ -413,7 +492,7 @@ def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sect
            cmd.append("--write-auto-sub")
            cmd.extend(["--sub-format", "vtt"])
        if ytdl_options.get("force_keyframes_at_cuts"):
-            cmd.extend(["--force-keyframes-at-cuts"]) if ytdl_options.get("force_keyframes_at_cuts") else None
+            cmd.append("--force-keyframes-at-cuts")
        cmd.extend(["-o", section_outtmpl])
        if ytdl_options.get("cookiefile"):
            cookies_path = ytdl_options["cookiefile"].replace("\\", "/")
@@ -428,10 +507,23 @@ def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sect
        if not quiet:
            debug(f"Running yt-dlp for section: {section}")
        try:
-            subprocess.run(cmd, check=True)
+            if quiet:
+                subprocess.run(cmd, check=True, capture_output=True, text=True)
+            else:
+                subprocess.run(cmd, check=True)
+        except subprocess.CalledProcessError as exc:
+            stderr_text = (exc.stderr or "")
+            tail = "\n".join(stderr_text.splitlines()[-12:]).strip()
+            details = f"\n{tail}" if tail else ""
+            raise DownloadError(f"yt-dlp failed for section {section} (exit {exc.returncode}){details}") from exc
        except Exception as exc:
-            if not quiet:
-                debug(f"yt-dlp error for section {section}: {exc}")
+            raise DownloadError(f"yt-dlp failed for section {section}: {exc}") from exc
+
+    # Mark near-complete before returning so the runner can finalize cleanly.
+    try:
+        _set_pipe_percent(99)
+    except Exception:
+        pass

    return session_id, first_section_info or {}

@@ -720,30 +812,16 @@ def download_media(
        session_id = None
        first_section_info = {}
        if ytdl_options.get("download_sections"):
-            # The CLI path emits yt-dlp's own progress output; pause the pipeline Live UI
-            # so those progress bars remain visible instead of being clobbered.
-            try:
-                from contextlib import nullcontext
-            except Exception:
-                nullcontext = None  # type: ignore
-
-            suspend = getattr(pipeline_context, "suspend_live_progress", None)
-            cm = suspend() if callable(suspend) else (nullcontext() if nullcontext else None)
-            if cm is None:
-                session_id, first_section_info = _download_with_sections_via_cli(
-                    opts.url,
-                    ytdl_options,
-                    ytdl_options.get("download_sections", []),
-                    quiet=opts.quiet,
-                )
-            else:
-                with cm:
-                    session_id, first_section_info = _download_with_sections_via_cli(
-                        opts.url,
-                        ytdl_options,
-                        ytdl_options.get("download_sections", []),
-                        quiet=opts.quiet,
-                    )
+            # For clip (download_sections), keep pipeline Live UI active and suppress
+            # yt-dlp/ffmpeg CLI spam when running in quiet/pipeline mode.
+            live_ui, _ = _live_ui_and_pipe_index()
+            quiet_sections = bool(opts.quiet) or (live_ui is not None)
+            session_id, first_section_info = _download_with_sections_via_cli(
+                opts.url,
+                ytdl_options,
+                ytdl_options.get("download_sections", []),
+                quiet=quiet_sections,
+            )
            info = None
        else:
            with yt_dlp.YoutubeDL(ytdl_options) as ydl:  # type: ignore[arg-type]
@@ -2168,7 +2246,6 @@ class Download_Media(Cmdlet):
                    pipeline_context.set_last_result_table(table, results_list)
                    
                    log(f"", file=sys.stderr)
-                    log(f"Use: @N to select and download format", file=sys.stderr)
                    return 0
            
            # Download each URL
@@ -2196,6 +2273,11 @@ class Download_Media(Cmdlet):
                            log(f"Skipping download: {url}", file=sys.stderr)
                            continue

+                    # Step progress is per-URL download.
+                    # Keep steps meaningful: long-running download + finalize.
+                    # (Fast internal bookkeeping should not be steps.)
+                    _begin_live_steps(2)
+
                    # If playlist_items is specified but looks like a format ID (e.g. from table selection),
                    # treat it as a format selector instead of playlist items.
                    # This handles the case where @N selection passes -item <format_id>
@@ -2274,6 +2356,7 @@ class Download_Media(Cmdlet):
                                write_sub=write_sub,
                            )

+                            _step("downloading")
                            # Use timeout wrapper to prevent hanging
                            debug(f"Starting download with 5-minute timeout...")
                            result_obj = _download_with_timeout(opts, timeout_seconds=300)
@@ -2416,6 +2499,10 @@ class Download_Media(Cmdlet):
                                    except Exception:
                                        pass

+                                    # Complete the step sequence: we return here and the user must
+                                    # re-run with @N selection.
+                                    _step("awaiting selection")
+
                                    log("Requested format is not available; select a working format with @N", file=sys.stderr)
                                    return 0

@@ -2522,6 +2609,10 @@ class Download_Media(Cmdlet):

                    debug(f"Emitting {len(pipe_objects)} result(s) to pipeline...")

+                    # Mark complete *before* the first emit, because the pipeline clears the
+                    # status line on emit().
+                    _step("finalized")
+
                    stage_ctx = pipeline_context.get_stage_context()
                    emit_enabled = bool(stage_ctx is not None and not getattr(stage_ctx, "is_last_stage", False))
                    for pipe_obj_dict in pipe_objects:
@@ -32,20 +32,52 @@ parse_cmdlet_args = sh.parse_cmdlet_args
 import pipeline as pipeline_context


-def _set_live_step(text: str) -> None:
-    """Best-effort update to the pipeline Live progress title (if enabled)."""
+def _live_ui_and_pipe_index() -> tuple[Optional[Any], int]:
+    """Return ``(live_ui, pipe_index)``; each part independently falls back to ``None`` / ``0``."""
    try:
        ui = pipeline_context.get_live_progress() if hasattr(pipeline_context, "get_live_progress") else None
    except Exception:
        ui = None
+
+    # Stay on pipe 0 unless the stage context exposes an integer ``pipe_index``.
+    pipe_idx = 0
+    try:
+        get_ctx = getattr(pipeline_context, "get_stage_context", None)
+        candidate = getattr(get_ctx() if callable(get_ctx) else None, "pipe_index", None)
+        if isinstance(candidate, int):
+            pipe_idx = int(candidate)
+    except Exception:
+        pipe_idx = 0
+    return ui, pipe_idx
+
+
+def _begin_live_steps(total_steps: int) -> None:
+    """Declare the total number of steps for this cmdlet run (per-pipe); no-op when no live UI is active."""
+    ui, pipe_idx = _live_ui_and_pipe_index()
    if ui is None:
        return
    try:
-        setter = getattr(ui, "set_active_subtask_text", None)
-        if callable(setter):
-            setter(str(text or "").strip())
+        begin = getattr(ui, "begin_pipe_steps", None)
+        if callable(begin):
+            begin(int(pipe_idx), total_steps=int(total_steps))
    except Exception:
-        pass
+        return  # best-effort: errors raised by the progress UI are swallowed
+
+
+def _step(text: str) -> None:
+    """Start a *new* step labelled ``text`` on the current pipe.
+
+    Every call bumps the step counter and percent, so call it once per real stage of work.
+    """
+    live_ui, index = _live_ui_and_pipe_index()
+    try:
+        # Without a live UI (or without ``advance_pipe_step``) there is nothing to call.
+        advance = getattr(live_ui, "advance_pipe_step", None) if live_ui is not None else None
+        if callable(advance):
+            advance(int(index), str(text))
+    except Exception:
+        # Best-effort: a failing progress display is ignored.
+        pass

 # ============================================================================
 # CMDLET Metadata Declaration
@@ -186,28 +218,6 @@ def _format_suffix(fmt: str) -> str:
        return ".jpg"
    return f".{fmt}"

-
-def _convert_to_webp(source_path: Path, dest_path: Path) -> None:
-    """Convert an image file to WebP using Pillow."""
-    from PIL import Image
-
-    with Image.open(source_path) as img:
-        # Keep a sensible default: good quality + small size.
-        img.save(dest_path, format="WEBP", quality=100, method=6)
-
-
-def _selectors_for_url(url: str) -> List[str]:
-    """Return a list of likely content selectors for known platforms."""
-    u = url.lower()
-    sels: List[str] = []
-    
-    for domain, selectors in SITE_SELECTORS.items():
-        if domain in u:
-            sels.extend(selectors)
-            
-    return sels or ["article"]
-
-
 def _matched_site_selectors(url: str) -> List[str]:
    """Return SITE_SELECTORS for a matched domain; empty if no match.

@@ -223,46 +233,47 @@ def _matched_site_selectors(url: str) -> List[str]:

 def _platform_preprocess(url: str, page: Any, warnings: List[str], timeout_ms: int = 10_000) -> None:
    """Best-effort page tweaks for popular platforms before capture."""
-    u = url.lower()
+    try:
+        u = str(url or "").lower()  # tolerate ``None`` / non-string URLs

-    def _try_click_texts(texts: List[str], passes: int = 2, per_timeout: int = 700) -> int:
-        clicks = 0
-        for _ in range(max(1, passes)):
-            for t in texts:
-                try:
-                    page.locator(f"text=/{t}/i").first.click(timeout=per_timeout)
-                    clicks += 1
-                except PlaywrightTimeoutError:
-                    pass
-                except Exception:
-                    pass
-            time.sleep(0.1)
-        return clicks
+        def _try_click_buttons(names: List[str], passes: int = 2, per_timeout: int = 700) -> int:
+            """Click the first button matching each name, ``passes`` times over; return the click count."""
+            import re  # function-scope so no module-level ``import re`` is assumed
+            # Whole-word match: plain-string names match substrings ("OK" in "Cookies", "Agree" in "Disagree").
+            patterns = [re.compile(rf"\b{re.escape(str(n))}\b", re.IGNORECASE) for n in names]
+            clicks = 0
+            for pattern in patterns * max(1, int(passes)):
+                try:
+                    page.get_by_role("button", name=pattern).first.click(timeout=int(per_timeout))
+                    clicks += 1
+                except Exception:
+                    pass  # button missing or not clickable within the timeout
+            return clicks

-    # Dismiss common cookie/consent prompts
-    _try_click_texts(["accept", "i agree", "agree", "got it", "allow all", "consent"])
+        # Dismiss common cookie / consent prompts.
+        _try_click_buttons([
+            "Accept all", "Accept",
+            "I agree", "Agree",
+            "Allow all",
+            "OK",
+        ])

-    # Platform-specific expansions
-    if "reddit.com" in u:
-        _try_click_texts(["see more", "read more", "show more", "more"])
-    if ("twitter.com" in u) or ("x.com" in u):
-        _try_click_texts(["show more", "more"])
-    if "instagram.com" in u:
-        _try_click_texts(["more", "see more"])
-    if "tiktok.com" in u:
-        _try_click_texts(["more", "see more"])
-    if ("facebook.com" in u) or ("fb.watch" in u):
-        _try_click_texts(["see more", "show more", "more"])
-    if "rumble.com" in u:
-        _try_click_texts(["accept", "agree", "close"])
+        # Some sites need small nudges (best-effort).
+        if "reddit.com" in u:
+            _try_click_buttons(["Accept all", "Accept"])
+        if ("twitter.com" in u) or ("x.com" in u):
+            _try_click_buttons(["Accept all", "Accept"])
+        if "instagram.com" in u:
+            _try_click_buttons(["Allow all", "Accept all", "Accept"])
+    except Exception as exc:
+        # Never let consent handling abort the capture; just record why it was skipped.
+        debug(f"[_platform_preprocess] skipped: {exc}")


 def _submit_wayback(url: str, timeout: float) -> Optional[str]:
-    """Submit URL to Internet Archive Wayback Machine."""
    encoded = quote(url, safe="/:?=&")
-    with HTTPClient() as client:
+    with HTTPClient(headers={"User-Agent": USER_AGENT}) as client:
        response = client.get(f"https://web.archive.org/save/{encoded}")
-        response.raise_for_status()
        content_location = response.headers.get("Content-Location")
        if content_location:
            return urljoin("https://web.archive.org", content_location)
@@ -359,10 +370,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
    """Capture screenshot using Playwright."""
    debug(f"[_capture] Starting capture for {options.url} -> {destination}")
    try:
-        # Two-phase Live progress:
-        #   1) load + stabilize (ends right after the wait_after_load sleep)
-        #   2) capture + save (and any post-processing)
-        _set_live_step("screen-shot: loading")
+        _step("loading launching browser")
        tool = options.playwright_tool or PlaywrightTool({})

        # Ensure Chromium engine is used for the screen-shot cmdlet (force for consistency)
@@ -397,13 +405,16 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])

        try:
            with tool.open_page(headless=headless) as page:
+                _step("loading navigating")
                debug(f"Navigating to {options.url}...")
                try:
                    tool.goto(page, options.url)
                    debug("Page loaded successfully")
+                    _step("loading page loaded")
                except PlaywrightTimeoutError:
                    warnings.append("navigation timeout; capturing current page state")
                    debug("Navigation timeout; proceeding with current state")
+                    _step("loading navigation timeout")
                
                # Skip article lookup by default (wait_for_article defaults to False)
                if options.wait_for_article:
@@ -419,8 +430,9 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
                    debug(f"Waiting {options.wait_after_load}s for page stabilization...")
                    time.sleep(min(10.0, max(0.0, options.wait_after_load)))

-                # Phase 2 begins here (per request).
-                _set_live_step("screen-shot: capturing")
+                _step("loading stabilized")
+
+                _step("capturing preparing")
                if options.replace_video_posters:
                    debug("Replacing video elements with posters...")
                    page.evaluate(
@@ -441,6 +453,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
                if options.prefer_platform_target and format_name != "pdf":
                    debug(f"[_capture] Target capture enabled")
                    debug("Attempting platform-specific content capture...")
+                    _step("capturing locating target")
                    try:
                        _platform_preprocess(options.url, page, warnings)
                    except Exception as e:
@@ -465,6 +478,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
                                    el.scroll_into_view_if_needed(timeout=1000)
                                except Exception:
                                    pass
+                                _step("capturing output")
                                debug(f"Capturing element to {destination}...")
                                el.screenshot(path=str(destination), type=("jpeg" if format_name == "jpeg" else None))
                                element_captured = True
@@ -475,12 +489,14 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
                            debug(f"Failed to capture element: {exc}")
                # Fallback to default capture paths
                if element_captured:
-                    pass
+                    _step("capturing saved")
                elif format_name == "pdf":
                    debug("Generating PDF...")
                    page.emulate_media(media="print")
+                    _step("capturing output")
                    page.pdf(path=str(destination), print_background=True)
                    debug(f"PDF saved to {destination}")
+                    _step("capturing saved")
                else:
                    debug(f"Capturing full page to {destination}...")
                    screenshot_kwargs: Dict[str, Any] = {"path": str(destination)}
@@ -488,16 +504,20 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
                        screenshot_kwargs["type"] = "jpeg"
                        screenshot_kwargs["quality"] = 90
                    if options.full_page:
+                        _step("capturing output")
                        page.screenshot(full_page=True, **screenshot_kwargs)
                    else:
                        article = page.query_selector("article")
                        if article is not None:
                            article_kwargs = dict(screenshot_kwargs)
                            article_kwargs.pop("full_page", None)
+                            _step("capturing output")
                            article.screenshot(**article_kwargs)
                        else:
+                            _step("capturing output")
                            page.screenshot(**screenshot_kwargs)
                    debug(f"Screenshot saved to {destination}")
+                    _step("capturing saved")
        except Exception as exc:
            debug(f"[_capture] Exception launching browser/page: {exc}")
            msg = str(exc).lower()
@@ -519,6 +539,13 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
    destination = _prepare_output_path(options)
    warnings: List[str] = []

+    will_target = bool(options.prefer_platform_target) and requested_format != "pdf"
+    will_convert = requested_format == "webp"
+    will_archive = bool(options.archive and options.url)
+    total_steps = 9 + (1 if will_target else 0) + (1 if will_convert else 0) + (1 if will_archive else 0)
+    _begin_live_steps(total_steps)
+    _step("loading starting")
+
    # Playwright screenshots do not natively support WebP output.
    # Capture as PNG, then convert via Pillow.
    capture_path = destination
@@ -529,6 +556,7 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
    _capture(options, capture_path, warnings)

    if requested_format == "webp":
+        _step("capturing converting to webp")
        debug(f"[_capture_screenshot] Converting png -> webp: {destination}")
        try:
            _convert_to_webp(capture_path, destination)
@@ -544,7 +572,7 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
    url: List[str] = [options.url] if options.url else []
    archive_url: List[str] = []
    if options.archive and options.url:
-        _set_live_step("screen-shot: archiving")
+        _step("capturing archiving")
        debug(f"[_capture_screenshot] Archiving enabled for {options.url}")
        archives, archive_warnings = _archive_url(options.url, options.archive_timeout)
        archive_url.extend(archives)
@@ -552,6 +580,8 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
        if archives:
            url = unique_preserve_order([*url, *archives])

+    _step("capturing finalized")
+
    applied_tag = unique_preserve_order(list(tag for tag in options.tag if tag.strip()))

    return ScreenshotResult(
@@ -768,7 +798,6 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
            continue
        
        try:
-            _set_live_step("screen-shot: starting")
            # Create screenshot with provided options
            # Force the Playwright engine to Chromium for the screen-shot cmdlet
            # (this ensures consistent rendering and supports PDF output requirements).