sdfsdf

2025-12-21 16:59:37 -08:00
parent 11a13edb84
commit d0b821b5dd
7 changed files with 508 additions and 136 deletions
--- a/cmdlet/screen_shot.py
+++ b/cmdlet/screen_shot.py
@@ -32,20 +32,52 @@ parse_cmdlet_args = sh.parse_cmdlet_args
 import pipeline as pipeline_context


-def _set_live_step(text: str) -> None:
-    """Best-effort update to the pipeline Live progress title (if enabled)."""
+def _live_ui_and_pipe_index() -> tuple[Optional[Any], int]:
+    ui = None
    try:
        ui = pipeline_context.get_live_progress() if hasattr(pipeline_context, "get_live_progress") else None
    except Exception:
        ui = None
+
+    pipe_idx: int = 0
+    try:
+        stage_ctx = pipeline_context.get_stage_context() if hasattr(pipeline_context, "get_stage_context") else None
+        maybe_idx = getattr(stage_ctx, "pipe_index", None) if stage_ctx is not None else None
+        if isinstance(maybe_idx, int):
+            pipe_idx = int(maybe_idx)
+    except Exception:
+        pipe_idx = 0
+
+    return ui, pipe_idx
+
+
+def _begin_live_steps(total_steps: int) -> None:
+    """Declare the total number of steps for this cmdlet run (per-pipe)."""
+    ui, pipe_idx = _live_ui_and_pipe_index()
    if ui is None:
        return
    try:
-        setter = getattr(ui, "set_active_subtask_text", None)
-        if callable(setter):
-            setter(str(text or "").strip())
+        begin = getattr(ui, "begin_pipe_steps", None)
+        if callable(begin):
+            begin(int(pipe_idx), total_steps=int(total_steps))
    except Exception:
-        pass
+        return
+
+
+def _step(text: str) -> None:
+    """Emit a *new* step.
+
+    Each call increments the step counter and advances percent automatically.
+    """
+    ui, pipe_idx = _live_ui_and_pipe_index()
+    if ui is None:
+        return
+    try:
+        adv = getattr(ui, "advance_pipe_step", None)
+        if callable(adv):
+            adv(int(pipe_idx), str(text))
+    except Exception:
+        return

 # ============================================================================
 # CMDLET Metadata Declaration
@@ -186,28 +218,6 @@ def _format_suffix(fmt: str) -> str:
        return ".jpg"
    return f".{fmt}"

-
-def _convert_to_webp(source_path: Path, dest_path: Path) -> None:
-    """Convert an image file to WebP using Pillow."""
-    from PIL import Image
-
-    with Image.open(source_path) as img:
-        # Keep a sensible default: good quality + small size.
-        img.save(dest_path, format="WEBP", quality=100, method=6)
-
-
-def _selectors_for_url(url: str) -> List[str]:
-    """Return a list of likely content selectors for known platforms."""
-    u = url.lower()
-    sels: List[str] = []
-    
-    for domain, selectors in SITE_SELECTORS.items():
-        if domain in u:
-            sels.extend(selectors)
-            
-    return sels or ["article"]
-
-
 def _matched_site_selectors(url: str) -> List[str]:
    """Return SITE_SELECTORS for a matched domain; empty if no match.

@@ -223,46 +233,47 @@ def _matched_site_selectors(url: str) -> List[str]:

 def _platform_preprocess(url: str, page: Any, warnings: List[str], timeout_ms: int = 10_000) -> None:
    """Best-effort page tweaks for popular platforms before capture."""
-    u = url.lower()
+    try:
+        u = str(url or "").lower()

-    def _try_click_texts(texts: List[str], passes: int = 2, per_timeout: int = 700) -> int:
-        clicks = 0
-        for _ in range(max(1, passes)):
-            for t in texts:
-                try:
-                    page.locator(f"text=/{t}/i").first.click(timeout=per_timeout)
-                    clicks += 1
-                except PlaywrightTimeoutError:
-                    pass
-                except Exception:
-                    pass
-            time.sleep(0.1)
-        return clicks
+        def _try_click_buttons(names: List[str], passes: int = 2, per_timeout: int = 700) -> int:
+            clicks = 0
+            for _ in range(max(1, int(passes))):
+                for name in names:
+                    try:
+                        locator = page.get_by_role("button", name=name)
+                        locator.first.click(timeout=int(per_timeout))
+                        clicks += 1
+                    except Exception:
+                        pass
+            return clicks

-    # Dismiss common cookie/consent prompts
-    _try_click_texts(["accept", "i agree", "agree", "got it", "allow all", "consent"])
+        # Dismiss common cookie / consent prompts.
+        _try_click_buttons([
+            "Accept all",
+            "Accept",
+            "I agree",
+            "Agree",
+            "Allow all",
+            "OK",
+        ])

-    # Platform-specific expansions
-    if "reddit.com" in u:
-        _try_click_texts(["see more", "read more", "show more", "more"])
-    if ("twitter.com" in u) or ("x.com" in u):
-        _try_click_texts(["show more", "more"])
-    if "instagram.com" in u:
-        _try_click_texts(["more", "see more"])
-    if "tiktok.com" in u:
-        _try_click_texts(["more", "see more"])
-    if ("facebook.com" in u) or ("fb.watch" in u):
-        _try_click_texts(["see more", "show more", "more"])
-    if "rumble.com" in u:
-        _try_click_texts(["accept", "agree", "close"])
+        # Some sites need small nudges (best-effort).
+        if "reddit.com" in u:
+            _try_click_buttons(["Accept all", "Accept"])
+        if ("twitter.com" in u) or ("x.com" in u):
+            _try_click_buttons(["Accept all", "Accept"])
+        if "instagram.com" in u:
+            _try_click_buttons(["Allow all", "Accept all", "Accept"])
+    except Exception as exc:
+        debug(f"[_platform_preprocess] skipped: {exc}")
+        return


 def _submit_wayback(url: str, timeout: float) -> Optional[str]:
-    """Submit URL to Internet Archive Wayback Machine."""
    encoded = quote(url, safe="/:?=&")
-    with HTTPClient() as client:
+    with HTTPClient(headers={"User-Agent": USER_AGENT}) as client:
        response = client.get(f"https://web.archive.org/save/{encoded}")
-        response.raise_for_status()
        content_location = response.headers.get("Content-Location")
        if content_location:
            return urljoin("https://web.archive.org", content_location)
@@ -359,10 +370,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
    """Capture screenshot using Playwright."""
    debug(f"[_capture] Starting capture for {options.url} -> {destination}")
    try:
-        # Two-phase Live progress:
-        #   1) load + stabilize (ends right after the wait_after_load sleep)
-        #   2) capture + save (and any post-processing)
-        _set_live_step("screen-shot: loading")
+        _step("loading launching browser")
        tool = options.playwright_tool or PlaywrightTool({})

        # Ensure Chromium engine is used for the screen-shot cmdlet (force for consistency)
@@ -397,13 +405,16 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])

        try:
            with tool.open_page(headless=headless) as page:
+                _step("loading navigating")
                debug(f"Navigating to {options.url}...")
                try:
                    tool.goto(page, options.url)
                    debug("Page loaded successfully")
+                    _step("loading page loaded")
                except PlaywrightTimeoutError:
                    warnings.append("navigation timeout; capturing current page state")
                    debug("Navigation timeout; proceeding with current state")
+                    _step("loading navigation timeout")
                
                # Skip article lookup by default (wait_for_article defaults to False)
                if options.wait_for_article:
@@ -419,8 +430,9 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
                    debug(f"Waiting {options.wait_after_load}s for page stabilization...")
                    time.sleep(min(10.0, max(0.0, options.wait_after_load)))

-                # Phase 2 begins here (per request).
-                _set_live_step("screen-shot: capturing")
+                _step("loading stabilized")
+
+                _step("capturing preparing")
                if options.replace_video_posters:
                    debug("Replacing video elements with posters...")
                    page.evaluate(
@@ -441,6 +453,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
                if options.prefer_platform_target and format_name != "pdf":
                    debug(f"[_capture] Target capture enabled")
                    debug("Attempting platform-specific content capture...")
+                    _step("capturing locating target")
                    try:
                        _platform_preprocess(options.url, page, warnings)
                    except Exception as e:
@@ -465,6 +478,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
                                    el.scroll_into_view_if_needed(timeout=1000)
                                except Exception:
                                    pass
+                                _step("capturing output")
                                debug(f"Capturing element to {destination}...")
                                el.screenshot(path=str(destination), type=("jpeg" if format_name == "jpeg" else None))
                                element_captured = True
@@ -475,12 +489,14 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
                            debug(f"Failed to capture element: {exc}")
                # Fallback to default capture paths
                if element_captured:
-                    pass
+                    _step("capturing saved")
                elif format_name == "pdf":
                    debug("Generating PDF...")
                    page.emulate_media(media="print")
+                    _step("capturing output")
                    page.pdf(path=str(destination), print_background=True)
                    debug(f"PDF saved to {destination}")
+                    _step("capturing saved")
                else:
                    debug(f"Capturing full page to {destination}...")
                    screenshot_kwargs: Dict[str, Any] = {"path": str(destination)}
@@ -488,16 +504,20 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
                        screenshot_kwargs["type"] = "jpeg"
                        screenshot_kwargs["quality"] = 90
                    if options.full_page:
+                        _step("capturing output")
                        page.screenshot(full_page=True, **screenshot_kwargs)
                    else:
                        article = page.query_selector("article")
                        if article is not None:
                            article_kwargs = dict(screenshot_kwargs)
                            article_kwargs.pop("full_page", None)
+                            _step("capturing output")
                            article.screenshot(**article_kwargs)
                        else:
+                            _step("capturing output")
                            page.screenshot(**screenshot_kwargs)
                    debug(f"Screenshot saved to {destination}")
+                    _step("capturing saved")
        except Exception as exc:
            debug(f"[_capture] Exception launching browser/page: {exc}")
            msg = str(exc).lower()
@@ -519,6 +539,13 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
    destination = _prepare_output_path(options)
    warnings: List[str] = []

+    will_target = bool(options.prefer_platform_target) and requested_format != "pdf"
+    will_convert = requested_format == "webp"
+    will_archive = bool(options.archive and options.url)
+    total_steps = 9 + (1 if will_target else 0) + (1 if will_convert else 0) + (1 if will_archive else 0)
+    _begin_live_steps(total_steps)
+    _step("loading starting")
+
    # Playwright screenshots do not natively support WebP output.
    # Capture as PNG, then convert via Pillow.
    capture_path = destination
@@ -529,6 +556,7 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
    _capture(options, capture_path, warnings)

    if requested_format == "webp":
+        _step("capturing converting to webp")
        debug(f"[_capture_screenshot] Converting png -> webp: {destination}")
        try:
            _convert_to_webp(capture_path, destination)
@@ -544,7 +572,7 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
    url: List[str] = [options.url] if options.url else []
    archive_url: List[str] = []
    if options.archive and options.url:
-        _set_live_step("screen-shot: archiving")
+        _step("capturing archiving")
        debug(f"[_capture_screenshot] Archiving enabled for {options.url}")
        archives, archive_warnings = _archive_url(options.url, options.archive_timeout)
        archive_url.extend(archives)
@@ -552,6 +580,8 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
        if archives:
            url = unique_preserve_order([*url, *archives])

+    _step("capturing finalized")
+
    applied_tag = unique_preserve_order(list(tag for tag in options.tag if tag.strip()))

    return ScreenshotResult(
@@ -768,7 +798,6 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
            continue
        
        try:
-            _set_live_step("screen-shot: starting")
            # Create screenshot with provided options
            # Force the Playwright engine to Chromium for the screen-shot cmdlet
            # (this ensures consistent rendering and supports PDF output requirements).