df
Some checks failed
smoke-mm / Install & smoke test mm --help (push) Has been cancelled

This commit is contained in:
2025-12-29 17:05:03 -08:00
parent 226de9316a
commit c019c00aed
104 changed files with 19669 additions and 12954 deletions

View File

@@ -37,7 +37,6 @@ import pipeline as pipeline_context
# ============================================================================
# ============================================================================
# Playwright & Screenshot Dependencies
# ============================================================================
@@ -104,7 +103,6 @@ SITE_SELECTORS: Dict[str, List[str]] = {
}
class ScreenshotError(RuntimeError):
"""Raised when screenshot capture or upload fails."""
@@ -146,6 +144,7 @@ class ScreenshotResult:
# Helper Functions
# ============================================================================
def _slugify_url(url: str) -> str:
"""Convert URL to filesystem-safe slug."""
parsed = urlsplit(url)
@@ -172,7 +171,11 @@ def _tags_from_url(url: str) -> List[str]:
parsed = None
try:
parsed = urlsplit(u)
host = str(getattr(parsed, "hostname", None) or getattr(parsed, "netloc", "") or "").strip().lower()
host = (
str(getattr(parsed, "hostname", None) or getattr(parsed, "netloc", "") or "")
.strip()
.lower()
)
except Exception:
parsed = None
host = ""
@@ -300,7 +303,12 @@ def _convert_to_webp(
except Exception:
w, h = 0, 0
if downscale_if_oversize and isinstance(max_dim, int) and max_dim > 0 and (w > max_dim or h > max_dim):
if (
downscale_if_oversize
and isinstance(max_dim, int)
and max_dim > 0
and (w > max_dim or h > max_dim)
):
scale = 1.0
try:
scale = min(float(max_dim) / float(w), float(max_dim) / float(h))
@@ -320,7 +328,9 @@ def _convert_to_webp(
im = im.resize((new_w, new_h), resample=resample)
did_downscale = True
except Exception as exc:
debug(f"[_convert_to_webp] Downscale failed; attempting direct WEBP save anyway: {exc}")
debug(
f"[_convert_to_webp] Downscale failed; attempting direct WEBP save anyway: {exc}"
)
im.save(tmp_path, **save_kwargs)
@@ -332,6 +342,7 @@ def _convert_to_webp(
except Exception:
pass
def _matched_site_selectors(url: str) -> List[str]:
"""Return SITE_SELECTORS for a matched domain; empty if no match.
@@ -355,7 +366,9 @@ def _selectors_for_url(url: str) -> List[str]:
return _matched_site_selectors(url)
def _platform_preprocess(url: str, page: Any, warnings: List[str], timeout_ms: int = 10_000) -> None:
def _platform_preprocess(
url: str, page: Any, warnings: List[str], timeout_ms: int = 10_000
) -> None:
"""Best-effort page tweaks for popular platforms before capture."""
try:
u = str(url or "").lower()
@@ -373,14 +386,16 @@ def _platform_preprocess(url: str, page: Any, warnings: List[str], timeout_ms: i
return clicks
# Dismiss common cookie / consent prompts.
_try_click_buttons([
"Accept all",
"Accept",
"I agree",
"Agree",
"Allow all",
"OK",
])
_try_click_buttons(
[
"Accept all",
"Accept",
"I agree",
"Agree",
"Allow all",
"OK",
]
)
# Some sites need small nudges (best-effort).
if "reddit.com" in u:
@@ -490,7 +505,9 @@ def _prepare_output_path(options: ScreenshotOptions) -> Path:
return unique_path(path)
def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str], progress: PipelineProgress) -> None:
def _capture(
options: ScreenshotOptions, destination: Path, warnings: List[str], progress: PipelineProgress
) -> None:
"""Capture screenshot using Playwright."""
debug(f"[_capture] Starting capture for {options.url} -> {destination}")
try:
@@ -499,16 +516,24 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str],
# Ensure Chromium engine is used for the screen-shot cmdlet (force for consistency)
try:
current_browser = getattr(tool.defaults, "browser", "").lower() if getattr(tool, "defaults", None) is not None else ""
current_browser = (
getattr(tool.defaults, "browser", "").lower()
if getattr(tool, "defaults", None) is not None
else ""
)
if current_browser != "chromium":
debug(f"[_capture] Overriding Playwright browser '{current_browser}' -> 'chromium' for screen-shot cmdlet")
debug(
f"[_capture] Overriding Playwright browser '{current_browser}' -> 'chromium' for screen-shot cmdlet"
)
base_cfg = {}
try:
base_cfg = dict(getattr(tool, "_config", {}) or {})
except Exception:
base_cfg = {}
tool_block = dict(base_cfg.get("tool") or {}) if isinstance(base_cfg, dict) else {}
pw_block = dict(tool_block.get("playwright") or {}) if isinstance(tool_block, dict) else {}
pw_block = (
dict(tool_block.get("playwright") or {}) if isinstance(tool_block, dict) else {}
)
pw_block["browser"] = "chromium"
tool_block["playwright"] = pw_block
if isinstance(base_cfg, dict):
@@ -523,7 +548,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str],
format_name = _normalise_format(options.output_format)
headless = options.headless or format_name == "pdf"
debug(f"[_capture] Format: {format_name}, Headless: {headless}")
if format_name == "pdf" and not options.headless:
warnings.append("pdf output requires headless Chromium; overriding headless mode")
@@ -539,7 +564,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str],
warnings.append("navigation timeout; capturing current page state")
debug("Navigation timeout; proceeding with current state")
progress.step("loading navigation timeout")
# Skip article lookup by default (wait_for_article defaults to False)
if options.wait_for_article:
try:
@@ -549,7 +574,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str],
except PlaywrightTimeoutError:
warnings.append("<article> selector not found; capturing fallback")
debug("Article element not found; using fallback")
if options.wait_after_load > 0:
debug(f"Waiting {options.wait_after_load}s for page stabilization...")
time.sleep(min(10.0, max(0.0, options.wait_after_load)))
@@ -591,7 +616,9 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str],
for sel in selectors:
try:
debug(f"Trying selector: {sel}")
el = page.wait_for_selector(sel, timeout=max(0, int(options.selector_timeout_ms)))
el = page.wait_for_selector(
sel, timeout=max(0, int(options.selector_timeout_ms))
)
except PlaywrightTimeoutError:
debug(f"Selector not found: {sel}")
continue
@@ -604,7 +631,10 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str],
pass
progress.step("capturing output")
debug(f"Capturing element to {destination}...")
el.screenshot(path=str(destination), type=("jpeg" if format_name == "jpeg" else None))
el.screenshot(
path=str(destination),
type=("jpeg" if format_name == "jpeg" else None),
)
element_captured = True
debug("Element captured successfully")
break
@@ -645,8 +675,13 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str],
except Exception as exc:
debug(f"[_capture] Exception launching browser/page: {exc}")
msg = str(exc).lower()
if any(k in msg for k in ["executable", "not found", "no such file", "cannot find", "install"]):
raise ScreenshotError("Chromium Playwright browser binaries not found. Install them: python ./scripts/bootstrap.py --playwright-only --browsers chromium") from exc
if any(
k in msg
for k in ["executable", "not found", "no such file", "cannot find", "install"]
):
raise ScreenshotError(
"Chromium Playwright browser binaries not found. Install them: python ./scripts/bootstrap.py --playwright-only --browsers chromium"
) from exc
raise
except ScreenshotError:
# Re-raise ScreenshotError raised intentionally (do not wrap)
@@ -666,7 +701,9 @@ def _capture_screenshot(options: ScreenshotOptions, progress: PipelineProgress)
will_target = bool(options.prefer_platform_target) and requested_format != "pdf"
will_convert = requested_format == "webp"
will_archive = bool(options.archive and options.url)
total_steps = 9 + (1 if will_target else 0) + (1 if will_convert else 0) + (1 if will_archive else 0)
total_steps = (
9 + (1 if will_target else 0) + (1 if will_convert else 0) + (1 if will_archive else 0)
)
progress.begin_steps(total_steps)
progress.step("loading starting")
@@ -726,19 +763,20 @@ def _capture_screenshot(options: ScreenshotOptions, progress: PipelineProgress)
# Main Cmdlet Function
# ============================================================================
def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"""Take screenshots of url in the pipeline.
Accepts:
- Single result object (dict or PipeObject) with a 'path', 'url', or 'target' field
- List of result objects to screenshot each
- Direct URL as string
Emits PipeObject-formatted results for each screenshot with:
- action: 'cmdlet:screen-shot'
- is_temp: True (screenshots are temporary artifacts)
- parent_id: hash of the original file/URL
Screenshots are created using Playwright and marked as temporary
so they can be cleaned up later with the cleanup cmdlet.
"""
@@ -761,9 +799,9 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
# ========================================================================
# ARGUMENT PARSING
# ========================================================================
parsed = parse_cmdlet_args(args, CMDLET)
format_value = parsed.get("format")
if not format_value:
# Default format can be set via config.conf tool block:
@@ -782,7 +820,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
selector_arg = parsed.get("selector")
selectors = [selector_arg] if selector_arg else []
archive_enabled = parsed.get("archive", False)
# Positional URL argument (if provided)
url_arg = parsed.get("url")
positional_url = [str(url_arg)] if url_arg else []
@@ -801,15 +839,11 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
# Extract URLs from piped results (first non-empty of 'path', 'url', 'target')
if piped_results:
for item in piped_results:
url = (
get_field(item, 'path')
or get_field(item, 'url')
or get_field(item, 'target')
)
url = get_field(item, "path") or get_field(item, "url") or get_field(item, "target")
if url:
url_to_process.append((str(url), item))
if not url_to_process:
log(f"No url to process for screen-shot cmdlet", file=sys.stderr)
return 1
@@ -819,9 +853,9 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
# ========================================================================
# OUTPUT DIRECTORY RESOLUTION - Priority chain
# ========================================================================
screenshot_dir: Optional[Path] = None
# Primary: Use --storage if provided (highest priority)
if storage_value:
try:
@@ -830,7 +864,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
except ValueError as e:
log(str(e), file=sys.stderr)
return 1
# Secondary: Use config-based resolver ONLY if --storage not provided
if screenshot_dir is None and resolve_output_dir is not None:
try:
@@ -838,7 +872,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
debug(f"[screen_shot] Using config resolver: {screenshot_dir}")
except Exception:
pass
# Tertiary: Use config outfile ONLY if neither --storage nor resolver worked
if screenshot_dir is None and config and config.get("outfile"):
try:
@@ -846,12 +880,12 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
debug(f"[screen_shot] Using config outfile: {screenshot_dir}")
except Exception:
pass
# Default: User's Videos directory
if screenshot_dir is None:
screenshot_dir = Path.home() / "Videos"
debug(f"[screen_shot] Using default directory: {screenshot_dir}")
ensure_directory(screenshot_dir)
# If the caller isn't running the shared pipeline Live progress UI (e.g. direct
@@ -869,21 +903,21 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
# ========================================================================
# PREPARE SCREENSHOT OPTIONS
# ========================================================================
format_name = _normalise_format(format_value)
filtered_selectors = [str(s).strip() for s in selectors if str(s).strip()]
manual_target_selectors = filtered_selectors if filtered_selectors else None
all_emitted = []
exit_code = 0
# ========================================================================
# PROCESS url AND CAPTURE SCREENSHOTS
# ========================================================================
def _extract_item_tags(item: Any) -> List[str]:
if item is None:
return []
raw = get_field(item, 'tag')
raw = get_field(item, "tag")
if isinstance(raw, list):
return [str(t) for t in raw if t is not None and str(t).strip()]
if isinstance(raw, str) and raw.strip():
@@ -913,7 +947,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
if not url.lower().startswith(("http://", "https://", "file://")):
log(f"[screen_shot] Skipping non-URL input: {url}", file=sys.stderr)
continue
try:
# Create screenshot with provided options
# Force the Playwright engine to Chromium for the screen-shot cmdlet
@@ -966,28 +1000,32 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
options.prefer_platform_target = True
options.target_selectors = auto_selectors
debug(f"[screen_shot] Auto selectors matched for url: {auto_selectors}")
screenshot_result = _capture_screenshot(options, progress)
# Log results and warnings
debug(f"Screenshot captured to {screenshot_result.path}")
if screenshot_result.archive_url:
debug(f"Archives: {', '.join(screenshot_result.archive_url)}")
for warning in screenshot_result.warnings:
debug(f"Warning: {warning}")
# Compute hash of screenshot file
screenshot_hash = None
try:
with open(screenshot_result.path, 'rb') as f:
with open(screenshot_result.path, "rb") as f:
screenshot_hash = hashlib.sha256(f.read()).hexdigest()
except Exception:
pass
# Create the PipeObject result, marked as temporary because it is a derivative artifact
capture_date = ""
try:
capture_date = datetime.fromtimestamp(screenshot_result.path.stat().st_mtime).date().isoformat()
capture_date = (
datetime.fromtimestamp(screenshot_result.path.stat().st_mtime)
.date()
.isoformat()
)
except Exception:
capture_date = datetime.now().date().isoformat()
@@ -997,7 +1035,8 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
upstream_tags = _extract_item_tags(origin_item)
filtered_upstream_tags = [
t for t in upstream_tags
t
for t in upstream_tags
if not str(t).strip().lower().startswith(("type:", "date:"))
]
@@ -1007,40 +1046,41 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
)
pipe_obj = create_pipe_object_result(
source='screenshot',
store='PATH',
source="screenshot",
store="PATH",
identifier=Path(screenshot_result.path).stem,
file_path=str(screenshot_result.path),
cmdlet_name='screen-shot',
cmdlet_name="screen-shot",
title=display_title,
hash_value=screenshot_hash,
is_temp=True,
parent_hash=hashlib.sha256(url.encode()).hexdigest(),
tag=merged_tags,
extra={
'source_url': url,
'archive_url': screenshot_result.archive_url,
'url': screenshot_result.url,
'target': str(screenshot_result.path), # Explicit target for add-file
}
"source_url": url,
"archive_url": screenshot_result.archive_url,
"url": screenshot_result.url,
"target": str(screenshot_result.path), # Explicit target for add-file
},
)
# Emit the result so downstream cmdlet (like add-file) can use it
pipeline_context.emit(pipe_obj)
all_emitted.append(pipe_obj)
# If we created a local progress UI, advance it per completed item.
progress.on_emit(pipe_obj)
except ScreenshotError as exc:
log(f"Error taking screenshot of {url}: {exc}", file=sys.stderr)
exit_code = 1
except Exception as exc:
log(f"Unexpected error taking screenshot of {url}: {exc}", file=sys.stderr)
import traceback
traceback.print_exc(file=sys.stderr)
exit_code = 1
progress.close_local_ui(force_complete=True)
if not all_emitted:
@@ -1051,6 +1091,8 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
log(f"✓ Successfully captured {len(all_emitted)} screenshot(s)")
return exit_code
CMDLET = Cmdlet(
name="screen-shot",
summary="Capture a website screenshot",
@@ -1058,16 +1100,17 @@ CMDLET = Cmdlet(
alias=["screenshot", "ss"],
arg=[
SharedArgs.URL,
CmdletArg(name="format", type="string", description="Output format: webp, png, jpeg, or pdf"),
CmdletArg(
name="format", type="string", description="Output format: webp, png, jpeg, or pdf"
),
CmdletArg(name="selector", type="string", description="CSS selector for element capture"),
SharedArgs.PATH
SharedArgs.PATH,
],
detail=[
"Uses Playwright Chromium engine only. Install Chromium with: python ./scripts/bootstrap.py --playwright-only --browsers chromium",
"PDF output requires headless Chromium (the cmdlet will enforce headless mode for PDF).",
"Screenshots are temporary artifacts stored in the configured `temp` directory.",
]
],
)
CMDLET.exec = _run