Add YAPF style + ignore, and format tracked Python files

This commit is contained in:
2025-12-29 18:42:02 -08:00
parent c019c00aed
commit 507946a3e4
108 changed files with 11664 additions and 6494 deletions

View File

@@ -36,7 +36,6 @@ import pipeline as pipeline_context
# CMDLET Metadata Declaration
# ============================================================================
# ============================================================================
# Playwright & Screenshot Dependencies
# ============================================================================
@@ -44,13 +43,13 @@ import pipeline as pipeline_context
from tool.playwright import HAS_PLAYWRIGHT, PlaywrightTimeoutError, PlaywrightTool
try:
from config import resolve_output_dir
from SYS.config import resolve_output_dir
except ImportError:
try:
_parent_dir = str(Path(__file__).parent.parent)
if _parent_dir not in sys.path:
sys.path.insert(0, _parent_dir)
from config import resolve_output_dir
from SYS.config import resolve_output_dir
except ImportError:
resolve_output_dir = None
@@ -64,7 +63,11 @@ USER_AGENT = (
"Chrome/120.0.0.0 Safari/537.36"
)
DEFAULT_VIEWPORT: dict[str, int] = {"width": 1920, "height": 1080}
DEFAULT_VIEWPORT: dict[str,
int] = {
"width": 1920,
"height": 1080
}
ARCHIVE_TIMEOUT = 30.0
# WebP has a hard maximum dimension per side.
@@ -72,35 +75,36 @@ ARCHIVE_TIMEOUT = 30.0
WEBP_MAX_DIM = 16_383
# Configurable selectors for specific websites
SITE_SELECTORS: Dict[str, List[str]] = {
"twitter.com": [
"article[role='article']",
"div[data-testid='tweet']",
"div[data-testid='cellInnerDiv'] article",
],
"x.com": [
"article[role='article']",
"div[data-testid='tweet']",
"div[data-testid='cellInnerDiv'] article",
],
"instagram.com": [
"article[role='presentation']",
"article[role='article']",
"div[role='dialog'] article",
"section main article",
],
"reddit.com": [
"shreddit-post",
"div[data-testid='post-container']",
"div[data-click-id='background']",
"article",
],
"rumble.com": [
"rumble-player, iframe.rumble",
"div.video-item--main",
"main article",
],
}
SITE_SELECTORS: Dict[str,
List[str]] = {
"twitter.com": [
"article[role='article']",
"div[data-testid='tweet']",
"div[data-testid='cellInnerDiv'] article",
],
"x.com": [
"article[role='article']",
"div[data-testid='tweet']",
"div[data-testid='cellInnerDiv'] article",
],
"instagram.com": [
"article[role='presentation']",
"article[role='article']",
"div[role='dialog'] article",
"section main article",
],
"reddit.com": [
"shreddit-post",
"div[data-testid='post-container']",
"div[data-click-id='background']",
"article",
],
"rumble.com": [
"rumble-player, iframe.rumble",
"div.video-item--main",
"main article",
],
}
class ScreenshotError(RuntimeError):
@@ -172,9 +176,13 @@ def _tags_from_url(url: str) -> List[str]:
try:
parsed = urlsplit(u)
host = (
str(getattr(parsed, "hostname", None) or getattr(parsed, "netloc", "") or "")
.strip()
.lower()
str(
getattr(parsed,
"hostname",
None) or getattr(parsed,
"netloc",
"") or ""
).strip().lower()
)
except Exception:
parsed = None
@@ -187,7 +195,7 @@ def _tags_from_url(url: str) -> List[str]:
if ":" in host:
host = host.split(":", 1)[0]
if host.startswith("www."):
host = host[len("www.") :]
host = host[len("www."):]
path = ""
if parsed is not None:
@@ -230,7 +238,7 @@ def _title_from_url(url: str) -> str:
"""Return the normalized title derived from a URL's last path segment."""
for t in _tags_from_url(url):
if str(t).lower().startswith("title:"):
return str(t)[len("title:") :].strip()
return str(t)[len("title:"):].strip()
return ""
@@ -239,9 +247,12 @@ def _normalise_format(fmt: Optional[str]) -> str:
if not fmt:
return "webp"
value = fmt.strip().lower()
if value in {"jpg", "jpeg"}:
if value in {"jpg",
"jpeg"}:
return "jpeg"
if value in {"png", "pdf", "webp"}:
if value in {"png",
"pdf",
"webp"}:
return value
return "webp"
@@ -285,11 +296,12 @@ def _convert_to_webp(
try:
with Image.open(src_png) as im:
did_downscale = False
save_kwargs: Dict[str, Any] = {
"format": "WEBP",
"quality": int(quality),
"method": int(method),
}
save_kwargs: Dict[str,
Any] = {
"format": "WEBP",
"quality": int(quality),
"method": int(method),
}
# Preserve alpha when present; Pillow handles it for WEBP.
# Normalize palette images to RGBA to avoid odd palette artifacts.
@@ -303,12 +315,9 @@ def _convert_to_webp(
except Exception:
w, h = 0, 0
if (
downscale_if_oversize
and isinstance(max_dim, int)
and max_dim > 0
and (w > max_dim or h > max_dim)
):
if (downscale_if_oversize and isinstance(max_dim,
int) and max_dim > 0
and (w > max_dim or h > max_dim)):
scale = 1.0
try:
scale = min(float(max_dim) / float(w), float(max_dim) / float(h))
@@ -322,7 +331,13 @@ def _convert_to_webp(
f"[_convert_to_webp] Image exceeds WebP limit ({w}x{h}); downscaling -> {new_w}x{new_h}"
)
try:
resample = getattr(getattr(Image, "Resampling", Image), "LANCZOS", None)
resample = getattr(
getattr(Image,
"Resampling",
Image),
"LANCZOS",
None
)
if resample is None:
resample = getattr(Image, "LANCZOS", 1)
im = im.resize((new_w, new_h), resample=resample)
@@ -367,13 +382,20 @@ def _selectors_for_url(url: str) -> List[str]:
def _platform_preprocess(
url: str, page: Any, warnings: List[str], timeout_ms: int = 10_000
url: str,
page: Any,
warnings: List[str],
timeout_ms: int = 10_000
) -> None:
"""Best-effort page tweaks for popular platforms before capture."""
try:
u = str(url or "").lower()
def _try_click_buttons(names: List[str], passes: int = 2, per_timeout: int = 700) -> int:
def _try_click_buttons(
names: List[str],
passes: int = 2,
per_timeout: int = 700
) -> int:
clicks = 0
for _ in range(max(1, int(passes))):
for name in names:
@@ -411,7 +433,9 @@ def _platform_preprocess(
def _submit_wayback(url: str, timeout: float) -> Optional[str]:
encoded = quote(url, safe="/:?=&")
with HTTPClient(headers={"User-Agent": USER_AGENT}) as client:
with HTTPClient(headers={
"User-Agent": USER_AGENT
}) as client:
response = client.get(f"https://web.archive.org/save/{encoded}")
content_location = response.headers.get("Content-Location")
if content_location:
@@ -422,7 +446,9 @@ def _submit_wayback(url: str, timeout: float) -> Optional[str]:
def _submit_archive_today(url: str, timeout: float) -> Optional[str]:
"""Submit URL to Archive.today."""
encoded = quote(url, safe=":/?#[]@!$&'()*+,;=")
with HTTPClient(headers={"User-Agent": USER_AGENT}) as client:
with HTTPClient(headers={
"User-Agent": USER_AGENT
}) as client:
response = client.get(f"https://archive.today/submit/?url={encoded}")
response.raise_for_status()
final = str(response.url)
@@ -434,7 +460,9 @@ def _submit_archive_today(url: str, timeout: float) -> Optional[str]:
def _submit_archive_ph(url: str, timeout: float) -> Optional[str]:
"""Submit URL to Archive.ph."""
encoded = quote(url, safe=":/?#[]@!$&'()*+,;=")
with HTTPClient(headers={"User-Agent": USER_AGENT}) as client:
with HTTPClient(headers={
"User-Agent": USER_AGENT
}) as client:
response = client.get(f"https://archive.ph/submit/?url={encoded}")
response.raise_for_status()
final = str(response.url)
@@ -460,7 +488,9 @@ def _archive_url(url: str, timeout: float) -> Tuple[List[str], List[str]]:
warnings.append(f"archive {label} rate limited (HTTP 429)")
debug(f"{label}: Rate limited (HTTP 429)")
else:
warnings.append(f"archive {label} failed: HTTP {exc.response.status_code}")
warnings.append(
f"archive {label} failed: HTTP {exc.response.status_code}"
)
debug(f"{label}: HTTP {exc.response.status_code}")
except httpx.RequestError as exc:
warnings.append(f"archive {label} failed: {exc}")
@@ -480,7 +510,9 @@ def _archive_url(url: str, timeout: float) -> Tuple[List[str], List[str]]:
def _prepare_output_path(options: ScreenshotOptions) -> Path:
"""Prepare and validate output path for screenshot."""
ensure_directory(options.output_dir)
explicit_format = _normalise_format(options.output_format) if options.output_format else None
explicit_format = _normalise_format(
options.output_format
) if options.output_format else None
inferred_format: Optional[str] = None
if options.output_path is not None:
path = options.output_path
@@ -506,7 +538,10 @@ def _prepare_output_path(options: ScreenshotOptions) -> Path:
def _capture(
options: ScreenshotOptions, destination: Path, warnings: List[str], progress: PipelineProgress
options: ScreenshotOptions,
destination: Path,
warnings: List[str],
progress: PipelineProgress
) -> None:
"""Capture screenshot using Playwright."""
debug(f"[_capture] Starting capture for {options.url} -> {destination}")
@@ -517,9 +552,11 @@ def _capture(
# Ensure Chromium engine is used for the screen-shot cmdlet (force for consistency)
try:
current_browser = (
getattr(tool.defaults, "browser", "").lower()
if getattr(tool, "defaults", None) is not None
else ""
getattr(tool.defaults,
"browser",
"").lower() if getattr(tool,
"defaults",
None) is not None else ""
)
if current_browser != "chromium":
debug(
@@ -527,12 +564,18 @@ def _capture(
)
base_cfg = {}
try:
base_cfg = dict(getattr(tool, "_config", {}) or {})
base_cfg = dict(getattr(tool,
"_config",
{}) or {})
except Exception:
base_cfg = {}
tool_block = dict(base_cfg.get("tool") or {}) if isinstance(base_cfg, dict) else {}
tool_block = dict(base_cfg.get("tool") or {}
) if isinstance(base_cfg,
dict) else {}
pw_block = (
dict(tool_block.get("playwright") or {}) if isinstance(tool_block, dict) else {}
dict(tool_block.get("playwright") or {})
if isinstance(tool_block,
dict) else {}
)
pw_block["browser"] = "chromium"
tool_block["playwright"] = pw_block
@@ -540,7 +583,13 @@ def _capture(
base_cfg["tool"] = tool_block
tool = PlaywrightTool(base_cfg)
except Exception:
tool = PlaywrightTool({"tool": {"playwright": {"browser": "chromium"}}})
tool = PlaywrightTool({
"tool": {
"playwright": {
"browser": "chromium"
}
}
})
tool.debug_dump()
@@ -550,7 +599,9 @@ def _capture(
debug(f"[_capture] Format: {format_name}, Headless: {headless}")
if format_name == "pdf" and not options.headless:
warnings.append("pdf output requires headless Chromium; overriding headless mode")
warnings.append(
"pdf output requires headless Chromium; overriding headless mode"
)
try:
with tool.open_page(headless=headless) as page:
@@ -572,11 +623,15 @@ def _capture(
page.wait_for_selector("article", timeout=10_000)
debug("Article element found")
except PlaywrightTimeoutError:
warnings.append("<article> selector not found; capturing fallback")
warnings.append(
"<article> selector not found; capturing fallback"
)
debug("Article element not found; using fallback")
if options.wait_after_load > 0:
debug(f"Waiting {options.wait_after_load}s for page stabilization...")
debug(
f"Waiting {options.wait_after_load}s for page stabilization..."
)
time.sleep(min(10.0, max(0.0, options.wait_after_load)))
progress.step("loading stabilized")
@@ -617,7 +672,9 @@ def _capture(
try:
debug(f"Trying selector: {sel}")
el = page.wait_for_selector(
sel, timeout=max(0, int(options.selector_timeout_ms))
sel,
timeout=max(0,
int(options.selector_timeout_ms))
)
except PlaywrightTimeoutError:
debug(f"Selector not found: {sel}")
@@ -639,7 +696,9 @@ def _capture(
debug("Element captured successfully")
break
except Exception as exc:
warnings.append(f"element capture failed for '{sel}': {exc}")
warnings.append(
f"element capture failed for '{sel}': {exc}"
)
debug(f"Failed to capture element: {exc}")
# Fallback to default capture paths
if element_captured:
@@ -653,7 +712,10 @@ def _capture(
progress.step("capturing saved")
else:
debug(f"Capturing full page to {destination}...")
screenshot_kwargs: Dict[str, Any] = {"path": str(destination)}
screenshot_kwargs: Dict[str,
Any] = {
"path": str(destination)
}
if format_name == "jpeg":
screenshot_kwargs["type"] = "jpeg"
screenshot_kwargs["quality"] = 90
@@ -675,10 +737,8 @@ def _capture(
except Exception as exc:
debug(f"[_capture] Exception launching browser/page: {exc}")
msg = str(exc).lower()
if any(
k in msg
for k in ["executable", "not found", "no such file", "cannot find", "install"]
):
if any(k in msg for k in ["executable", "not found", "no such file",
"cannot find", "install"]):
raise ScreenshotError(
"Chromium Playwright browser binaries not found. Install them: python ./scripts/bootstrap.py --playwright-only --browsers chromium"
) from exc
@@ -691,7 +751,10 @@ def _capture(
raise ScreenshotError(f"Failed to capture screenshot: {exc}") from exc
def _capture_screenshot(options: ScreenshotOptions, progress: PipelineProgress) -> ScreenshotResult:
def _capture_screenshot(
options: ScreenshotOptions,
progress: PipelineProgress
) -> ScreenshotResult:
"""Capture a screenshot for the given options."""
debug(f"[_capture_screenshot] Preparing capture for {options.url}")
requested_format = _normalise_format(options.output_format)
@@ -702,7 +765,8 @@ def _capture_screenshot(options: ScreenshotOptions, progress: PipelineProgress)
will_convert = requested_format == "webp"
will_archive = bool(options.archive and options.url)
total_steps = (
9 + (1 if will_target else 0) + (1 if will_convert else 0) + (1 if will_archive else 0)
9 + (1 if will_target else 0) + (1 if will_convert else 0) +
(1 if will_archive else 0)
)
progress.begin_steps(total_steps)
progress.step("loading starting")
@@ -712,7 +776,9 @@ def _capture_screenshot(options: ScreenshotOptions, progress: PipelineProgress)
capture_path = destination
if requested_format == "webp":
capture_path = unique_path(destination.with_suffix(".png"))
debug(f"[_capture_screenshot] Requested webp; capturing intermediate png -> {capture_path}")
debug(
f"[_capture_screenshot] Requested webp; capturing intermediate png -> {capture_path}"
)
options.output_format = "png"
_capture(options, capture_path, warnings, progress)
@@ -808,7 +874,9 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
# [tool=playwright]
# format="pdf"
try:
tool_cfg = config.get("tool", {}) if isinstance(config, dict) else {}
tool_cfg = config.get("tool",
{}) if isinstance(config,
dict) else {}
pw_cfg = tool_cfg.get("playwright") if isinstance(tool_cfg, dict) else None
if isinstance(pw_cfg, dict):
format_value = pw_cfg.get("format")
@@ -839,7 +907,11 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
# Extract url from piped results
if piped_results:
for item in piped_results:
url = get_field(item, "path") or get_field(item, "url") or get_field(item, "target")
url = get_field(item,
"path") or get_field(item,
"url"
) or get_field(item,
"target")
if url:
url_to_process.append((str(url), item))
@@ -910,6 +982,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
all_emitted = []
exit_code = 0
# ========================================================================
# PROCESS url AND CAPTURE SCREENSHOTS
# ========================================================================
@@ -970,8 +1043,11 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"playwright": {
"browser": "chromium",
"user_agent": "native",
"viewport_width": int(DEFAULT_VIEWPORT.get("width", 1920)),
"viewport_height": int(DEFAULT_VIEWPORT.get("height", 1080)),
"viewport_width": int(DEFAULT_VIEWPORT.get("width",
1920)),
"viewport_height":
int(DEFAULT_VIEWPORT.get("height",
1080)),
}
}
}
@@ -995,7 +1071,9 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
if manual_target_selectors:
options.prefer_platform_target = True
options.target_selectors = manual_target_selectors
debug(f"[screen_shot] Using explicit selector(s): {manual_target_selectors}")
debug(
f"[screen_shot] Using explicit selector(s): {manual_target_selectors}"
)
elif auto_selectors:
options.prefer_platform_target = True
options.target_selectors = auto_selectors
@@ -1022,9 +1100,8 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
capture_date = ""
try:
capture_date = (
datetime.fromtimestamp(screenshot_result.path.stat().st_mtime)
.date()
.isoformat()
datetime.fromtimestamp(screenshot_result.path.stat().st_mtime
).date().isoformat()
)
except Exception:
capture_date = datetime.now().date().isoformat()
@@ -1035,14 +1112,14 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
upstream_tags = _extract_item_tags(origin_item)
filtered_upstream_tags = [
t
for t in upstream_tags
t for t in upstream_tags
if not str(t).strip().lower().startswith(("type:", "date:"))
]
url_tags = _tags_from_url(url)
merged_tags = unique_preserve_order(
["type:screenshot", f"date:{capture_date}"] + filtered_upstream_tags + url_tags
["type:screenshot",
f"date:{capture_date}"] + filtered_upstream_tags + url_tags
)
pipe_obj = create_pipe_object_result(
@@ -1097,13 +1174,20 @@ CMDLET = Cmdlet(
name="screen-shot",
summary="Capture a website screenshot",
usage="screen-shot <url> [options]",
alias=["screenshot", "ss"],
alias=["screenshot",
"ss"],
arg=[
SharedArgs.URL,
CmdletArg(
name="format", type="string", description="Output format: webp, png, jpeg, or pdf"
name="format",
type="string",
description="Output format: webp, png, jpeg, or pdf"
),
CmdletArg(
name="selector",
type="string",
description="CSS selector for element capture"
),
CmdletArg(name="selector", type="string", description="CSS selector for element capture"),
SharedArgs.PATH,
],
detail=[