refactor(download): remove ProviderCore/download.py, move sanitize_filename to SYS.utils, replace callers to use API.HTTP.HTTPClient

This commit is contained in:
2026-01-06 01:38:59 -08:00
parent 3b363dd536
commit 41c11d39fd
38 changed files with 2640 additions and 526 deletions

View File

@@ -1,29 +1,24 @@
from __future__ import annotations
import contextlib
import re
import tempfile
import traceback
from dataclasses import dataclass
from typing import Any, Dict, Iterator, Optional
from pathlib import Path
from typing import Any, Dict, Iterator, Optional, Union
from SYS.logger import debug
try:
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
from playwright.sync_api import sync_playwright
HAS_PLAYWRIGHT = True
_PLAYWRIGHT_IMPORT_ERROR: Optional[Exception] = None
except Exception as exc: # pragma: no cover
HAS_PLAYWRIGHT = False
_PLAYWRIGHT_IMPORT_ERROR = exc
PlaywrightTimeoutError = TimeoutError # type: ignore
sync_playwright = None # type: ignore
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
from playwright.sync_api import sync_playwright
# Re-export for consumers (e.g. cmdlets catching navigation timeouts)
# Each public name is listed exactly once and every entry ends with a comma:
# two adjacent string literals without a separating comma are silently
# concatenated by Python into a single (non-existent) export name.
__all__ = [
    "HAS_PLAYWRIGHT",
    "PlaywrightTimeoutError",
    "PlaywrightTool",
    "PlaywrightDefaults",
    "PlaywrightDownloadResult",
]
@@ -36,6 +31,36 @@ def _get_nested(config: Dict[str, Any], *path: str) -> Any:
return cur
def _resolve_out_dir(arg_outdir: Optional[Union[str, Path]]) -> Path:
"""Resolve an output directory using config when possible."""
if arg_outdir:
p = Path(arg_outdir)
p.mkdir(parents=True, exist_ok=True)
return p
try:
from SYS.config import load_config, resolve_output_dir
cfg = load_config()
p = resolve_output_dir(cfg)
try:
p.mkdir(parents=True, exist_ok=True)
except Exception:
pass
return p
except Exception:
return Path(tempfile.mkdtemp(prefix="pwdl_"))
def _find_filename_from_cd(cd: str) -> Optional[str]:
if not cd:
return None
m = re.search(r"filename\*?=(?:UTF-8''\s*)?\"?([^\";]+)\"?", cd)
if m:
return m.group(1)
return None
@dataclass(slots=True)
class PlaywrightDefaults:
browser: str = "chromium" # chromium|firefox|webkit
@@ -51,6 +76,24 @@ class PlaywrightDefaults:
ignore_https_errors: bool = True
@dataclass(slots=True)
class PlaywrightDownloadResult:
ok: bool
path: Optional[Path] = None
url: Optional[str] = None
mode: Optional[str] = None
error: Optional[str] = None
def to_dict(self) -> Dict[str, Any]:
return {
"ok": bool(self.ok),
"path": str(self.path) if self.path else None,
"url": self.url,
"mode": self.mode,
"error": self.error,
}
class PlaywrightTool:
"""Small wrapper to standardize Playwright defaults and lifecycle.
@@ -130,13 +173,13 @@ class PlaywrightTool:
)
def require(self) -> None:
    """Ensure Playwright is present; raise a helpful RuntimeError if not.

    Raises:
        RuntimeError: when ``sync_playwright`` is unavailable. The message
            carries install instructions plus the recorded import failure
            when the module kept one.
    """
    # Explicit comparison instead of `assert`: assertions are removed when
    # Python runs with -O, which would turn this check into a no-op.
    if sync_playwright is not None:
        return

    # The import-failure detail only exists when the module-level import
    # guard recorded it, so look it up without assuming the name is defined.
    detail = str(
        globals().get("_PLAYWRIGHT_IMPORT_ERROR") or "playwright is not installed"
    )
    raise RuntimeError(
        "playwright is required; install with: pip install playwright; then: playwright install\n"
        f"detail: {detail}"
    )
@contextlib.contextmanager
def open_page(
@@ -147,6 +190,7 @@ class PlaywrightTool:
viewport_width: Optional[int] = None,
viewport_height: Optional[int] = None,
ignore_https_errors: Optional[bool] = None,
accept_downloads: bool = False,
) -> Iterator[Any]:
"""Context manager yielding a Playwright page with sane defaults."""
self.require()
@@ -198,6 +242,7 @@ class PlaywrightTool:
"height": vh
},
"ignore_https_errors": ihe,
"accept_downloads": bool(accept_downloads),
}
if ua_value is not None:
context_kwargs["user_agent"] = ua_value
@@ -233,6 +278,146 @@ class PlaywrightTool:
except Exception:
raise
def download_file(
    self,
    url: str,
    *,
    selector: str = "form#dl_form button[type=submit]",
    out_dir: Optional[Union[str, Path]] = None,
    timeout_sec: int = 60,
    headless_first: bool = False,
    debug_mode: bool = False,
) -> PlaywrightDownloadResult:
    """Download a file by clicking a selector and capturing the result.

    The helper mirrors the standalone `scripts/playwright_fetch.py` logic
    and tries several strategies, in order, for each browser mode:

    1. click the target inside ``expect_download`` (mode ``"download"``);
    2. click ``#tooltip4 input[type=button]`` when present, then click the
       target inside ``expect_download`` (mode ``"tooltip-download"``);
    3. call the page's ``submitDL`` handler and capture a 200 response that
       carries a Content-Disposition header (mode ``"response"``);
    4. JS click plus a mouse click at the element centre, capturing the
       same kind of response (mode ``"response-fallback"``).

    Args:
        url: Page to open.
        selector: Selector of the element that triggers the download.
        out_dir: Target directory; resolved through ``_resolve_out_dir``.
        timeout_sec: Per-strategy capture timeout in seconds (floored at
            10 s). ``None`` falls back to the default navigation timeout.
        headless_first: ``False`` tries headful then headless; ``True``
            reverses the order.
        debug_mode: Emit debug lines for each attempt and failure.

    Returns:
        A ``PlaywrightDownloadResult``; ``ok`` is False with ``error`` set
        when every strategy failed in both modes or Playwright is missing.
        This method does not raise for capture failures.
    """
    try:
        self.require()
    except Exception as exc:
        return PlaywrightDownloadResult(ok=False, error=str(exc))

    out_path_base = _resolve_out_dir(out_dir)
    default_nav_ms = int(self.defaults.navigation_timeout_ms)
    requested_ms = int(timeout_sec) * 1000 if timeout_sec is not None else default_nav_ms
    timeout_ms = max(10_000, requested_ms)
    nav_timeout_ms = max(timeout_ms, default_nav_ms)
    selector_timeout_ms = 10_000

    # Preserve legacy behaviour: headless_first=False tries headful then headless; True reverses the order.
    modes = (True, False) if headless_first else (False, True)

    last_error: Optional[str] = None
    for mode in modes:
        try:
            if debug_mode:
                debug(f"[playwright] download url={url} selector={selector} headless={mode} out_dir={out_path_base}")

            with self.open_page(headless=mode, accept_downloads=True) as page:
                page.goto(url, wait_until="networkidle", timeout=nav_timeout_ms)
                page.wait_for_selector(selector, timeout=selector_timeout_ms)
                self._wait_for_block_clear(page, timeout_ms=6000)
                el = page.query_selector(selector)

                def click_target() -> None:
                    if el:
                        el.click()
                    else:
                        page.click(selector)

                def submit_form() -> None:
                    page.evaluate("() => { try { submitDL(document.forms['dl_form'], 'tooltip4'); } catch (e) {} }")

                def force_click() -> None:
                    # The selector is passed as an argument rather than
                    # interpolated into the script, so quotes inside it
                    # cannot break (or inject into) the JavaScript.
                    js_click = "sel => document.querySelector(sel).click()"
                    if not el:
                        page.evaluate(js_click, selector)
                        return
                    try:
                        page.evaluate("el => el.click()", el)
                    except Exception:
                        page.evaluate(js_click, selector)
                    try:
                        box = el.bounding_box()
                        if box:
                            cx = box["x"] + box["width"] / 2
                            cy = box["y"] + box["height"] / 2
                            page.mouse.move(cx, cy)
                            page.mouse.click(cx, cy)
                    except Exception:
                        # The physical click is an extra nudge only; the JS
                        # click above has already been dispatched.
                        pass

                # 1) Direct click with expect_download
                try:
                    return self._download_via_click(page, click_target, out_path_base, timeout_ms, "download")
                except PlaywrightTimeoutError:
                    last_error = "download timeout"
                except Exception as click_exc:
                    last_error = str(click_exc) or last_error

                # 2) Tooltip continue flow
                try:
                    btn = page.query_selector("#tooltip4 input[type=button]")
                    if btn:
                        btn.click()
                    return self._download_via_click(page, click_target, out_path_base, timeout_ms, "tooltip-download")
                except Exception as tooltip_exc:
                    last_error = str(tooltip_exc) or last_error

                # 3) Submit handler that respects tooltip flow
                try:
                    captured = self._download_via_response(page, submit_form, out_path_base, timeout_ms, "response")
                    if captured is not None:
                        return captured
                except Exception as resp_exc:
                    last_error = str(resp_exc) or last_error

                # 4) JS/mouse click and capture response
                try:
                    captured = self._download_via_response(page, force_click, out_path_base, timeout_ms, "response-fallback")
                    if captured is not None:
                        return captured
                except Exception as final_exc:
                    last_error = str(final_exc) or last_error
        except Exception as exc:
            last_error = str(exc)
            if debug_mode:
                try:
                    debug(f"[playwright] attempt failed (headless={mode}): {traceback.format_exc()}")
                except Exception:
                    pass
            continue

    return PlaywrightDownloadResult(ok=False, error=last_error or "no download captured")

def _download_via_click(
    self,
    page: Any,
    trigger: Any,
    out_dir: Path,
    timeout_ms: int,
    label: str,
) -> PlaywrightDownloadResult:
    """Run ``trigger`` inside ``expect_download`` and save the resulting download."""
    with page.expect_download(timeout=timeout_ms) as dl_info:
        trigger()
    dl = dl_info.value
    filename = dl.suggested_filename or Path(dl.url).name or "download"
    out_path = out_dir / filename
    dl.save_as(str(out_path))
    return PlaywrightDownloadResult(ok=True, path=out_path, url=dl.url, mode=label)

def _download_via_response(
    self,
    page: Any,
    trigger: Any,
    out_dir: Path,
    timeout_ms: int,
    label: str,
) -> Optional[PlaywrightDownloadResult]:
    """Run ``trigger`` and save the first 200 response carrying Content-Disposition.

    Returns ``None`` when a response matched but could not be saved.
    """

    def is_attachment(r: Any) -> bool:
        return r.status == 200 and any(k.lower() == 'content-disposition' for k in r.headers.keys())

    # The listener is registered *before* the trigger runs, so a response
    # that arrives immediately after the click cannot be missed.
    with page.expect_response(is_attachment, timeout=timeout_ms) as resp_info:
        trigger()
    resp = resp_info.value
    if not resp:
        return None
    out_path = self._save_response(resp, out_dir)
    if not out_path:
        return None
    return PlaywrightDownloadResult(ok=True, path=out_path, url=getattr(resp, "url", None), mode=label)
def debug_dump(self) -> None:
try:
debug(
@@ -242,3 +427,34 @@ class PlaywrightTool:
)
except Exception:
pass
def _wait_for_block_clear(self, page: Any, timeout_ms: int = 8000) -> bool:
try:
page.wait_for_function(
"() => { for (const k in window) { if (Object.prototype.hasOwnProperty.call(window, k) && k.startsWith('blocked_')) { try { return window[k] === false; } catch(e) {} return false; } } return true; }",
timeout=timeout_ms,
)
return True
except Exception:
return False
def _save_response(self, response: Any, out_dir: Path) -> Optional[Path]:
try:
cd = ""
try:
headers = getattr(response, "headers", {}) or {}
cd = "".join([v for k, v in headers.items() if str(k).lower() == "content-disposition"])
except Exception:
cd = ""
filename = _find_filename_from_cd(cd) or Path(str(getattr(response, "url", "") or "")).name or "download"
body = response.body()
out_path = out_dir / filename
out_path.write_bytes(body)
return out_path
except Exception as exc:
try:
debug(f"[playwright] failed to save response: {exc}")
except Exception:
pass
return None