refactor(download): remove ProviderCore/download.py, move sanitize_filename to SYS.utils, replace callers to use API.HTTP.HTTPClient
This commit is contained in:
@@ -1,29 +1,24 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import re
|
||||
import tempfile
|
||||
import traceback
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, Iterator, Optional
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterator, Optional, Union
|
||||
|
||||
from SYS.logger import debug
|
||||
|
||||
try:
|
||||
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
HAS_PLAYWRIGHT = True
|
||||
_PLAYWRIGHT_IMPORT_ERROR: Optional[Exception] = None
|
||||
except Exception as exc: # pragma: no cover
|
||||
HAS_PLAYWRIGHT = False
|
||||
_PLAYWRIGHT_IMPORT_ERROR = exc
|
||||
PlaywrightTimeoutError = TimeoutError # type: ignore
|
||||
sync_playwright = None # type: ignore
|
||||
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
# Re-export for consumers (e.g. cmdlets catching navigation timeouts)
|
||||
__all__ = [
|
||||
"HAS_PLAYWRIGHT",
|
||||
"PlaywrightTimeoutError",
|
||||
"PlaywrightTool",
|
||||
"PlaywrightDefaults"
|
||||
"PlaywrightDefaults",
|
||||
"PlaywrightDownloadResult",
|
||||
]
|
||||
|
||||
|
||||
@@ -36,6 +31,36 @@ def _get_nested(config: Dict[str, Any], *path: str) -> Any:
|
||||
return cur
|
||||
|
||||
|
||||
def _resolve_out_dir(arg_outdir: Optional[Union[str, Path]]) -> Path:
|
||||
"""Resolve an output directory using config when possible."""
|
||||
if arg_outdir:
|
||||
p = Path(arg_outdir)
|
||||
p.mkdir(parents=True, exist_ok=True)
|
||||
return p
|
||||
|
||||
try:
|
||||
from SYS.config import load_config, resolve_output_dir
|
||||
|
||||
cfg = load_config()
|
||||
p = resolve_output_dir(cfg)
|
||||
try:
|
||||
p.mkdir(parents=True, exist_ok=True)
|
||||
except Exception:
|
||||
pass
|
||||
return p
|
||||
except Exception:
|
||||
return Path(tempfile.mkdtemp(prefix="pwdl_"))
|
||||
|
||||
|
||||
def _find_filename_from_cd(cd: str) -> Optional[str]:
|
||||
if not cd:
|
||||
return None
|
||||
m = re.search(r"filename\*?=(?:UTF-8''\s*)?\"?([^\";]+)\"?", cd)
|
||||
if m:
|
||||
return m.group(1)
|
||||
return None
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class PlaywrightDefaults:
|
||||
browser: str = "chromium" # chromium|firefox|webkit
|
||||
@@ -51,6 +76,24 @@ class PlaywrightDefaults:
|
||||
ignore_https_errors: bool = True
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class PlaywrightDownloadResult:
|
||||
ok: bool
|
||||
path: Optional[Path] = None
|
||||
url: Optional[str] = None
|
||||
mode: Optional[str] = None
|
||||
error: Optional[str] = None
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
return {
|
||||
"ok": bool(self.ok),
|
||||
"path": str(self.path) if self.path else None,
|
||||
"url": self.url,
|
||||
"mode": self.mode,
|
||||
"error": self.error,
|
||||
}
|
||||
|
||||
|
||||
class PlaywrightTool:
|
||||
"""Small wrapper to standardize Playwright defaults and lifecycle.
|
||||
|
||||
@@ -130,13 +173,13 @@ class PlaywrightTool:
|
||||
)
|
||||
|
||||
def require(self) -> None:
    """Ensure Playwright is present; raise a helpful RuntimeError if not.

    Raises:
        RuntimeError: when ``sync_playwright`` is ``None``.
    """
    # Explicit check rather than ``assert``: assertions are removed under
    # ``python -O``, which would silently turn this guard into a no-op.
    if sync_playwright is None:
        raise RuntimeError(
            "playwright is required; install with: pip install playwright; then: playwright install"
        )
|
||||
|
||||
@contextlib.contextmanager
|
||||
def open_page(
|
||||
@@ -147,6 +190,7 @@ class PlaywrightTool:
|
||||
viewport_width: Optional[int] = None,
|
||||
viewport_height: Optional[int] = None,
|
||||
ignore_https_errors: Optional[bool] = None,
|
||||
accept_downloads: bool = False,
|
||||
) -> Iterator[Any]:
|
||||
"""Context manager yielding a Playwright page with sane defaults."""
|
||||
self.require()
|
||||
@@ -198,6 +242,7 @@ class PlaywrightTool:
|
||||
"height": vh
|
||||
},
|
||||
"ignore_https_errors": ihe,
|
||||
"accept_downloads": bool(accept_downloads),
|
||||
}
|
||||
if ua_value is not None:
|
||||
context_kwargs["user_agent"] = ua_value
|
||||
@@ -233,6 +278,146 @@ class PlaywrightTool:
|
||||
except Exception:
|
||||
raise
|
||||
|
||||
def download_file(
    self,
    url: str,
    *,
    selector: str = "form#dl_form button[type=submit]",
    out_dir: Optional[Union[str, Path]] = None,
    timeout_sec: int = 60,
    headless_first: bool = False,
    debug_mode: bool = False,
) -> PlaywrightDownloadResult:
    """Download a file by clicking a selector and capturing the result.

    The helper mirrors the standalone `scripts/playwright_fetch.py` logic
    and tries multiple click strategies (expect_download, tooltip continue,
    submitDL, JS/mouse click) to coax stubborn sites.

    Args:
        url: Page to open.
        selector: CSS selector of the element that triggers the download.
        out_dir: Target directory; resolved through ``_resolve_out_dir``.
        timeout_sec: Per-strategy wait in seconds (floored at 10 seconds).
            ``None`` falls back to ``defaults.navigation_timeout_ms``.
        headless_first: Try headless before headful when true; the reverse
            order otherwise.
        debug_mode: Emit ``debug`` lines describing each attempt.

    Returns:
        A ``PlaywrightDownloadResult``.  On success ``mode`` is one of
        ``"download"``, ``"tooltip-download"``, ``"response"`` or
        ``"response-fallback"``.  On failure ``error`` carries the last
        error seen, or ``"no download captured"``.  This method does not
        raise for Playwright being unavailable; that is reported through
        the result.
    """
    try:
        self.require()
    except Exception as exc:
        return PlaywrightDownloadResult(ok=False, error=str(exc))

    out_path_base = _resolve_out_dir(out_dir)
    if timeout_sec is not None:
        requested_ms = int(timeout_sec) * 1000
    else:
        requested_ms = int(self.defaults.navigation_timeout_ms)
    timeout_ms = max(10_000, requested_ms)
    nav_timeout_ms = max(timeout_ms, int(self.defaults.navigation_timeout_ms))
    selector_timeout_ms = 10_000

    # Preserve legacy behaviour: headless_first=False tries headful then headless; True reverses the order.
    modes = [True, False] if headless_first else [False, True]

    # The selector is handed to the page as an argument instead of being
    # spliced into the script text, so quotes in it cannot break the JS.
    js_click_by_selector = "sel => document.querySelector(sel).click()"

    def _has_content_disposition(resp: Any) -> bool:
        """Match a 200 response that carries a Content-Disposition header."""
        return resp.status == 200 and any(
            k.lower() == 'content-disposition' for k in resp.headers.keys()
        )

    def _click_target(page: Any, el: Any) -> None:
        """Click the resolved element, or the selector when none was resolved."""
        if el:
            el.click()
        else:
            page.click(selector)

    def _store_download(dl: Any, mode_label: str) -> PlaywrightDownloadResult:
        """Save a Playwright download under a directory-free file name."""
        suggested = dl.suggested_filename or Path(dl.url).name or "download"
        # Keep only the final path segment so the file stays in out_path_base.
        safe_name = Path(suggested).name
        if safe_name in ("", ".", ".."):
            safe_name = "download"
        target = out_path_base / safe_name
        dl.save_as(str(target))
        return PlaywrightDownloadResult(ok=True, path=target, url=dl.url, mode=mode_label)

    last_error: Optional[str] = None

    for mode in modes:
        try:
            if debug_mode:
                debug(f"[playwright] download url={url} selector={selector} headless={mode} out_dir={out_path_base}")

            with self.open_page(headless=mode, accept_downloads=True) as page:
                page.goto(url, wait_until="networkidle", timeout=nav_timeout_ms)
                page.wait_for_selector(selector, timeout=selector_timeout_ms)
                self._wait_for_block_clear(page, timeout_ms=6000)

                el = page.query_selector(selector)

                # 1) Direct click with expect_download
                try:
                    with page.expect_download(timeout=timeout_ms) as dl_info:
                        _click_target(page, el)
                    return _store_download(dl_info.value, "download")
                except PlaywrightTimeoutError:
                    last_error = "download timeout"
                except Exception as click_exc:
                    last_error = str(click_exc) or last_error

                # 2) Tooltip continue flow
                try:
                    btn = page.query_selector("#tooltip4 input[type=button]")
                    if btn:
                        btn.click()
                    with page.expect_download(timeout=timeout_ms) as dl_info:
                        _click_target(page, el)
                    return _store_download(dl_info.value, "tooltip-download")
                except Exception as tooltip_exc:
                    last_error = str(tooltip_exc) or last_error

                # 3) Submit handler that respects tooltip flow
                try:
                    # Arm the response listener BEFORE triggering the submit
                    # so a response that arrives immediately is not missed.
                    with page.expect_response(_has_content_disposition, timeout=timeout_ms) as resp_info:
                        page.evaluate("() => { try { submitDL(document.forms['dl_form'], 'tooltip4'); } catch (e) {} }")
                    resp = resp_info.value
                    if resp:
                        out_path = self._save_response(resp, out_path_base)
                        if out_path:
                            return PlaywrightDownloadResult(ok=True, path=out_path, url=getattr(resp, "url", None), mode="response")
                except Exception as resp_exc:
                    last_error = str(resp_exc) or last_error

                # 4) JS/mouse click and capture response
                try:
                    with page.expect_response(_has_content_disposition, timeout=timeout_ms) as resp_info:
                        if el:
                            try:
                                page.evaluate("el => el.click()", el)
                            except Exception:
                                page.evaluate(js_click_by_selector, selector)
                        else:
                            page.evaluate(js_click_by_selector, selector)

                        if el:
                            # Real mouse click at the element centre; best-effort.
                            try:
                                box = el.bounding_box()
                                if box:
                                    centre_x = box['x'] + box['width'] / 2
                                    centre_y = box['y'] + box['height'] / 2
                                    page.mouse.move(centre_x, centre_y)
                                    page.mouse.click(centre_x, centre_y)
                            except Exception:
                                pass
                    resp = resp_info.value
                    if resp:
                        out_path = self._save_response(resp, out_path_base)
                        if out_path:
                            return PlaywrightDownloadResult(ok=True, path=out_path, url=getattr(resp, "url", None), mode="response-fallback")
                except Exception as final_exc:
                    last_error = str(final_exc) or last_error

        except Exception as exc:
            last_error = str(exc)
            if debug_mode:
                try:
                    debug(f"[playwright] attempt failed (headless={mode}): {traceback.format_exc()}")
                except Exception:
                    pass
            continue

    return PlaywrightDownloadResult(ok=False, error=last_error or "no download captured")
|
||||
|
||||
def debug_dump(self) -> None:
|
||||
try:
|
||||
debug(
|
||||
@@ -242,3 +427,34 @@ class PlaywrightTool:
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _wait_for_block_clear(self, page: Any, timeout_ms: int = 8000) -> bool:
|
||||
try:
|
||||
page.wait_for_function(
|
||||
"() => { for (const k in window) { if (Object.prototype.hasOwnProperty.call(window, k) && k.startsWith('blocked_')) { try { return window[k] === false; } catch(e) {} return false; } } return true; }",
|
||||
timeout=timeout_ms,
|
||||
)
|
||||
return True
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _save_response(self, response: Any, out_dir: Path) -> Optional[Path]:
    """Write a captured response body into ``out_dir``.

    The file name comes from the ``Content-Disposition`` header when it
    yields one, otherwise from the path part of the response URL (query
    string and fragment excluded), otherwise ``"download"``.  Directory
    components are stripped from the name so the file cannot land outside
    ``out_dir``.

    Args:
        response: Object exposing ``headers``, ``url`` and ``body()``.
        out_dir: Existing directory to write into.

    Returns:
        Path of the written file, or ``None`` when anything failed (the
        failure is reported through ``debug``).
    """
    # Function-scope import so this method does not depend on the module's
    # import block.
    from urllib.parse import urlsplit

    try:
        cd = ""
        try:
            headers = getattr(response, "headers", {}) or {}
            cd = "".join(v for k, v in headers.items() if str(k).lower() == "content-disposition")
        except Exception:
            cd = ""

        candidate = _find_filename_from_cd(cd)
        if not candidate:
            raw_url = str(getattr(response, "url", "") or "")
            try:
                candidate = urlsplit(raw_url).path
            except ValueError:
                # Unparseable URL: fall back to the raw text.
                candidate = raw_url

        # Header and URL are untrusted: keep only the last path segment so
        # an absolute or "../" name cannot escape out_dir.
        filename = re.split(r"[\\/]", candidate or "")[-1].strip()
        if filename in ("", ".", ".."):
            filename = "download"

        body = response.body()
        out_path = out_dir / filename
        out_path.write_bytes(body)
        return out_path
    except Exception as exc:
        try:
            debug(f"[playwright] failed to save response: {exc}")
        except Exception:
            pass
        return None
|
||||
|
||||
Reference in New Issue
Block a user