Files
Medios-Macina/API/HTTP.py
2026-02-02 19:49:07 -08:00

1240 lines
42 KiB
Python

"""
Unified HTTP client for downlow using httpx.
Provides synchronous and asynchronous HTTP operations with:
- Automatic retries on transient failures
- Configurable timeouts and headers
- Built-in progress tracking for downloads
- Request/response logging support
"""
import httpx
import asyncio
import sys
import time
import traceback
import re
import os
from typing import Optional, Dict, Any, Callable, List, Union
from pathlib import Path
from urllib.parse import unquote, urlparse, parse_qs
import logging
from SYS.logger import debug, is_debug_enabled, log
from SYS.models import DebugLogger, DownloadError, DownloadMediaResult, ProgressBar
from SYS.utils import ensure_directory, sha256_file
try: # Optional; used for metadata extraction when available
from SYS.yt_metadata import extract_ytdlp_tags
except Exception: # pragma: no cover - optional dependency
extract_ytdlp_tags = None # type: ignore[assignment]
logger = logging.getLogger(__name__)
def _resolve_verify_value(verify_ssl: bool) -> Union[bool, str]:
"""Return the httpx verify argument, preferring system-aware bundles.
Order of precedence:
1. If verify_ssl is not True (False or path), return it.
2. Respect existing SSL_CERT_FILE env var if present.
3. Prefer `pip_system_certs` if present and it exposes a bundle path.
4. Prefer `certifi_win32`/similar helpers by invoking them and reading certifi.where().
5. Fall back to `certifi.where()` if available.
6. Otherwise, return True to let httpx use system defaults.
"""
if verify_ssl is not True:
return verify_ssl
env_cert = os.environ.get("SSL_CERT_FILE")
if env_cert:
return env_cert
def _try_module_bundle(mod_name: str) -> Optional[str]:
# Prefer checking sys.modules first (helps test injection / monkeypatching)
mod = sys.modules.get(mod_name)
if mod is None:
# Avoid raising ModuleNotFoundError so debuggers and callers aren't interrupted.
# Check for module availability before attempting to import it.
try:
import importlib.util
spec = importlib.util.find_spec(mod_name)
if spec is None:
return None
import importlib
mod = importlib.import_module(mod_name)
except Exception:
# Treat any import/initialization failure as module not available.
return None
# Common APIs that return a bundle path
for attr in ("where", "get_ca_bundle", "bundle_path", "get_bundle_path", "get_bundle"):
fn = getattr(mod, attr, None)
if callable(fn):
try:
res = fn()
if res:
return res
except Exception:
continue
elif isinstance(fn, str) and fn:
return fn
# Some helpers (e.g., certifi_win32) expose an action to merge system certs
for call_attr in ("add_windows_store_certs", "add_system_certs", "merge_system_certs"):
fn = getattr(mod, call_attr, None)
if callable(fn):
try:
fn()
try:
import certifi as _certifi
res = _certifi.where()
if res:
return res
except Exception:
logger.exception("Failed while probing certifi helper inner block")
except Exception:
logger.exception("Failed while invoking cert helper function")
return None
# Prefer helpful modules if available (use safe checks to avoid first-chance import errors)
for mod_name in ("pip_system_certs", "certifi_win32"):
path = _try_module_bundle(mod_name)
if path:
try:
os.environ["SSL_CERT_FILE"] = path
except Exception:
logger.exception("Failed to set SSL_CERT_FILE environment variable")
logger.info(f"SSL_CERT_FILE not set; using bundle from {mod_name}: {path}")
return path
# Fallback to certifi
try:
import certifi # type: ignore
path = certifi.where()
if path:
try:
os.environ["SSL_CERT_FILE"] = path
except Exception:
logger.exception("Failed to set SSL_CERT_FILE environment variable during certifi fallback")
logger.info(f"SSL_CERT_FILE not set; using certifi bundle: {path}")
return path
except Exception:
logger.exception("Failed to probe certifi for trust bundle")
# Fallback to certifi
try:
import certifi # type: ignore
path = certifi.where()
if path:
try:
os.environ["SSL_CERT_FILE"] = path
except Exception:
logger.exception("Failed to set SSL_CERT_FILE environment variable during certifi fallback")
logger.info(f"SSL_CERT_FILE not set; using certifi bundle: {path}")
return path
except Exception:
logger.exception("Failed to probe certifi for trust bundle")
return True
def get_requests_verify_value(verify_ssl: bool = True) -> Union[bool, str]:
    """Public entry point to the resolved ``verify`` value (e.g. for requests sessions).

    Delegates to ``_resolve_verify_value`` so code outside ``HTTPClient`` can
    reuse the same value.

    Args:
        verify_ssl: Forwarded unchanged to ``_resolve_verify_value``.

    Returns:
        Whatever ``_resolve_verify_value`` returns for ``verify_ssl``.
    """
    resolved = _resolve_verify_value(verify_ssl)
    return resolved
# Default configuration
# These values are the constructor defaults of both HTTPClient and AsyncHTTPClient.
# Request timeout, in seconds.
DEFAULT_TIMEOUT = 30.0
# Default for the clients' ``retries`` setting (used as the attempt count of the retry loop).
DEFAULT_RETRIES = 3
# User-Agent header sent with every request unless the caller overrides it.
DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
class HTTPClient:
    """Synchronous HTTP client built on ``httpx.Client`` with retry support.

    The underlying ``httpx.Client`` only exists between ``__enter__`` and
    ``__exit__``, so every request method must be called inside a ``with``
    block; calling one outside raises ``RuntimeError``.
    """

    def __init__(
        self,
        timeout: float = DEFAULT_TIMEOUT,
        retries: int = DEFAULT_RETRIES,
        user_agent: str = DEFAULT_USER_AGENT,
        verify_ssl: bool = True,
        headers: Optional[Dict[str, str]] = None,
    ):
        """
        Initialize HTTP client.

        Args:
            timeout: Request timeout in seconds
            retries: Number of attempts per request on transient failures
                (values below 1 are treated as a single attempt)
            user_agent: User-Agent header value
            verify_ssl: Whether to verify SSL certificates
            headers: Additional headers to include in all requests
        """
        self.timeout = timeout
        self.retries = retries
        self.user_agent = user_agent
        self.verify_ssl = verify_ssl
        self.base_headers = headers or {}
        self._client: Optional[httpx.Client] = None
        # Resolved once so every client created by this instance shares the same bundle.
        self._httpx_verify = _resolve_verify_value(verify_ssl)

    # Debug helpers
    def _debug_panel(self, title: str, rows: List[tuple[str, Any]]) -> None:
        """Emit a key/value debug panel via ``debug`` when debugging is enabled.

        Falls back to a plain ``debug(title, rows)`` call if the rich panel
        cannot be built.
        """
        if not is_debug_enabled():
            return
        try:
            from rich.table import Table as RichTable
            from rich.panel import Panel

            grid = RichTable.grid(padding=(0, 1))
            grid.add_column("Key", style="cyan", no_wrap=True)
            grid.add_column("Value")
            for key, val in rows:
                try:
                    grid.add_row(str(key), str(val))
                except Exception:
                    grid.add_row(str(key), "<unprintable>")
            debug(Panel(grid, title=title, expand=False))
        except Exception:
            # Fallback to simple debug output
            debug(title, rows)

    def __enter__(self):
        """Context manager entry: create the underlying ``httpx.Client``."""
        self._client = httpx.Client(
            timeout=self.timeout,
            verify=self._httpx_verify,
            headers=self._get_headers(),
        )
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit: close and drop the underlying client."""
        if self._client:
            self._client.close()
            self._client = None

    def _get_headers(self) -> Dict[str, str]:
        """Get request headers with user-agent (base headers may override it)."""
        headers = {
            "User-Agent": self.user_agent
        }
        headers.update(self.base_headers)
        return headers

    def _merged_headers(self, extra: Optional[Dict[str, str]]) -> Dict[str, str]:
        """Return the client headers with per-request ``extra`` headers layered on top."""
        headers = self._get_headers()
        if extra:
            headers.update(extra)
        return headers

    @staticmethod
    def _content_length(response: httpx.Response) -> int:
        """Return the response's Content-Length as an int, or 0 when absent/malformed."""
        try:
            return int(response.headers.get("content-length", 0))
        except (TypeError, ValueError):
            # A bad header must not abort an otherwise healthy download.
            return 0

    def get(
        self,
        url: str,
        params: Optional[Dict[str, Any]] = None,
        headers: Optional[Dict[str, str]] = None,
        allow_redirects: bool = True,
    ) -> httpx.Response:
        """
        Make a GET request.

        Args:
            url: Request URL
            params: Query parameters
            headers: Additional headers
            allow_redirects: Follow redirects

        Returns:
            httpx.Response object
        """
        return self._request(
            "GET",
            url,
            params=params,
            headers=headers,
            follow_redirects=allow_redirects,
        )

    def post(
        self,
        url: str,
        data: Optional[Any] = None,
        json: Optional[Dict] = None,
        files: Optional[Dict] = None,
        headers: Optional[Dict[str, str]] = None,
    ) -> httpx.Response:
        """
        Make a POST request.

        Args:
            url: Request URL
            data: Form data
            json: JSON data
            files: Files to upload
            headers: Additional headers

        Returns:
            httpx.Response object
        """
        return self._request(
            "POST",
            url,
            data=data,
            json=json,
            files=files,
            headers=headers,
        )

    def put(
        self,
        url: str,
        data: Optional[Any] = None,
        json: Optional[Dict] = None,
        content: Optional[Any] = None,
        files: Optional[Dict] = None,
        headers: Optional[Dict[str, str]] = None,
    ) -> httpx.Response:
        """
        Make a PUT request.

        Args:
            url: Request URL
            data: Form data
            json: JSON data
            content: Raw content
            files: Files to upload
            headers: Additional headers

        Returns:
            httpx.Response object
        """
        return self._request(
            "PUT",
            url,
            data=data,
            json=json,
            content=content,
            files=files,
            headers=headers,
        )

    def delete(
        self,
        url: str,
        headers: Optional[Dict[str, str]] = None,
    ) -> httpx.Response:
        """
        Make a DELETE request.

        Args:
            url: Request URL
            headers: Additional headers

        Returns:
            httpx.Response object
        """
        return self._request(
            "DELETE",
            url,
            headers=headers,
        )

    def request(self, method: str, url: str, **kwargs) -> httpx.Response:
        """
        Make a generic HTTP request.

        Args:
            method: HTTP method
            url: Request URL
            **kwargs: Additional arguments

        Returns:
            httpx.Response object
        """
        return self._request(method, url, **kwargs)

    def download(
        self,
        url: str,
        file_path: str,
        chunk_size: int = 8192,
        progress_callback: Optional[Callable[[int, int], None]] = None,
        headers: Optional[Dict[str, str]] = None,
    ) -> Path:
        """
        Download a file from URL with optional progress tracking.

        The request is streamed (redirects followed) and is not retried.
        Errors raised by the initial and final progress callbacks are logged
        and swallowed; errors raised by the per-chunk callback propagate.

        Args:
            url: File URL
            file_path: Local file path to save to
            chunk_size: Download chunk size
            progress_callback: Callback(bytes_downloaded, total_bytes);
                total_bytes is 0 when the size is unknown
            headers: Additional headers

        Returns:
            Path object of downloaded file
        """
        path = Path(file_path)
        path.parent.mkdir(parents=True, exist_ok=True)
        with self._request_stream(
            "GET",
            url,
            headers=headers,
            follow_redirects=True,
        ) as response:
            response.raise_for_status()
            total_bytes = self._content_length(response)
            bytes_downloaded = 0
            # Render progress immediately (even if the transfer is very fast)
            if progress_callback:
                try:
                    progress_callback(0, total_bytes)
                except Exception:
                    logger.exception("Error in progress_callback initial call")
            with open(path, "wb") as f:
                for chunk in response.iter_bytes(chunk_size):
                    if chunk:
                        f.write(chunk)
                        bytes_downloaded += len(chunk)
                        if progress_callback:
                            progress_callback(bytes_downloaded, total_bytes)
            # Ensure a final callback is emitted.
            if progress_callback:
                try:
                    progress_callback(bytes_downloaded, total_bytes)
                except Exception:
                    logger.exception("Error in progress_callback final call")
        return path

    def _retry_on_cert_failure(
        self,
        error: Exception,
        method: str,
        url: str,
        raise_for_status: bool,
        kwargs: Dict[str, Any],
    ) -> tuple[Optional[httpx.Response], Optional[Exception]]:
        """Retry once with a throwaway client when ``error`` is a certificate failure.

        Returns:
            ``(response, None)`` when the retry succeeded, ``(None, exc)`` when
            the retried request itself failed with ``exc``, and ``(None, None)``
            when ``error`` is not a certificate failure or the temporary client
            could not be used.
        """
        msg = str(error or "").lower()
        if not (
            "certificate verify failed" in msg
            or "unable to get local issuer certificate" in msg
        ):
            return None, None

        logger.info("Certificate verification failed; attempting to retry with a system-aware CA bundle")
        failure: Optional[Exception] = None
        try:
            # Use the client's precomputed verify argument (set at init)
            with httpx.Client(
                timeout=self.timeout,
                verify=self._httpx_verify,
                headers=self._get_headers(),
            ) as temp_client:
                try:
                    response = temp_client.request(method, url, **kwargs)
                    if raise_for_status:
                        response.raise_for_status()
                    return response, None
                except Exception as exc:
                    failure = exc
        except Exception:
            # Best effort only: fall back to the regular retry behavior.
            logger.debug("Temporary client for certificate retry failed", exc_info=True)
        return None, failure

    def _request(
        self,
        method: str,
        url: str,
        raise_for_status: bool = True,
        log_http_errors: bool = True,
        **kwargs,
    ) -> httpx.Response:
        """
        Make an HTTP request with automatic retries.

        Timeouts, 5xx responses and connection errors are retried; 4xx
        responses are raised immediately.

        Args:
            method: HTTP method
            url: Request URL
            raise_for_status: Call ``raise_for_status()`` on the response
            log_http_errors: Log 4xx responses at error level before re-raising
            **kwargs: Additional arguments for httpx.Client.request()

        Returns:
            httpx.Response object

        Raises:
            RuntimeError: If called outside the context manager.
            httpx.HTTPStatusError: Immediately on a 4xx response.
            Exception: The last error seen once every attempt has failed.
        """
        if not self._client:
            raise RuntimeError(
                "HTTPClient must be used with context manager (with statement)"
            )
        # Merge headers
        kwargs["headers"] = self._merged_headers(kwargs.get("headers"))

        # Always make at least one attempt, even when retries is 0 or negative;
        # otherwise no request would be sent at all.
        attempts = max(1, self.retries)
        last_exception: Optional[Exception] = None
        for attempt in range(attempts):
            self._debug_panel(
                "HTTP request",
                [
                    ("method", method),
                    ("url", url),
                    ("attempt", f"{attempt + 1}/{attempts}"),
                    ("params", kwargs.get("params")),
                    ("headers", kwargs.get("headers")),
                    ("verify", self._httpx_verify),
                    ("follow_redirects", kwargs.get("follow_redirects", False)),
                ],
            )
            try:
                response = self._client.request(method, url, **kwargs)
                self._debug_panel(
                    "HTTP response",
                    [
                        ("method", method),
                        ("url", url),
                        ("status", getattr(response, "status_code", "")),
                        ("elapsed", getattr(response, "elapsed", "")),
                        (
                            "content_length",
                            response.headers.get("content-length") if hasattr(response, "headers") else "",
                        ),
                    ],
                )
                if raise_for_status:
                    response.raise_for_status()
                return response
            except httpx.TimeoutException as e:
                last_exception = e
                logger.warning(
                    f"Timeout on attempt {attempt + 1}/{attempts}: {url}"
                )
            except httpx.HTTPStatusError as e:
                # Don't retry on 4xx errors
                if 400 <= e.response.status_code < 500:
                    try:
                        response_text = e.response.text[:500]
                    except Exception:
                        response_text = "<unable to read response>"
                    if log_http_errors:
                        logger.error(
                            f"HTTP {e.response.status_code} from {url}: {response_text}"
                        )
                    raise
                last_exception = e
                try:
                    response_text = e.response.text[:200]
                except Exception:
                    response_text = "<unable to read response>"
                logger.warning(
                    f"HTTP {e.response.status_code} on attempt {attempt + 1}/{attempts}: {url} - {response_text}"
                )
            except httpx.RequestError as e:
                # httpx.ConnectError is a RequestError subclass, so one clause covers both.
                last_exception = e
                logger.warning(
                    f"Connection error on attempt {attempt + 1}/{attempts}: {url} - {e}"
                )
                # Detect certificate verification failures in the underlying error
                response, retry_error = self._retry_on_cert_failure(
                    e, method, url, raise_for_status, kwargs
                )
                if response is not None:
                    return response
                if retry_error is not None:
                    last_exception = retry_error
            except Exception as e:
                # Catch-all to handle non-httpx exceptions that may represent
                # certificate verification failures from underlying transports.
                last_exception = e
                logger.warning(f"Request exception on attempt {attempt + 1}/{attempts}: {url} - {e}")
                response, retry_error = self._retry_on_cert_failure(
                    e, method, url, raise_for_status, kwargs
                )
                if response is not None:
                    return response
                if retry_error is not None:
                    last_exception = retry_error

        if last_exception is not None:
            logger.error(
                f"Request failed after {attempts} attempts: {url} - {last_exception}"
            )
            raise last_exception
        raise RuntimeError("Request failed after retries")

    def _request_stream(self, method: str, url: str, **kwargs):
        """Make a streaming request.

        Returns the context manager produced by ``httpx.Client.stream``; no
        retries are applied.

        Raises:
            RuntimeError: If called outside the context manager.
        """
        if not self._client:
            raise RuntimeError(
                "HTTPClient must be used with context manager (with statement)"
            )
        # Merge headers
        kwargs["headers"] = self._merged_headers(kwargs.get("headers"))
        self._debug_panel(
            "HTTP stream",
            [
                ("method", method),
                ("url", url),
                ("headers", kwargs.get("headers")),
                ("follow_redirects", kwargs.get("follow_redirects", False)),
            ],
        )
        return self._client.stream(method, url, **kwargs)
def download_direct_file(
    url: str,
    output_dir: Path,
    debug_logger: Optional[DebugLogger] = None,
    quiet: bool = False,
    suggested_filename: Optional[str] = None,
    pipeline_progress: Optional[Any] = None,
) -> DownloadMediaResult:
    """Download a direct file (PDF, image, document, etc.) with guardrails and metadata hooks.

    The target filename is derived, in order, from URL query parameters, the
    URL path, the ``Content-Disposition`` header of a HEAD request, and finally
    ``suggested_filename``. The final name is always sanitized to a bare,
    Windows-safe file name so that neither the URL nor the server can steer
    the write outside ``output_dir``.

    Args:
        url: URL of the file to fetch.
        output_dir: Directory the file is written into (created if needed).
        debug_logger: Optional logger receiving success/exception records.
        quiet: Suppress debug/log output and the console progress bar.
        suggested_filename: Preferred file name; the detected extension is
            appended when it has none.
        pipeline_progress: Optional progress sink; its ``begin_transfer``,
            ``update_transfer``, ``finish_transfer`` and ``ui_and_pipe_index``
            methods are used when present.

    Returns:
        DownloadMediaResult describing the saved file.

    Raises:
        DownloadError: If the URL looks like an HTML page, no filename can be
            determined, or the download fails.
    """
    ensure_directory(output_dir)

    def _sanitize_filename(name: str) -> str:
        """Reduce ``name`` to its last path component with Windows-invalid characters removed."""
        # Windows-safe filename sanitization.
        text = str(name or "").strip()
        if not text:
            return ""
        text = text.replace("/", "\\")
        text = text.split("\\")[-1]
        invalid = set('<>:"/\\|?*')
        cleaned_chars: List[str] = []
        for ch in text:
            o = ord(ch)
            if o < 32 or ch in invalid:
                cleaned_chars.append(" ")
                continue
            cleaned_chars.append(ch)
        cleaned = " ".join("".join(cleaned_chars).split()).strip()
        cleaned = cleaned.rstrip(" .")
        return cleaned

    def _unique_path(path: Path) -> Path:
        """Return ``path`` or a ``stem (N)suffix`` variant that does not exist yet."""
        if not path.exists():
            return path
        stem = path.stem
        suffix = path.suffix
        parent = path.parent
        for i in range(1, 10_000):
            candidate = parent / f"{stem} ({i}){suffix}"
            if not candidate.exists():
                return candidate
        return parent / f"{stem} ({int(time.time())}){suffix}"

    parsed_url = urlparse(url)
    url_path = parsed_url.path
    filename: Optional[str] = None
    if parsed_url.query:
        query_params = parse_qs(parsed_url.query)
        for param_name in ("filename", "download", "file", "name"):
            if param_name in query_params and query_params[param_name]:
                filename = query_params[param_name][0]
                filename = unquote(filename)
                break
    if not filename or not filename.strip():
        filename = url_path.split("/")[-1] if url_path else ""
        filename = unquote(filename)
        if "?" in filename:
            filename = filename.split("?")[0]

    content_type = ""
    try:
        with HTTPClient(timeout=10.0) as client:
            response = client._request("HEAD", url, follow_redirects=True)
            content_disposition = response.headers.get("content-disposition", "")
            try:
                content_type = str(response.headers.get("content-type", "") or "").strip().lower()
            except Exception:
                content_type = ""
            if content_disposition:
                match = re.search(r'filename(\*?)=(?:"([^"]*)"|([^;\s]*))', content_disposition)
                if match:
                    extracted_name = match.group(2) or match.group(3)
                    if extracted_name and match.group(1):
                        # RFC 5987 extended value: charset'language'percent-encoded-name.
                        # Drop the charset/language prefix so it does not end up in the file name.
                        ext_parts = extracted_name.split("'", 2)
                        if len(ext_parts) == 3:
                            extracted_name = ext_parts[2]
                    if extracted_name:
                        filename = unquote(extracted_name)
                        if not quiet:
                            debug(f"Filename from Content-Disposition: {filename}")
    except Exception as exc:
        if not quiet:
            log(f"Could not get filename from headers: {exc}", file=sys.stderr)

    # Probe with a real GET when the HEAD response or the extension suggests a web page.
    try:
        page_like_exts = {".php", ".asp", ".aspx", ".jsp", ".cgi"}
        ext = ""
        try:
            ext = Path(str(filename or "")).suffix.lower()
        except Exception:
            ext = ""
        ct0 = (content_type or "").split(";", 1)[0].strip().lower()
        must_probe = bool(ct0.startswith("text/html") or ext in page_like_exts)
        if must_probe:
            with HTTPClient(timeout=10.0) as client:
                with client._request_stream("GET", url, follow_redirects=True) as resp:
                    resp.raise_for_status()
                    ct = (
                        str(resp.headers.get("content-type", "") or "")
                        .split(";", 1)[0]
                        .strip()
                        .lower()
                    )
                    if ct.startswith("text/html"):
                        raise DownloadError("URL appears to be an HTML page, not a direct file")
    except DownloadError:
        raise
    except Exception:
        logger.exception("Unexpected error while probing URL content")

    suggested = _sanitize_filename(suggested_filename) if suggested_filename else ""
    if suggested:
        suggested_path = Path(suggested)
        if suggested_path.suffix:
            filename = suggested
        else:
            detected_ext = ""
            try:
                detected_ext = Path(str(filename)).suffix
            except Exception:
                logger.exception("Failed to detect file extension from filename")
                detected_ext = ""
            filename = suggested + detected_ext if detected_ext else suggested

    try:
        has_ext = bool(filename and Path(str(filename)).suffix)
    except Exception:
        logger.exception("Failed to determine if filename has extension")
        has_ext = False
    if filename and (not has_ext):
        ct = (content_type or "").split(";", 1)[0].strip().lower()
        ext_by_ct = {
            "application/pdf": ".pdf",
            "application/epub+zip": ".epub",
            "application/x-mobipocket-ebook": ".mobi",
            "image/jpeg": ".jpg",
            "image/png": ".png",
            "image/webp": ".webp",
            "image/gif": ".gif",
            "text/plain": ".txt",
            "application/zip": ".zip",
        }
        if ct in ext_by_ct:
            filename = f"{filename}{ext_by_ct[ct]}"
        elif ct.startswith("text/html"):
            raise DownloadError("URL appears to be an HTML page, not a direct file")

    # The name may come from the URL or from a server header, i.e. untrusted input.
    # Strip directory components ("../", absolute paths) and invalid characters
    # before it is joined onto output_dir.
    filename = _sanitize_filename(str(filename or ""))
    if not filename or not str(filename).strip():
        raise DownloadError(
            "Could not determine filename for URL (no Content-Disposition and no path filename)"
        )
    file_path = _unique_path(output_dir / str(filename))

    use_pipeline_transfer = False
    try:
        if pipeline_progress is not None and hasattr(pipeline_progress, "update_transfer"):
            ui = None
            if hasattr(pipeline_progress, "ui_and_pipe_index"):
                ui, _ = pipeline_progress.ui_and_pipe_index()  # type: ignore[attr-defined]
            use_pipeline_transfer = ui is not None
    except Exception:
        use_pipeline_transfer = False

    # Only draw the console bar when the pipeline UI is not rendering the transfer.
    progress_bar: Optional[ProgressBar] = None
    if (not quiet) and (not use_pipeline_transfer):
        progress_bar = ProgressBar()
    # One-element list so the nested callbacks can flip the flag.
    transfer_started = [False]

    def _finish_progress(context: str = "") -> None:
        """Close the console bar and the pipeline transfer; ``context`` suffixes the log message."""
        try:
            if progress_bar is not None:
                progress_bar.finish()
        except Exception:
            logger.exception(f"Failed to finish progress bar{context}")
        try:
            if pipeline_progress is not None and transfer_started[0] and hasattr(
                pipeline_progress, "finish_transfer"
            ):
                pipeline_progress.finish_transfer(label=str(filename or "download"))
        except Exception:
            logger.exception(f"Failed to finish pipeline transfer{context}")

    if not quiet:
        debug(f"Direct download: {filename}")
    try:
        start_time = time.time()

        def _maybe_begin_transfer(content_length: int) -> None:
            """Announce the transfer to the pipeline once, on the first progress event."""
            if pipeline_progress is None or transfer_started[0]:
                return
            try:
                total_val: Optional[int] = (
                    int(content_length)
                    if isinstance(content_length, int) and content_length > 0
                    else None
                )
            except Exception:
                total_val = None
            try:
                if hasattr(pipeline_progress, "begin_transfer"):
                    pipeline_progress.begin_transfer(
                        label=str(filename or "download"),
                        total=total_val,
                    )
                    transfer_started[0] = True
            except Exception:
                return

        def progress_callback(bytes_downloaded: int, content_length: int) -> None:
            """Forward download progress to the pipeline UI and/or the console bar."""
            try:
                if pipeline_progress is not None and hasattr(pipeline_progress, "update_transfer"):
                    _maybe_begin_transfer(content_length)
                    total_val: Optional[int] = (
                        int(content_length)
                        if isinstance(content_length, int) and content_length > 0
                        else None
                    )
                    pipeline_progress.update_transfer(
                        label=str(filename or "download"),
                        completed=int(bytes_downloaded or 0),
                        total=total_val,
                    )
            except Exception:
                logger.exception("Error updating pipeline progress transfer")
            if progress_bar is not None:
                progress_bar.update(
                    downloaded=int(bytes_downloaded or 0),
                    total=int(content_length) if content_length and content_length > 0 else None,
                    label=str(filename or "download"),
                    file=sys.stderr,
                )

        with HTTPClient(timeout=30.0) as client:
            client.download(url, str(file_path), progress_callback=progress_callback)
        elapsed = time.time() - start_time
        _finish_progress()
        if not quiet:
            debug(f"✓ Downloaded in {elapsed:.1f}s")

        ext_out = ""
        try:
            ext_out = Path(str(filename)).suffix.lstrip(".")
        except Exception:
            ext_out = ""
        info: Dict[str, Any] = {
            "id": str(filename).rsplit(".", 1)[0] if "." in str(filename) else str(filename),
            "ext": ext_out,
            "webpage_url": url,
        }
        hash_value = None
        try:
            hash_value = sha256_file(file_path)
        except Exception:
            logger.exception("Failed to compute SHA256 of downloaded file")

        tags: List[str] = []
        if extract_ytdlp_tags is not None:
            try:
                tags = extract_ytdlp_tags(info)
            except Exception as exc:
                log(f"Error extracting tags: {exc}", file=sys.stderr)
        if not any(str(t).startswith("title:") for t in tags):
            # No title tag was produced: supply the filename as title and extract again.
            info["title"] = str(filename)
            tags = []
            if extract_ytdlp_tags is not None:
                try:
                    tags = extract_ytdlp_tags(info)
                except Exception as exc:
                    log(f"Error extracting tags with filename: {exc}", file=sys.stderr)

        if debug_logger is not None:
            debug_logger.write_record(
                "direct-file-downloaded",
                {"url": url, "path": str(file_path), "hash": hash_value},
            )
        return DownloadMediaResult(
            path=file_path,
            info=info,
            tag=tags,
            source_url=url,
            hash_value=hash_value,
        )
    except httpx.HTTPError as exc:
        # httpx.RequestError derives from httpx.HTTPError, so this covers both.
        _finish_progress(" during HTTP error handling")
        log(f"Download error: {exc}", file=sys.stderr)
        if debug_logger is not None:
            debug_logger.write_record(
                "exception",
                {"phase": "direct-file", "url": url, "error": str(exc)},
            )
        raise DownloadError(f"Failed to download {url}: {exc}") from exc
    except Exception as exc:
        _finish_progress(" during error handling")
        log(f"Error downloading file: {exc}", file=sys.stderr)
        if debug_logger is not None:
            debug_logger.write_record(
                "exception",
                {
                    "phase": "direct-file",
                    "url": url,
                    "error": str(exc),
                    "traceback": traceback.format_exc(),
                },
            )
        raise DownloadError(f"Error downloading file: {exc}") from exc


# Back-compat alias
_download_direct_file = download_direct_file
class AsyncHTTPClient:
    """Asynchronous HTTP client built on ``httpx.AsyncClient`` with retry support.

    The underlying ``httpx.AsyncClient`` only exists between ``__aenter__`` and
    ``__aexit__``, so every request method must be awaited inside an
    ``async with`` block; calling one outside raises ``RuntimeError``.
    """

    def __init__(
        self,
        timeout: float = DEFAULT_TIMEOUT,
        retries: int = DEFAULT_RETRIES,
        user_agent: str = DEFAULT_USER_AGENT,
        verify_ssl: bool = True,
        headers: Optional[Dict[str, str]] = None,
    ):
        """
        Initialize async HTTP client.

        Args:
            timeout: Request timeout in seconds
            retries: Number of attempts per request on transient failures
                (values below 1 are treated as a single attempt)
            user_agent: User-Agent header value
            verify_ssl: Whether to verify SSL certificates
            headers: Additional headers to include in all requests
        """
        self.timeout = timeout
        self.retries = retries
        self.user_agent = user_agent
        self.verify_ssl = verify_ssl
        self.base_headers = headers or {}
        self._client: Optional[httpx.AsyncClient] = None
        self._httpx_verify = _resolve_verify_value(verify_ssl)

    async def __aenter__(self):
        """Async context manager entry: create the underlying ``httpx.AsyncClient``."""
        self._client = httpx.AsyncClient(
            timeout=self.timeout,
            verify=self._httpx_verify,
            headers=self._get_headers(),
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit: close and drop the underlying client."""
        if self._client:
            await self._client.aclose()
            self._client = None

    def _get_headers(self) -> Dict[str, str]:
        """Get request headers with user-agent (base headers may override it)."""
        headers = {
            "User-Agent": self.user_agent
        }
        headers.update(self.base_headers)
        return headers

    def _merged_headers(self, extra: Optional[Dict[str, str]]) -> Dict[str, str]:
        """Return the client headers with per-request ``extra`` headers layered on top."""
        headers = self._get_headers()
        if extra:
            headers.update(extra)
        return headers

    async def get(
        self,
        url: str,
        params: Optional[Dict[str, Any]] = None,
        headers: Optional[Dict[str, str]] = None,
        allow_redirects: bool = True,
    ) -> httpx.Response:
        """
        Make an async GET request.

        Args:
            url: Request URL
            params: Query parameters
            headers: Additional headers
            allow_redirects: Follow redirects

        Returns:
            httpx.Response object
        """
        return await self._request(
            "GET",
            url,
            params=params,
            headers=headers,
            follow_redirects=allow_redirects,
        )

    async def post(
        self,
        url: str,
        data: Optional[Any] = None,
        json: Optional[Dict] = None,
        headers: Optional[Dict[str, str]] = None,
    ) -> httpx.Response:
        """
        Make an async POST request.

        Args:
            url: Request URL
            data: Form data
            json: JSON data
            headers: Additional headers

        Returns:
            httpx.Response object
        """
        return await self._request(
            "POST",
            url,
            data=data,
            json=json,
            headers=headers,
        )

    async def download(
        self,
        url: str,
        file_path: str,
        chunk_size: int = 8192,
        progress_callback: Optional[Callable[[int, int], None]] = None,
        headers: Optional[Dict[str, str]] = None,
    ) -> Path:
        """
        Download a file from URL asynchronously with optional progress tracking.

        The request is streamed and not retried. Redirects are followed, in
        line with the synchronous ``HTTPClient.download``.

        Args:
            url: File URL
            file_path: Local file path to save to
            chunk_size: Download chunk size
            progress_callback: Callback(bytes_downloaded, total_bytes);
                total_bytes is 0 when the size is unknown
            headers: Additional headers

        Returns:
            Path object of downloaded file
        """
        path = Path(file_path)
        path.parent.mkdir(parents=True, exist_ok=True)
        async with self._request_stream(
            "GET",
            url,
            headers=headers,
            # Without this a 3xx answer would fail raise_for_status() below.
            follow_redirects=True,
        ) as response:
            response.raise_for_status()
            try:
                total_bytes = int(response.headers.get("content-length", 0))
            except (TypeError, ValueError):
                # A malformed header must not abort an otherwise healthy download.
                total_bytes = 0
            bytes_downloaded = 0
            with open(path, "wb") as f:
                async for chunk in response.aiter_bytes(chunk_size):
                    if chunk:
                        f.write(chunk)
                        bytes_downloaded += len(chunk)
                        if progress_callback:
                            progress_callback(bytes_downloaded, total_bytes)
        return path

    async def _request(self, method: str, url: str, **kwargs) -> httpx.Response:
        """
        Make an async HTTP request with automatic retries.

        Timeouts, 5xx responses and connection errors are retried after a
        0.5 second pause; 4xx responses are raised immediately.

        Args:
            method: HTTP method
            url: Request URL
            **kwargs: Additional arguments for httpx.AsyncClient.request()

        Returns:
            httpx.Response object

        Raises:
            RuntimeError: If called outside the async context manager.
            httpx.HTTPStatusError: Immediately on a 4xx response.
            Exception: The last error seen once every attempt has failed.
        """
        if not self._client:
            raise RuntimeError(
                "AsyncHTTPClient must be used with async context manager"
            )
        # Merge headers
        kwargs["headers"] = self._merged_headers(kwargs.get("headers"))

        # Always make at least one attempt, even when retries is 0 or negative;
        # otherwise no request would be sent at all.
        attempts = max(1, self.retries)
        last_exception: Optional[Exception] = None
        for attempt in range(attempts):
            try:
                response = await self._client.request(method, url, **kwargs)
                response.raise_for_status()
                return response
            except httpx.TimeoutException as e:
                last_exception = e
                logger.warning(
                    f"Timeout on attempt {attempt + 1}/{attempts}: {url}"
                )
            except httpx.HTTPStatusError as e:
                # Don't retry on 4xx errors
                if 400 <= e.response.status_code < 500:
                    try:
                        response_text = e.response.text[:500]
                    except Exception:
                        response_text = "<unable to read response>"
                    logger.error(
                        f"HTTP {e.response.status_code} from {url}: {response_text}"
                    )
                    raise
                last_exception = e
                try:
                    response_text = e.response.text[:200]
                except Exception:
                    response_text = "<unable to read response>"
                logger.warning(
                    f"HTTP {e.response.status_code} on attempt {attempt + 1}/{attempts}: {url} - {response_text}"
                )
            except httpx.RequestError as e:
                # httpx.ConnectError is a RequestError subclass, so one clause covers both.
                last_exception = e
                logger.warning(
                    f"Connection error on attempt {attempt + 1}/{attempts}: {url} - {e}"
                )
            # Every retryable failure lands here; pause briefly unless this was the last attempt.
            if attempt < attempts - 1:
                await asyncio.sleep(0.5)

        if last_exception is not None:
            logger.error(
                f"Request failed after {attempts} attempts: {url} - {last_exception}"
            )
            raise last_exception
        raise RuntimeError("Request failed after retries")

    def _request_stream(self, method: str, url: str, **kwargs):
        """Make a streaming request.

        Returns the async context manager produced by
        ``httpx.AsyncClient.stream``; no retries are applied.

        Raises:
            RuntimeError: If called outside the async context manager.
        """
        if not self._client:
            raise RuntimeError(
                "AsyncHTTPClient must be used with async context manager"
            )
        # Merge headers
        kwargs["headers"] = self._merged_headers(kwargs.get("headers"))
        return self._client.stream(method, url, **kwargs)
# Convenience function for quick sync requests
def get(url: str, **kwargs) -> httpx.Response:
    """Perform a one-off GET through a temporary ``HTTPClient``.

    A client with default settings is opened, ``kwargs`` are forwarded to
    ``HTTPClient.get``, and the client is closed before the response is
    handed back.
    """
    client = HTTPClient()
    with client:
        response = client.get(url, **kwargs)
    return response
def post(url: str, **kwargs) -> httpx.Response:
    """Perform a one-off POST through a temporary ``HTTPClient``.

    A client with default settings is opened, ``kwargs`` are forwarded to
    ``HTTPClient.post``, and the client is closed before the response is
    handed back.
    """
    client = HTTPClient()
    with client:
        response = client.post(url, **kwargs)
    return response
def download(
    url: str,
    file_path: str,
    progress_callback: Optional[Callable[[int, int], None]] = None,
    **kwargs,
) -> Path:
    """Download ``url`` to ``file_path`` through a temporary ``HTTPClient``.

    Args:
        url: File URL
        file_path: Local file path to save to
        progress_callback: Callback(bytes_downloaded, total_bytes)
        **kwargs: Extra keyword arguments forwarded to ``HTTPClient.download``

    Returns:
        Path object returned by ``HTTPClient.download``
    """
    client = HTTPClient()
    with client:
        saved_path = client.download(
            url,
            file_path,
            progress_callback=progress_callback,
            **kwargs,
        )
    return saved_path