Files
Medios-Macina/helper/http_client.py

580 lines
18 KiB
Python
Raw Permalink Normal View History

2025-11-25 20:09:33 -08:00
"""
Unified HTTP client for downlow using httpx.
Provides synchronous and asynchronous HTTP operations with:
- Automatic retries on transient failures
- Configurable timeouts and headers
- Built-in progress tracking for downloads
- Request/response logging support
"""
import httpx
import asyncio
from typing import Optional, Dict, Any, Callable, BinaryIO
from pathlib import Path
import logging
logger = logging.getLogger(__name__)
# Default configuration
DEFAULT_TIMEOUT = 30.0
DEFAULT_RETRIES = 3
DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
class HTTPClient:
"""Unified HTTP client with sync support."""
def __init__(
self,
timeout: float = DEFAULT_TIMEOUT,
retries: int = DEFAULT_RETRIES,
user_agent: str = DEFAULT_USER_AGENT,
verify_ssl: bool = True,
headers: Optional[Dict[str, str]] = None,
):
"""
Initialize HTTP client.
Args:
timeout: Request timeout in seconds
retries: Number of retries on transient failures
user_agent: User-Agent header value
verify_ssl: Whether to verify SSL certificates
headers: Additional headers to include in all requests
"""
self.timeout = timeout
self.retries = retries
self.user_agent = user_agent
self.verify_ssl = verify_ssl
self.base_headers = headers or {}
self._client: Optional[httpx.Client] = None
def __enter__(self):
"""Context manager entry."""
self._client = httpx.Client(
timeout=self.timeout,
verify=self.verify_ssl,
headers=self._get_headers(),
)
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit."""
if self._client:
self._client.close()
self._client = None
def _get_headers(self) -> Dict[str, str]:
"""Get request headers with user-agent."""
headers = {"User-Agent": self.user_agent}
headers.update(self.base_headers)
return headers
def get(
self,
url: str,
params: Optional[Dict[str, Any]] = None,
headers: Optional[Dict[str, str]] = None,
allow_redirects: bool = True,
) -> httpx.Response:
"""
Make a GET request.
Args:
url: Request URL
params: Query parameters
headers: Additional headers
allow_redirects: Follow redirects
Returns:
httpx.Response object
"""
return self._request(
"GET",
url,
params=params,
headers=headers,
follow_redirects=allow_redirects,
)
def post(
self,
url: str,
data: Optional[Any] = None,
json: Optional[Dict] = None,
files: Optional[Dict] = None,
headers: Optional[Dict[str, str]] = None,
) -> httpx.Response:
"""
Make a POST request.
Args:
url: Request URL
data: Form data
json: JSON data
files: Files to upload
headers: Additional headers
Returns:
httpx.Response object
"""
return self._request(
"POST",
url,
data=data,
json=json,
files=files,
headers=headers,
)
def put(
self,
url: str,
data: Optional[Any] = None,
json: Optional[Dict] = None,
content: Optional[Any] = None,
files: Optional[Dict] = None,
headers: Optional[Dict[str, str]] = None,
) -> httpx.Response:
"""
Make a PUT request.
Args:
url: Request URL
data: Form data
json: JSON data
content: Raw content
files: Files to upload
headers: Additional headers
Returns:
httpx.Response object
"""
return self._request(
"PUT",
url,
data=data,
json=json,
content=content,
files=files,
headers=headers,
)
def delete(
self,
url: str,
headers: Optional[Dict[str, str]] = None,
) -> httpx.Response:
"""
Make a DELETE request.
Args:
url: Request URL
headers: Additional headers
Returns:
httpx.Response object
"""
return self._request(
"DELETE",
url,
headers=headers,
)
def request(
self,
method: str,
url: str,
**kwargs
) -> httpx.Response:
"""
Make a generic HTTP request.
Args:
method: HTTP method
url: Request URL
**kwargs: Additional arguments
Returns:
httpx.Response object
"""
return self._request(method, url, **kwargs)
def download(
self,
url: str,
file_path: str,
chunk_size: int = 8192,
progress_callback: Optional[Callable[[int, int], None]] = None,
headers: Optional[Dict[str, str]] = None,
) -> Path:
"""
Download a file from URL with optional progress tracking.
Args:
url: File URL
file_path: Local file path to save to
chunk_size: Download chunk size
progress_callback: Callback(bytes_downloaded, total_bytes)
headers: Additional headers
Returns:
Path object of downloaded file
"""
path = Path(file_path)
path.parent.mkdir(parents=True, exist_ok=True)
with self._request_stream("GET", url, headers=headers, follow_redirects=True) as response:
response.raise_for_status()
total_bytes = int(response.headers.get("content-length", 0))
bytes_downloaded = 0
with open(path, "wb") as f:
for chunk in response.iter_bytes(chunk_size):
if chunk:
f.write(chunk)
bytes_downloaded += len(chunk)
if progress_callback:
progress_callback(bytes_downloaded, total_bytes)
return path
def _request(
self,
method: str,
url: str,
**kwargs
) -> httpx.Response:
"""
Make an HTTP request with automatic retries.
Args:
method: HTTP method
url: Request URL
**kwargs: Additional arguments for httpx.Client.request()
Returns:
httpx.Response object
"""
if not self._client:
raise RuntimeError("HTTPClient must be used with context manager (with statement)")
# Merge headers
if "headers" in kwargs and kwargs["headers"]:
headers = self._get_headers()
headers.update(kwargs["headers"])
kwargs["headers"] = headers
else:
kwargs["headers"] = self._get_headers()
last_exception = None
for attempt in range(self.retries):
try:
response = self._client.request(method, url, **kwargs)
response.raise_for_status()
return response
except httpx.TimeoutException as e:
last_exception = e
logger.warning(f"Timeout on attempt {attempt + 1}/{self.retries}: {url}")
if attempt < self.retries - 1:
continue
except httpx.HTTPStatusError as e:
# Don't retry on 4xx errors
if 400 <= e.response.status_code < 500:
try:
response_text = e.response.text[:500]
except:
response_text = "<unable to read response>"
logger.error(f"HTTP {e.response.status_code} from {url}: {response_text}")
raise
last_exception = e
try:
response_text = e.response.text[:200]
except:
response_text = "<unable to read response>"
logger.warning(f"HTTP {e.response.status_code} on attempt {attempt + 1}/{self.retries}: {url} - {response_text}")
if attempt < self.retries - 1:
continue
except (httpx.RequestError, httpx.ConnectError) as e:
last_exception = e
logger.warning(f"Connection error on attempt {attempt + 1}/{self.retries}: {url} - {e}")
if attempt < self.retries - 1:
continue
if last_exception:
logger.error(f"Request failed after {self.retries} attempts: {url} - {last_exception}")
raise last_exception
raise RuntimeError("Request failed after retries")
def _request_stream(self, method: str, url: str, **kwargs):
"""Make a streaming request."""
if not self._client:
raise RuntimeError("HTTPClient must be used with context manager (with statement)")
# Merge headers
if "headers" in kwargs and kwargs["headers"]:
headers = self._get_headers()
headers.update(kwargs["headers"])
kwargs["headers"] = headers
else:
kwargs["headers"] = self._get_headers()
return self._client.stream(method, url, **kwargs)
class AsyncHTTPClient:
"""Unified async HTTP client with asyncio support."""
def __init__(
self,
timeout: float = DEFAULT_TIMEOUT,
retries: int = DEFAULT_RETRIES,
user_agent: str = DEFAULT_USER_AGENT,
verify_ssl: bool = True,
headers: Optional[Dict[str, str]] = None,
):
"""
Initialize async HTTP client.
Args:
timeout: Request timeout in seconds
retries: Number of retries on transient failures
user_agent: User-Agent header value
verify_ssl: Whether to verify SSL certificates
headers: Additional headers to include in all requests
"""
self.timeout = timeout
self.retries = retries
self.user_agent = user_agent
self.verify_ssl = verify_ssl
self.base_headers = headers or {}
self._client: Optional[httpx.AsyncClient] = None
async def __aenter__(self):
"""Async context manager entry."""
self._client = httpx.AsyncClient(
timeout=self.timeout,
verify=self.verify_ssl,
headers=self._get_headers(),
)
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
"""Async context manager exit."""
if self._client:
await self._client.aclose()
self._client = None
def _get_headers(self) -> Dict[str, str]:
"""Get request headers with user-agent."""
headers = {"User-Agent": self.user_agent}
headers.update(self.base_headers)
return headers
async def get(
self,
url: str,
params: Optional[Dict[str, Any]] = None,
headers: Optional[Dict[str, str]] = None,
allow_redirects: bool = True,
) -> httpx.Response:
"""
Make an async GET request.
Args:
url: Request URL
params: Query parameters
headers: Additional headers
allow_redirects: Follow redirects
Returns:
httpx.Response object
"""
return await self._request(
"GET",
url,
params=params,
headers=headers,
follow_redirects=allow_redirects,
)
async def post(
self,
url: str,
data: Optional[Any] = None,
json: Optional[Dict] = None,
headers: Optional[Dict[str, str]] = None,
) -> httpx.Response:
"""
Make an async POST request.
Args:
url: Request URL
data: Form data
json: JSON data
headers: Additional headers
Returns:
httpx.Response object
"""
return await self._request(
"POST",
url,
data=data,
json=json,
headers=headers,
)
async def download(
self,
url: str,
file_path: str,
chunk_size: int = 8192,
progress_callback: Optional[Callable[[int, int], None]] = None,
headers: Optional[Dict[str, str]] = None,
) -> Path:
"""
Download a file from URL asynchronously with optional progress tracking.
Args:
url: File URL
file_path: Local file path to save to
chunk_size: Download chunk size
progress_callback: Callback(bytes_downloaded, total_bytes)
headers: Additional headers
Returns:
Path object of downloaded file
"""
path = Path(file_path)
path.parent.mkdir(parents=True, exist_ok=True)
async with self._request_stream("GET", url, headers=headers) as response:
response.raise_for_status()
total_bytes = int(response.headers.get("content-length", 0))
bytes_downloaded = 0
with open(path, "wb") as f:
async for chunk in response.aiter_bytes(chunk_size):
if chunk:
f.write(chunk)
bytes_downloaded += len(chunk)
if progress_callback:
progress_callback(bytes_downloaded, total_bytes)
return path
async def _request(
self,
method: str,
url: str,
**kwargs
) -> httpx.Response:
"""
Make an async HTTP request with automatic retries.
Args:
method: HTTP method
url: Request URL
**kwargs: Additional arguments for httpx.AsyncClient.request()
Returns:
httpx.Response object
"""
if not self._client:
raise RuntimeError("AsyncHTTPClient must be used with async context manager")
# Merge headers
if "headers" in kwargs and kwargs["headers"]:
headers = self._get_headers()
headers.update(kwargs["headers"])
kwargs["headers"] = headers
else:
kwargs["headers"] = self._get_headers()
last_exception = None
for attempt in range(self.retries):
try:
response = await self._client.request(method, url, **kwargs)
response.raise_for_status()
return response
except httpx.TimeoutException as e:
last_exception = e
logger.warning(f"Timeout on attempt {attempt + 1}/{self.retries}: {url}")
if attempt < self.retries - 1:
await asyncio.sleep(0.5) # Brief delay before retry
continue
except httpx.HTTPStatusError as e:
# Don't retry on 4xx errors
if 400 <= e.response.status_code < 500:
try:
response_text = e.response.text[:500]
except:
response_text = "<unable to read response>"
logger.error(f"HTTP {e.response.status_code} from {url}: {response_text}")
raise
last_exception = e
try:
response_text = e.response.text[:200]
except:
response_text = "<unable to read response>"
logger.warning(f"HTTP {e.response.status_code} on attempt {attempt + 1}/{self.retries}: {url} - {response_text}")
if attempt < self.retries - 1:
await asyncio.sleep(0.5)
continue
except (httpx.RequestError, httpx.ConnectError) as e:
last_exception = e
logger.warning(f"Connection error on attempt {attempt + 1}/{self.retries}: {url} - {e}")
if attempt < self.retries - 1:
await asyncio.sleep(0.5)
continue
if last_exception:
logger.error(f"Request failed after {self.retries} attempts: {url} - {last_exception}")
raise last_exception
raise RuntimeError("Request failed after retries")
def _request_stream(self, method: str, url: str, **kwargs):
"""Make a streaming request."""
if not self._client:
raise RuntimeError("AsyncHTTPClient must be used with async context manager")
# Merge headers
if "headers" in kwargs and kwargs["headers"]:
headers = self._get_headers()
headers.update(kwargs["headers"])
kwargs["headers"] = headers
else:
kwargs["headers"] = self._get_headers()
return self._client.stream(method, url, **kwargs)
# Convenience function for quick sync requests
def get(url: str, **kwargs) -> httpx.Response:
"""Quick GET request without context manager."""
with HTTPClient() as client:
return client.get(url, **kwargs)
def post(url: str, **kwargs) -> httpx.Response:
"""Quick POST request without context manager."""
with HTTPClient() as client:
return client.post(url, **kwargs)
def download(
url: str,
file_path: str,
progress_callback: Optional[Callable[[int, int], None]] = None,
**kwargs
) -> Path:
"""Quick file download without context manager."""
with HTTPClient() as client:
return client.download(url, file_path, progress_callback=progress_callback, **kwargs)