"""
|
||
|
|
Unified HTTP client for downlow using httpx.
|
||
|
|
|
||
|
|
Provides synchronous and asynchronous HTTP operations with:
|
||
|
|
- Automatic retries on transient failures
|
||
|
|
- Configurable timeouts and headers
|
||
|
|
- Built-in progress tracking for downloads
|
||
|
|
- Request/response logging support
|
||
|
|
"""
|
||
|
|
|
||
|
|
import httpx
|
||
|
|
import asyncio
|
||
|
|
from typing import Optional, Dict, Any, Callable, BinaryIO
|
||
|
|
from pathlib import Path
|
||
|
|
import logging
|
||
|
|
|
||
|
|
logger = logging.getLogger(__name__)
|
||
|
|
|
||
|
|
# Default configuration
|
||
|
|
DEFAULT_TIMEOUT = 30.0
|
||
|
|
DEFAULT_RETRIES = 3
|
||
|
|
DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
||
|
|
|
||
|
|
|
||
|
|
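# Usage sketch (illustrative only; the URL below is a placeholder, not part of
# this project). Both clients must be used as context managers so the
# underlying httpx client is opened and closed properly:
#
#     with HTTPClient(timeout=10.0, retries=2) as client:
#         try:
#             resp = client.get("https://example.com/api/items", params={"page": 1})
#             items = resp.json()
#         except httpx.HTTPStatusError as exc:
#             # 4xx responses are raised immediately; 5xx and timeouts are retried first
#             print(f"Request failed: {exc.response.status_code}")

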
class HTTPClient:
    """Unified HTTP client with sync support."""

    def __init__(
        self,
        timeout: float = DEFAULT_TIMEOUT,
        retries: int = DEFAULT_RETRIES,
        user_agent: str = DEFAULT_USER_AGENT,
        verify_ssl: bool = True,
        headers: Optional[Dict[str, str]] = None,
    ):
        """
        Initialize HTTP client.

        Args:
            timeout: Request timeout in seconds
            retries: Number of retries on transient failures
            user_agent: User-Agent header value
            verify_ssl: Whether to verify SSL certificates
            headers: Additional headers to include in all requests
        """
        self.timeout = timeout
        self.retries = retries
        self.user_agent = user_agent
        self.verify_ssl = verify_ssl
        self.base_headers = headers or {}
        self._client: Optional[httpx.Client] = None

    def __enter__(self):
        """Context manager entry."""
        self._client = httpx.Client(
            timeout=self.timeout,
            verify=self.verify_ssl,
            headers=self._get_headers(),
        )
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""
        if self._client:
            self._client.close()
            self._client = None

    def _get_headers(self) -> Dict[str, str]:
        """Get request headers with user-agent."""
        headers = {"User-Agent": self.user_agent}
        headers.update(self.base_headers)
        return headers

    def get(
        self,
        url: str,
        params: Optional[Dict[str, Any]] = None,
        headers: Optional[Dict[str, str]] = None,
        allow_redirects: bool = True,
    ) -> httpx.Response:
        """
        Make a GET request.

        Args:
            url: Request URL
            params: Query parameters
            headers: Additional headers
            allow_redirects: Follow redirects

        Returns:
            httpx.Response object
        """
        return self._request(
            "GET",
            url,
            params=params,
            headers=headers,
            follow_redirects=allow_redirects,
        )

    def post(
        self,
        url: str,
        data: Optional[Any] = None,
        json: Optional[Dict] = None,
        files: Optional[Dict] = None,
        headers: Optional[Dict[str, str]] = None,
    ) -> httpx.Response:
        """
        Make a POST request.

        Args:
            url: Request URL
            data: Form data
            json: JSON data
            files: Files to upload
            headers: Additional headers

        Returns:
            httpx.Response object
        """
        return self._request(
            "POST",
            url,
            data=data,
            json=json,
            files=files,
            headers=headers,
        )

    def put(
        self,
        url: str,
        data: Optional[Any] = None,
        json: Optional[Dict] = None,
        content: Optional[Any] = None,
        files: Optional[Dict] = None,
        headers: Optional[Dict[str, str]] = None,
    ) -> httpx.Response:
        """
        Make a PUT request.

        Args:
            url: Request URL
            data: Form data
            json: JSON data
            content: Raw content
            files: Files to upload
            headers: Additional headers

        Returns:
            httpx.Response object
        """
        return self._request(
            "PUT",
            url,
            data=data,
            json=json,
            content=content,
            files=files,
            headers=headers,
        )

    def delete(
        self,
        url: str,
        headers: Optional[Dict[str, str]] = None,
    ) -> httpx.Response:
        """
        Make a DELETE request.

        Args:
            url: Request URL
            headers: Additional headers

        Returns:
            httpx.Response object
        """
        return self._request(
            "DELETE",
            url,
            headers=headers,
        )

    def request(
        self,
        method: str,
        url: str,
        **kwargs
    ) -> httpx.Response:
        """
        Make a generic HTTP request.

        Args:
            method: HTTP method
            url: Request URL
            **kwargs: Additional arguments

        Returns:
            httpx.Response object
        """
        return self._request(method, url, **kwargs)

    def download(
        self,
        url: str,
        file_path: str,
        chunk_size: int = 8192,
        progress_callback: Optional[Callable[[int, int], None]] = None,
        headers: Optional[Dict[str, str]] = None,
    ) -> Path:
        """
        Download a file from URL with optional progress tracking.

        Args:
            url: File URL
            file_path: Local file path to save to
            chunk_size: Download chunk size
            progress_callback: Callback(bytes_downloaded, total_bytes)
            headers: Additional headers

        Returns:
            Path object of downloaded file
        """
        path = Path(file_path)
        path.parent.mkdir(parents=True, exist_ok=True)

        with self._request_stream("GET", url, headers=headers, follow_redirects=True) as response:
            response.raise_for_status()
            total_bytes = int(response.headers.get("content-length", 0))
            bytes_downloaded = 0

            with open(path, "wb") as f:
                for chunk in response.iter_bytes(chunk_size):
                    if chunk:
                        f.write(chunk)
                        bytes_downloaded += len(chunk)
                        if progress_callback:
                            progress_callback(bytes_downloaded, total_bytes)

        return path

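    # Progress-callback sketch for download() (illustrative; names and the URL
    # are placeholders). The callback receives the bytes downloaded so far and
    # the total from Content-Length (0 when the server does not send it):
    #
    #     def show_progress(done: int, total: int) -> None:
    #         pct = (100 * done // total) if total else 0
    #         print(f"\rdownloaded {done}/{total} bytes ({pct}%)", end="")
    #
    #     with HTTPClient() as client:
    #         client.download("https://example.com/big.bin", "downloads/big.bin",
    #                         progress_callback=show_progress)
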
    def _request(
        self,
        method: str,
        url: str,
        **kwargs
    ) -> httpx.Response:
        """
        Make an HTTP request with automatic retries.

        Args:
            method: HTTP method
            url: Request URL
            **kwargs: Additional arguments for httpx.Client.request()

        Returns:
            httpx.Response object
        """
        if not self._client:
            raise RuntimeError("HTTPClient must be used with context manager (with statement)")

        # Merge headers
        if "headers" in kwargs and kwargs["headers"]:
            headers = self._get_headers()
            headers.update(kwargs["headers"])
            kwargs["headers"] = headers
        else:
            kwargs["headers"] = self._get_headers()

        last_exception = None

        for attempt in range(self.retries):
            try:
                response = self._client.request(method, url, **kwargs)
                response.raise_for_status()
                return response
            except httpx.TimeoutException as e:
                last_exception = e
                logger.warning(f"Timeout on attempt {attempt + 1}/{self.retries}: {url}")
                if attempt < self.retries - 1:
                    continue
            except httpx.HTTPStatusError as e:
                # Don't retry on 4xx errors
                if 400 <= e.response.status_code < 500:
                    try:
                        response_text = e.response.text[:500]
                    except Exception:
                        response_text = "<unable to read response>"
                    logger.error(f"HTTP {e.response.status_code} from {url}: {response_text}")
                    raise
                last_exception = e
                try:
                    response_text = e.response.text[:200]
                except Exception:
                    response_text = "<unable to read response>"
                logger.warning(f"HTTP {e.response.status_code} on attempt {attempt + 1}/{self.retries}: {url} - {response_text}")
                if attempt < self.retries - 1:
                    continue
            except (httpx.RequestError, httpx.ConnectError) as e:
                last_exception = e
                logger.warning(f"Connection error on attempt {attempt + 1}/{self.retries}: {url} - {e}")
                if attempt < self.retries - 1:
                    continue

        if last_exception:
            logger.error(f"Request failed after {self.retries} attempts: {url} - {last_exception}")
            raise last_exception

        raise RuntimeError("Request failed after retries")

    def _request_stream(self, method: str, url: str, **kwargs):
        """Make a streaming request."""
        if not self._client:
            raise RuntimeError("HTTPClient must be used with context manager (with statement)")

        # Merge headers
        if "headers" in kwargs and kwargs["headers"]:
            headers = self._get_headers()
            headers.update(kwargs["headers"])
            kwargs["headers"] = headers
        else:
            kwargs["headers"] = self._get_headers()

        return self._client.stream(method, url, **kwargs)


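# Header handling sketch (illustrative; the host, token, and request id are
# placeholders). Headers passed to the constructor are sent on every request
# and override the default User-Agent on key conflicts; per-request headers
# are merged on top of those defaults:
#
#     with HTTPClient(headers={"Authorization": "Bearer <token>"}) as client:
#         client.post("https://example.com/api/jobs", json={"name": "demo"},
#                     headers={"X-Request-Id": "abc123"})

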
class AsyncHTTPClient:
    """Unified async HTTP client with asyncio support."""

    def __init__(
        self,
        timeout: float = DEFAULT_TIMEOUT,
        retries: int = DEFAULT_RETRIES,
        user_agent: str = DEFAULT_USER_AGENT,
        verify_ssl: bool = True,
        headers: Optional[Dict[str, str]] = None,
    ):
        """
        Initialize async HTTP client.

        Args:
            timeout: Request timeout in seconds
            retries: Number of retries on transient failures
            user_agent: User-Agent header value
            verify_ssl: Whether to verify SSL certificates
            headers: Additional headers to include in all requests
        """
        self.timeout = timeout
        self.retries = retries
        self.user_agent = user_agent
        self.verify_ssl = verify_ssl
        self.base_headers = headers or {}
        self._client: Optional[httpx.AsyncClient] = None

    async def __aenter__(self):
        """Async context manager entry."""
        self._client = httpx.AsyncClient(
            timeout=self.timeout,
            verify=self.verify_ssl,
            headers=self._get_headers(),
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit."""
        if self._client:
            await self._client.aclose()
            self._client = None

    def _get_headers(self) -> Dict[str, str]:
        """Get request headers with user-agent."""
        headers = {"User-Agent": self.user_agent}
        headers.update(self.base_headers)
        return headers

    async def get(
        self,
        url: str,
        params: Optional[Dict[str, Any]] = None,
        headers: Optional[Dict[str, str]] = None,
        allow_redirects: bool = True,
    ) -> httpx.Response:
        """
        Make an async GET request.

        Args:
            url: Request URL
            params: Query parameters
            headers: Additional headers
            allow_redirects: Follow redirects

        Returns:
            httpx.Response object
        """
        return await self._request(
            "GET",
            url,
            params=params,
            headers=headers,
            follow_redirects=allow_redirects,
        )

    async def post(
        self,
        url: str,
        data: Optional[Any] = None,
        json: Optional[Dict] = None,
        headers: Optional[Dict[str, str]] = None,
    ) -> httpx.Response:
        """
        Make an async POST request.

        Args:
            url: Request URL
            data: Form data
            json: JSON data
            headers: Additional headers

        Returns:
            httpx.Response object
        """
        return await self._request(
            "POST",
            url,
            data=data,
            json=json,
            headers=headers,
        )

    async def download(
        self,
        url: str,
        file_path: str,
        chunk_size: int = 8192,
        progress_callback: Optional[Callable[[int, int], None]] = None,
        headers: Optional[Dict[str, str]] = None,
    ) -> Path:
        """
        Download a file from URL asynchronously with optional progress tracking.

        Args:
            url: File URL
            file_path: Local file path to save to
            chunk_size: Download chunk size
            progress_callback: Callback(bytes_downloaded, total_bytes)
            headers: Additional headers

        Returns:
            Path object of downloaded file
        """
        path = Path(file_path)
        path.parent.mkdir(parents=True, exist_ok=True)

        async with self._request_stream("GET", url, headers=headers) as response:
            response.raise_for_status()
            total_bytes = int(response.headers.get("content-length", 0))
            bytes_downloaded = 0

            with open(path, "wb") as f:
                async for chunk in response.aiter_bytes(chunk_size):
                    if chunk:
                        f.write(chunk)
                        bytes_downloaded += len(chunk)
                        if progress_callback:
                            progress_callback(bytes_downloaded, total_bytes)

        return path

    async def _request(
        self,
        method: str,
        url: str,
        **kwargs
    ) -> httpx.Response:
        """
        Make an async HTTP request with automatic retries.

        Args:
            method: HTTP method
            url: Request URL
            **kwargs: Additional arguments for httpx.AsyncClient.request()

        Returns:
            httpx.Response object
        """
        if not self._client:
            raise RuntimeError("AsyncHTTPClient must be used with async context manager")

        # Merge headers
        if "headers" in kwargs and kwargs["headers"]:
            headers = self._get_headers()
            headers.update(kwargs["headers"])
            kwargs["headers"] = headers
        else:
            kwargs["headers"] = self._get_headers()

        last_exception = None

        for attempt in range(self.retries):
            try:
                response = await self._client.request(method, url, **kwargs)
                response.raise_for_status()
                return response
            except httpx.TimeoutException as e:
                last_exception = e
                logger.warning(f"Timeout on attempt {attempt + 1}/{self.retries}: {url}")
                if attempt < self.retries - 1:
                    await asyncio.sleep(0.5)  # Brief delay before retry
                    continue
            except httpx.HTTPStatusError as e:
                # Don't retry on 4xx errors
                if 400 <= e.response.status_code < 500:
                    try:
                        response_text = e.response.text[:500]
                    except Exception:
                        response_text = "<unable to read response>"
                    logger.error(f"HTTP {e.response.status_code} from {url}: {response_text}")
                    raise
                last_exception = e
                try:
                    response_text = e.response.text[:200]
                except Exception:
                    response_text = "<unable to read response>"
                logger.warning(f"HTTP {e.response.status_code} on attempt {attempt + 1}/{self.retries}: {url} - {response_text}")
                if attempt < self.retries - 1:
                    await asyncio.sleep(0.5)
                    continue
            except (httpx.RequestError, httpx.ConnectError) as e:
                last_exception = e
                logger.warning(f"Connection error on attempt {attempt + 1}/{self.retries}: {url} - {e}")
                if attempt < self.retries - 1:
                    await asyncio.sleep(0.5)
                    continue

        if last_exception:
            logger.error(f"Request failed after {self.retries} attempts: {url} - {last_exception}")
            raise last_exception

        raise RuntimeError("Request failed after retries")

    def _request_stream(self, method: str, url: str, **kwargs):
        """Make a streaming request."""
        if not self._client:
            raise RuntimeError("AsyncHTTPClient must be used with async context manager")

        # Merge headers
        if "headers" in kwargs and kwargs["headers"]:
            headers = self._get_headers()
            headers.update(kwargs["headers"])
            kwargs["headers"] = headers
        else:
            kwargs["headers"] = self._get_headers()

        return self._client.stream(method, url, **kwargs)


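# Async usage sketch (illustrative; the URLs are placeholders). AsyncHTTPClient
# mirrors the sync API but must be used inside an async context manager:
#
#     async def main() -> None:
#         async with AsyncHTTPClient(retries=2) as client:
#             resp = await client.get("https://example.com/api/status")
#             await client.download("https://example.com/big.bin", "downloads/big.bin")
#
#     asyncio.run(main())

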
# Convenience functions for quick sync requests
def get(url: str, **kwargs) -> httpx.Response:
    """Quick GET request without context manager."""
    with HTTPClient() as client:
        return client.get(url, **kwargs)


def post(url: str, **kwargs) -> httpx.Response:
    """Quick POST request without context manager."""
    with HTTPClient() as client:
        return client.post(url, **kwargs)


def download(
    url: str,
    file_path: str,
    progress_callback: Optional[Callable[[int, int], None]] = None,
    **kwargs
) -> Path:
    """Quick file download without context manager."""
    with HTTPClient() as client:
        return client.download(url, file_path, progress_callback=progress_callback, **kwargs)
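

# One-shot helper sketch (illustrative; the URL is a placeholder). Each helper
# opens and closes a fresh HTTPClient per call, so there is no connection
# reuse; prefer a single HTTPClient context when making many requests:
#
#     resp = get("https://example.com/robots.txt")
#     print(resp.status_code, len(resp.content))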