""" Unified HTTP client for downlow using httpx. Provides synchronous and asynchronous HTTP operations with: - Automatic retries on transient failures - Configurable timeouts and headers - Built-in progress tracking for downloads - Request/response logging support """ import httpx import asyncio import sys import time import traceback import re from typing import Optional, Dict, Any, Callable, BinaryIO, List, Iterable, Set from pathlib import Path from urllib.parse import unquote, urlparse, parse_qs import logging from SYS.logger import debug, log from SYS.models import DebugLogger, DownloadError, DownloadMediaResult, ProgressBar from SYS.utils import ensure_directory, sha256_file try: # Optional; used for metadata extraction when available from SYS.metadata import extract_ytdlp_tags except Exception: # pragma: no cover - optional dependency extract_ytdlp_tags = None # type: ignore[assignment] logger = logging.getLogger(__name__) # Default configuration DEFAULT_TIMEOUT = 30.0 DEFAULT_RETRIES = 3 DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" class HTTPClient: """Unified HTTP client with sync support.""" def __init__( self, timeout: float = DEFAULT_TIMEOUT, retries: int = DEFAULT_RETRIES, user_agent: str = DEFAULT_USER_AGENT, verify_ssl: bool = True, headers: Optional[Dict[str, str]] = None, ): """ Initialize HTTP client. Args: timeout: Request timeout in seconds retries: Number of retries on transient failures user_agent: User-Agent header value verify_ssl: Whether to verify SSL certificates headers: Additional headers to include in all requests """ self.timeout = timeout self.retries = retries self.user_agent = user_agent self.verify_ssl = verify_ssl self.base_headers = headers or {} self._client: Optional[httpx.Client] = None def __enter__(self): """Context manager entry.""" self._client = httpx.Client( timeout=self.timeout, verify=self.verify_ssl, headers=self._get_headers(), ) return self def __exit__(self, exc_type, exc_val, exc_tb): """Context manager exit.""" if self._client: self._client.close() self._client = None def _get_headers(self) -> Dict[str, str]: """Get request headers with user-agent.""" headers = { "User-Agent": self.user_agent } headers.update(self.base_headers) return headers def get( self, url: str, params: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None, allow_redirects: bool = True, ) -> httpx.Response: """ Make a GET request. Args: url: Request URL params: Query parameters headers: Additional headers allow_redirects: Follow redirects Returns: httpx.Response object """ return self._request( "GET", url, params=params, headers=headers, follow_redirects=allow_redirects, ) def post( self, url: str, data: Optional[Any] = None, json: Optional[Dict] = None, files: Optional[Dict] = None, headers: Optional[Dict[str, str]] = None, ) -> httpx.Response: """ Make a POST request. Args: url: Request URL data: Form data json: JSON data files: Files to upload headers: Additional headers Returns: httpx.Response object """ return self._request( "POST", url, data=data, json=json, files=files, headers=headers, ) def put( self, url: str, data: Optional[Any] = None, json: Optional[Dict] = None, content: Optional[Any] = None, files: Optional[Dict] = None, headers: Optional[Dict[str, str]] = None, ) -> httpx.Response: """ Make a PUT request. Args: url: Request URL data: Form data json: JSON data content: Raw content files: Files to upload headers: Additional headers Returns: httpx.Response object """ return self._request( "PUT", url, data=data, json=json, content=content, files=files, headers=headers, ) def delete( self, url: str, headers: Optional[Dict[str, str]] = None, ) -> httpx.Response: """ Make a DELETE request. Args: url: Request URL headers: Additional headers Returns: httpx.Response object """ return self._request( "DELETE", url, headers=headers, ) def request(self, method: str, url: str, **kwargs) -> httpx.Response: """ Make a generic HTTP request. Args: method: HTTP method url: Request URL **kwargs: Additional arguments Returns: httpx.Response object """ return self._request(method, url, **kwargs) def download( self, url: str, file_path: str, chunk_size: int = 8192, progress_callback: Optional[Callable[[int, int], None]] = None, headers: Optional[Dict[str, str]] = None, ) -> Path: """ Download a file from URL with optional progress tracking. Args: url: File URL file_path: Local file path to save to chunk_size: Download chunk size progress_callback: Callback(bytes_downloaded, total_bytes) headers: Additional headers Returns: Path object of downloaded file """ path = Path(file_path) path.parent.mkdir(parents=True, exist_ok=True) with self._request_stream("GET", url, headers=headers, follow_redirects=True) as response: response.raise_for_status() total_bytes = int(response.headers.get("content-length", 0)) bytes_downloaded = 0 # Render progress immediately (even if the transfer is very fast) if progress_callback: try: progress_callback(0, total_bytes) except Exception: pass with open(path, "wb") as f: for chunk in response.iter_bytes(chunk_size): if chunk: f.write(chunk) bytes_downloaded += len(chunk) if progress_callback: progress_callback(bytes_downloaded, total_bytes) # Ensure a final callback is emitted. if progress_callback: try: progress_callback(bytes_downloaded, total_bytes) except Exception: pass return path def _request( self, method: str, url: str, raise_for_status: bool = True, log_http_errors: bool = True, **kwargs, ) -> httpx.Response: """ Make an HTTP request with automatic retries. Args: method: HTTP method url: Request URL **kwargs: Additional arguments for httpx.Client.request() Returns: httpx.Response object """ if not self._client: raise RuntimeError( "HTTPClient must be used with context manager (with statement)" ) # Merge headers if "headers" in kwargs and kwargs["headers"]: headers = self._get_headers() headers.update(kwargs["headers"]) kwargs["headers"] = headers else: kwargs["headers"] = self._get_headers() last_exception = None for attempt in range(self.retries): try: response = self._client.request(method, url, **kwargs) if raise_for_status: response.raise_for_status() return response except httpx.TimeoutException as e: last_exception = e logger.warning( f"Timeout on attempt {attempt + 1}/{self.retries}: {url}" ) if attempt < self.retries - 1: continue except httpx.HTTPStatusError as e: # Don't retry on 4xx errors if 400 <= e.response.status_code < 500: try: response_text = e.response.text[:500] except: response_text = "" if log_http_errors: logger.error( f"HTTP {e.response.status_code} from {url}: {response_text}" ) raise last_exception = e try: response_text = e.response.text[:200] except: response_text = "" logger.warning( f"HTTP {e.response.status_code} on attempt {attempt + 1}/{self.retries}: {url} - {response_text}" ) if attempt < self.retries - 1: continue except (httpx.RequestError, httpx.ConnectError) as e: last_exception = e logger.warning( f"Connection error on attempt {attempt + 1}/{self.retries}: {url} - {e}" ) if attempt < self.retries - 1: continue if last_exception: logger.error( f"Request failed after {self.retries} attempts: {url} - {last_exception}" ) raise last_exception raise RuntimeError("Request failed after retries") def _request_stream(self, method: str, url: str, **kwargs): """Make a streaming request.""" if not self._client: raise RuntimeError( "HTTPClient must be used with context manager (with statement)" ) # Merge headers if "headers" in kwargs and kwargs["headers"]: headers = self._get_headers() headers.update(kwargs["headers"]) kwargs["headers"] = headers else: kwargs["headers"] = self._get_headers() return self._client.stream(method, url, **kwargs) def download_direct_file( url: str, output_dir: Path, debug_logger: Optional[DebugLogger] = None, quiet: bool = False, suggested_filename: Optional[str] = None, pipeline_progress: Optional[Any] = None, ) -> DownloadMediaResult: """Download a direct file (PDF, image, document, etc.) with guardrails and metadata hooks.""" ensure_directory(output_dir) def _sanitize_filename(name: str) -> str: # Windows-safe filename sanitization. text = str(name or "").strip() if not text: return "" text = text.replace("/", "\\") text = text.split("\\")[-1] invalid = set('<>:"/\\|?*') cleaned_chars: List[str] = [] for ch in text: o = ord(ch) if o < 32 or ch in invalid: cleaned_chars.append(" ") continue cleaned_chars.append(ch) cleaned = " ".join("".join(cleaned_chars).split()).strip() cleaned = cleaned.rstrip(" .") return cleaned def _unique_path(path: Path) -> Path: if not path.exists(): return path stem = path.stem suffix = path.suffix parent = path.parent for i in range(1, 10_000): candidate = parent / f"{stem} ({i}){suffix}" if not candidate.exists(): return candidate return parent / f"{stem} ({int(time.time())}){suffix}" parsed_url = urlparse(url) url_path = parsed_url.path filename: Optional[str] = None if parsed_url.query: query_params = parse_qs(parsed_url.query) for param_name in ("filename", "download", "file", "name"): if param_name in query_params and query_params[param_name]: filename = query_params[param_name][0] filename = unquote(filename) break if not filename or not filename.strip(): filename = url_path.split("/")[-1] if url_path else "" filename = unquote(filename) if "?" in filename: filename = filename.split("?")[0] content_type = "" try: with HTTPClient(timeout=10.0) as client: response = client._request("HEAD", url, follow_redirects=True) content_disposition = response.headers.get("content-disposition", "") try: content_type = str(response.headers.get("content-type", "") or "").strip().lower() except Exception: content_type = "" if content_disposition: match = re.search(r'filename\*?=(?:"([^"]*)"|([^;\s]*))', content_disposition) if match: extracted_name = match.group(1) or match.group(2) if extracted_name: filename = unquote(extracted_name) if not quiet: debug(f"Filename from Content-Disposition: {filename}") except Exception as exc: if not quiet: log(f"Could not get filename from headers: {exc}", file=sys.stderr) try: page_like_exts = {".php", ".asp", ".aspx", ".jsp", ".cgi"} ext = "" try: ext = Path(str(filename or "")).suffix.lower() except Exception: ext = "" ct0 = (content_type or "").split(";", 1)[0].strip().lower() must_probe = bool(ct0.startswith("text/html") or ext in page_like_exts) if must_probe: with HTTPClient(timeout=10.0) as client: with client._request_stream("GET", url, follow_redirects=True) as resp: resp.raise_for_status() ct = ( str(resp.headers.get("content-type", "") or "") .split(";", 1)[0] .strip() .lower() ) if ct.startswith("text/html"): raise DownloadError("URL appears to be an HTML page, not a direct file") except DownloadError: raise except Exception: pass suggested = _sanitize_filename(suggested_filename) if suggested_filename else "" if suggested: suggested_path = Path(suggested) if suggested_path.suffix: filename = suggested else: detected_ext = "" try: detected_ext = Path(str(filename)).suffix except Exception: detected_ext = "" filename = suggested + detected_ext if detected_ext else suggested try: has_ext = bool(filename and Path(str(filename)).suffix) except Exception: has_ext = False if filename and (not has_ext): ct = (content_type or "").split(";", 1)[0].strip().lower() ext_by_ct = { "application/pdf": ".pdf", "application/epub+zip": ".epub", "application/x-mobipocket-ebook": ".mobi", "image/jpeg": ".jpg", "image/png": ".png", "image/webp": ".webp", "image/gif": ".gif", "text/plain": ".txt", "application/zip": ".zip", } if ct in ext_by_ct: filename = f"{filename}{ext_by_ct[ct]}" elif ct.startswith("text/html"): raise DownloadError("URL appears to be an HTML page, not a direct file") if not filename or not str(filename).strip(): raise DownloadError( "Could not determine filename for URL (no Content-Disposition and no path filename)" ) file_path = _unique_path(output_dir / str(filename)) use_pipeline_transfer = False try: if pipeline_progress is not None and hasattr(pipeline_progress, "update_transfer"): ui = None if hasattr(pipeline_progress, "ui_and_pipe_index"): ui, _ = pipeline_progress.ui_and_pipe_index() # type: ignore[attr-defined] use_pipeline_transfer = ui is not None except Exception: use_pipeline_transfer = False progress_bar: Optional[ProgressBar] = None if (not quiet) and (not use_pipeline_transfer): progress_bar = ProgressBar() transfer_started = [False] if not quiet: debug(f"Direct download: {filename}") try: start_time = time.time() downloaded_bytes = [0] transfer_started[0] = False def _maybe_begin_transfer(content_length: int) -> None: if pipeline_progress is None or transfer_started[0]: return try: total_val: Optional[int] = ( int(content_length) if isinstance(content_length, int) and content_length > 0 else None ) except Exception: total_val = None try: if hasattr(pipeline_progress, "begin_transfer"): pipeline_progress.begin_transfer( label=str(filename or "download"), total=total_val, ) transfer_started[0] = True except Exception: return def progress_callback(bytes_downloaded: int, content_length: int) -> None: downloaded_bytes[0] = int(bytes_downloaded or 0) try: if pipeline_progress is not None and hasattr(pipeline_progress, "update_transfer"): _maybe_begin_transfer(content_length) total_val: Optional[int] = ( int(content_length) if isinstance(content_length, int) and content_length > 0 else None ) pipeline_progress.update_transfer( label=str(filename or "download"), completed=int(bytes_downloaded or 0), total=total_val, ) except Exception: pass if progress_bar is not None: progress_bar.update( downloaded=int(bytes_downloaded or 0), total=int(content_length) if content_length and content_length > 0 else None, label=str(filename or "download"), file=sys.stderr, ) with HTTPClient(timeout=30.0) as client: client.download(url, str(file_path), progress_callback=progress_callback) elapsed = time.time() - start_time try: if progress_bar is not None: progress_bar.finish() except Exception: pass try: if pipeline_progress is not None and transfer_started[0] and hasattr( pipeline_progress, "finish_transfer" ): pipeline_progress.finish_transfer(label=str(filename or "download")) except Exception: pass if not quiet: debug(f"✓ Downloaded in {elapsed:.1f}s") ext_out = "" try: ext_out = Path(str(filename)).suffix.lstrip(".") except Exception: ext_out = "" info: Dict[str, Any] = { "id": str(filename).rsplit(".", 1)[0] if "." in str(filename) else str(filename), "ext": ext_out, "webpage_url": url, } hash_value = None try: hash_value = sha256_file(file_path) except Exception: pass tags: List[str] = [] if extract_ytdlp_tags: try: tags = extract_ytdlp_tags(info) except Exception as exc: log(f"Error extracting tags: {exc}", file=sys.stderr) if not any(str(t).startswith("title:") for t in tags): info["title"] = str(filename) tags = [] if extract_ytdlp_tags: try: tags = extract_ytdlp_tags(info) except Exception as exc: log(f"Error extracting tags with filename: {exc}", file=sys.stderr) if debug_logger is not None: debug_logger.write_record( "direct-file-downloaded", {"url": url, "path": str(file_path), "hash": hash_value}, ) return DownloadMediaResult( path=file_path, info=info, tag=tags, source_url=url, hash_value=hash_value, ) except (httpx.HTTPError, httpx.RequestError) as exc: try: if progress_bar is not None: progress_bar.finish() except Exception: pass try: if pipeline_progress is not None and transfer_started[0] and hasattr( pipeline_progress, "finish_transfer" ): pipeline_progress.finish_transfer(label=str(filename or "download")) except Exception: pass log(f"Download error: {exc}", file=sys.stderr) if debug_logger is not None: debug_logger.write_record( "exception", {"phase": "direct-file", "url": url, "error": str(exc)}, ) raise DownloadError(f"Failed to download {url}: {exc}") from exc except Exception as exc: try: if progress_bar is not None: progress_bar.finish() except Exception: pass try: if pipeline_progress is not None and transfer_started[0] and hasattr( pipeline_progress, "finish_transfer" ): pipeline_progress.finish_transfer(label=str(filename or "download")) except Exception: pass log(f"Error downloading file: {exc}", file=sys.stderr) if debug_logger is not None: debug_logger.write_record( "exception", { "phase": "direct-file", "url": url, "error": str(exc), "traceback": traceback.format_exc(), }, ) raise DownloadError(f"Error downloading file: {exc}") from exc # Back-compat alias _download_direct_file = download_direct_file class AsyncHTTPClient: """Unified async HTTP client with asyncio support.""" def __init__( self, timeout: float = DEFAULT_TIMEOUT, retries: int = DEFAULT_RETRIES, user_agent: str = DEFAULT_USER_AGENT, verify_ssl: bool = True, headers: Optional[Dict[str, str]] = None, ): """ Initialize async HTTP client. Args: timeout: Request timeout in seconds retries: Number of retries on transient failures user_agent: User-Agent header value verify_ssl: Whether to verify SSL certificates headers: Additional headers to include in all requests """ self.timeout = timeout self.retries = retries self.user_agent = user_agent self.verify_ssl = verify_ssl self.base_headers = headers or {} self._client: Optional[httpx.AsyncClient] = None async def __aenter__(self): """Async context manager entry.""" self._client = httpx.AsyncClient( timeout=self.timeout, verify=self.verify_ssl, headers=self._get_headers(), ) return self async def __aexit__(self, exc_type, exc_val, exc_tb): """Async context manager exit.""" if self._client: await self._client.aclose() self._client = None def _get_headers(self) -> Dict[str, str]: """Get request headers with user-agent.""" headers = { "User-Agent": self.user_agent } headers.update(self.base_headers) return headers async def get( self, url: str, params: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None, allow_redirects: bool = True, ) -> httpx.Response: """ Make an async GET request. Args: url: Request URL params: Query parameters headers: Additional headers allow_redirects: Follow redirects Returns: httpx.Response object """ return await self._request( "GET", url, params=params, headers=headers, follow_redirects=allow_redirects, ) async def post( self, url: str, data: Optional[Any] = None, json: Optional[Dict] = None, headers: Optional[Dict[str, str]] = None, ) -> httpx.Response: """ Make an async POST request. Args: url: Request URL data: Form data json: JSON data headers: Additional headers Returns: httpx.Response object """ return await self._request( "POST", url, data=data, json=json, headers=headers, ) async def download( self, url: str, file_path: str, chunk_size: int = 8192, progress_callback: Optional[Callable[[int, int], None]] = None, headers: Optional[Dict[str, str]] = None, ) -> Path: """ Download a file from URL asynchronously with optional progress tracking. Args: url: File URL file_path: Local file path to save to chunk_size: Download chunk size progress_callback: Callback(bytes_downloaded, total_bytes) headers: Additional headers Returns: Path object of downloaded file """ path = Path(file_path) path.parent.mkdir(parents=True, exist_ok=True) async with self._request_stream("GET", url, headers=headers) as response: response.raise_for_status() total_bytes = int(response.headers.get("content-length", 0)) bytes_downloaded = 0 with open(path, "wb") as f: async for chunk in response.aiter_bytes(chunk_size): if chunk: f.write(chunk) bytes_downloaded += len(chunk) if progress_callback: progress_callback(bytes_downloaded, total_bytes) return path async def _request(self, method: str, url: str, **kwargs) -> httpx.Response: """ Make an async HTTP request with automatic retries. Args: method: HTTP method url: Request URL **kwargs: Additional arguments for httpx.AsyncClient.request() Returns: httpx.Response object """ if not self._client: raise RuntimeError( "AsyncHTTPClient must be used with async context manager" ) # Merge headers if "headers" in kwargs and kwargs["headers"]: headers = self._get_headers() headers.update(kwargs["headers"]) kwargs["headers"] = headers else: kwargs["headers"] = self._get_headers() last_exception = None for attempt in range(self.retries): try: response = await self._client.request(method, url, **kwargs) response.raise_for_status() return response except httpx.TimeoutException as e: last_exception = e logger.warning( f"Timeout on attempt {attempt + 1}/{self.retries}: {url}" ) if attempt < self.retries - 1: await asyncio.sleep(0.5) # Brief delay before retry continue except httpx.HTTPStatusError as e: # Don't retry on 4xx errors if 400 <= e.response.status_code < 500: try: response_text = e.response.text[:500] except: response_text = "" logger.error( f"HTTP {e.response.status_code} from {url}: {response_text}" ) raise last_exception = e try: response_text = e.response.text[:200] except: response_text = "" logger.warning( f"HTTP {e.response.status_code} on attempt {attempt + 1}/{self.retries}: {url} - {response_text}" ) if attempt < self.retries - 1: await asyncio.sleep(0.5) continue except (httpx.RequestError, httpx.ConnectError) as e: last_exception = e logger.warning( f"Connection error on attempt {attempt + 1}/{self.retries}: {url} - {e}" ) if attempt < self.retries - 1: await asyncio.sleep(0.5) continue if last_exception: logger.error( f"Request failed after {self.retries} attempts: {url} - {last_exception}" ) raise last_exception raise RuntimeError("Request failed after retries") def _request_stream(self, method: str, url: str, **kwargs): """Make a streaming request.""" if not self._client: raise RuntimeError( "AsyncHTTPClient must be used with async context manager" ) # Merge headers if "headers" in kwargs and kwargs["headers"]: headers = self._get_headers() headers.update(kwargs["headers"]) kwargs["headers"] = headers else: kwargs["headers"] = self._get_headers() return self._client.stream(method, url, **kwargs) # Convenience function for quick sync requests def get(url: str, **kwargs) -> httpx.Response: """Quick GET request without context manager.""" with HTTPClient() as client: return client.get(url, **kwargs) def post(url: str, **kwargs) -> httpx.Response: """Quick POST request without context manager.""" with HTTPClient() as client: return client.post(url, **kwargs) def download( url: str, file_path: str, progress_callback: Optional[Callable[[int, int], None]] = None, **kwargs, ) -> Path: """Quick file download without context manager.""" with HTTPClient() as client: return client.download( url, file_path, progress_callback=progress_callback, **kwargs )