df
This commit is contained in:
369
API/HTTP.py
369
API/HTTP.py
@@ -10,10 +10,24 @@ Provides synchronous and asynchronous HTTP operations with:
|
||||
|
||||
import httpx
|
||||
import asyncio
|
||||
from typing import Optional, Dict, Any, Callable, BinaryIO
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
import re
|
||||
from typing import Optional, Dict, Any, Callable, BinaryIO, List, Iterable, Set
|
||||
from pathlib import Path
|
||||
from urllib.parse import unquote, urlparse, parse_qs
|
||||
import logging
|
||||
|
||||
from SYS.logger import debug, log
|
||||
from SYS.models import DebugLogger, DownloadError, DownloadMediaResult, ProgressBar
|
||||
from SYS.utils import ensure_directory, sha256_file
|
||||
|
||||
try: # Optional; used for metadata extraction when available
|
||||
from SYS.metadata import extract_ytdlp_tags
|
||||
except Exception: # pragma: no cover - optional dependency
|
||||
extract_ytdlp_tags = None # type: ignore[assignment]
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Default configuration
|
||||
@@ -366,6 +380,359 @@ class HTTPClient:
|
||||
return self._client.stream(method, url, **kwargs)
|
||||
|
||||
|
||||
def download_direct_file(
    url: str,
    output_dir: Path,
    debug_logger: Optional[DebugLogger] = None,
    quiet: bool = False,
    suggested_filename: Optional[str] = None,
    pipeline_progress: Optional[Any] = None,
) -> DownloadMediaResult:
    """Download a direct file (PDF, image, document, etc.) with guardrails and metadata hooks.

    Filename resolution order: query-string hints -> URL path segment ->
    Content-Disposition header -> caller's ``suggested_filename`` -> extension
    inferred from Content-Type. HTML pages are rejected with ``DownloadError``.

    Args:
        url: Direct URL to the file.
        output_dir: Destination directory (created if missing).
        debug_logger: Optional structured logger; receives a
            ``direct-file-downloaded`` or ``exception`` record.
        quiet: Suppress progress output and diagnostics when True.
        suggested_filename: Caller-preferred name; inherits the detected
            extension when it has none.
        pipeline_progress: Optional adapter exposing ``begin_transfer`` /
            ``update_transfer`` / ``finish_transfer`` (duck-typed).

    Returns:
        DownloadMediaResult with the saved path, yt-dlp-style info dict,
        extracted tags, source URL and SHA-256 hash.

    Raises:
        DownloadError: URL is an HTML page, no filename could be determined,
            or the transfer failed.
    """
    ensure_directory(output_dir)

    def _sanitize_filename(name: str) -> str:
        """Windows-safe sanitization: keep last path component, strip control/reserved chars."""
        text = str(name or "").strip()
        if not text:
            return ""
        # Treat both separators alike and keep only the final component.
        text = text.replace("/", "\\")
        text = text.split("\\")[-1]

        invalid = set('<>:"/\\|?*')
        cleaned_chars: List[str] = []
        for ch in text:
            if ord(ch) < 32 or ch in invalid:
                cleaned_chars.append(" ")
            else:
                cleaned_chars.append(ch)
        # Collapse whitespace runs; trailing dots/spaces are illegal on Windows.
        cleaned = " ".join("".join(cleaned_chars).split()).strip()
        return cleaned.rstrip(" .")

    def _unique_path(path: Path) -> Path:
        """Return *path*, or a ``name (N).ext`` variant that does not exist yet."""
        if not path.exists():
            return path
        stem, suffix, parent = path.stem, path.suffix, path.parent
        for i in range(1, 10_000):
            candidate = parent / f"{stem} ({i}){suffix}"
            if not candidate.exists():
                return candidate
        # Pathological fallback: a timestamp keeps the name unique.
        return parent / f"{stem} ({int(time.time())}){suffix}"

    def _positive_total(content_length: Any) -> Optional[int]:
        """Coerce a Content-Length-ish value to a positive int, else None."""
        try:
            if isinstance(content_length, int) and content_length > 0:
                return int(content_length)
        except Exception:
            pass
        return None

    parsed_url = urlparse(url)
    url_path = parsed_url.path

    # 1) Explicit filename hints in the query string win.
    filename: Optional[str] = None
    if parsed_url.query:
        query_params = parse_qs(parsed_url.query)
        for param_name in ("filename", "download", "file", "name"):
            if param_name in query_params and query_params[param_name]:
                filename = unquote(query_params[param_name][0])
                break

    # 2) Fall back to the last URL path segment.
    if not filename or not filename.strip():
        filename = unquote(url_path.split("/")[-1] if url_path else "")

    if "?" in filename:
        filename = filename.split("?")[0]

    # 3) HEAD probe: Content-Disposition may carry the authoritative name and
    #    Content-Type drives the HTML-page guardrail below. Best-effort only.
    content_type = ""
    try:
        with HTTPClient(timeout=10.0) as client:
            response = client._request("HEAD", url, follow_redirects=True)
            content_disposition = response.headers.get("content-disposition", "")
            try:
                content_type = str(response.headers.get("content-type", "") or "").strip().lower()
            except Exception:
                content_type = ""

            if content_disposition:
                match = re.search(r'filename\*?=(?:"([^"]*)"|([^;\s]*))', content_disposition)
                if match:
                    extracted_name = match.group(1) or match.group(2)
                    if extracted_name:
                        extracted_name = unquote(extracted_name)
                        # RFC 5987 filename*= values carry a "charset''" prefix — drop it.
                        if "''" in extracted_name:
                            extracted_name = extracted_name.split("''", 1)[1]
                        filename = extracted_name
                        if not quiet:
                            debug(f"Filename from Content-Disposition: {filename}")
    except Exception as exc:
        if not quiet:
            log(f"Could not get filename from headers: {exc}", file=sys.stderr)

    # Guardrail: refuse obvious HTML pages. Only spend a streamed GET when the
    # HEAD Content-Type or the extension looks page-like.
    try:
        page_like_exts = {".php", ".asp", ".aspx", ".jsp", ".cgi"}
        try:
            ext = Path(str(filename or "")).suffix.lower()
        except Exception:
            ext = ""

        ct0 = (content_type or "").split(";", 1)[0].strip().lower()
        if ct0.startswith("text/html") or ext in page_like_exts:
            with HTTPClient(timeout=10.0) as client:
                with client._request_stream("GET", url, follow_redirects=True) as resp:
                    resp.raise_for_status()
                    ct = (
                        str(resp.headers.get("content-type", "") or "")
                        .split(";", 1)[0]
                        .strip()
                        .lower()
                    )
                    if ct.startswith("text/html"):
                        raise DownloadError("URL appears to be an HTML page, not a direct file")
    except DownloadError:
        raise
    except Exception:
        # Probe failures must not block the actual download attempt.
        pass

    # 4) Caller-provided name overrides, inheriting the detected extension if bare.
    suggested = _sanitize_filename(suggested_filename) if suggested_filename else ""
    if suggested:
        if Path(suggested).suffix:
            filename = suggested
        else:
            try:
                detected_ext = Path(str(filename)).suffix
            except Exception:
                detected_ext = ""
            filename = suggested + detected_ext if detected_ext else suggested

    # 5) Still extensionless: infer one from the Content-Type.
    try:
        has_ext = bool(filename and Path(str(filename)).suffix)
    except Exception:
        has_ext = False

    if filename and not has_ext:
        ct = (content_type or "").split(";", 1)[0].strip().lower()
        ext_by_ct = {
            "application/pdf": ".pdf",
            "application/epub+zip": ".epub",
            "application/x-mobipocket-ebook": ".mobi",
            "image/jpeg": ".jpg",
            "image/png": ".png",
            "image/webp": ".webp",
            "image/gif": ".gif",
            "text/plain": ".txt",
            "application/zip": ".zip",
        }
        if ct in ext_by_ct:
            # NOTE(review): source text was garbled here; appending the mapped
            # extension to the resolved name is the evident intent — confirm.
            filename = f"{filename}{ext_by_ct[ct]}"
        elif ct.startswith("text/html"):
            raise DownloadError("URL appears to be an HTML page, not a direct file")

    if not filename or not str(filename).strip():
        raise DownloadError(
            "Could not determine filename for URL (no Content-Disposition and no path filename)"
        )

    file_path = _unique_path(output_dir / str(filename))

    # Prefer the pipeline's transfer UI over a local progress bar when present.
    use_pipeline_transfer = False
    try:
        if pipeline_progress is not None and hasattr(pipeline_progress, "update_transfer"):
            ui = None
            if hasattr(pipeline_progress, "ui_and_pipe_index"):
                ui, _ = pipeline_progress.ui_and_pipe_index()  # type: ignore[attr-defined]
            use_pipeline_transfer = ui is not None
    except Exception:
        use_pipeline_transfer = False

    progress_bar: Optional[ProgressBar] = None
    if not quiet and not use_pipeline_transfer:
        progress_bar = ProgressBar()

    transfer_started = [False]

    def _finalize_progress() -> None:
        """Close the local progress bar and any open pipeline transfer row."""
        try:
            if progress_bar is not None:
                progress_bar.finish()
        except Exception:
            pass
        try:
            if (
                pipeline_progress is not None
                and transfer_started[0]
                and hasattr(pipeline_progress, "finish_transfer")
            ):
                pipeline_progress.finish_transfer(label=str(filename or "download"))
        except Exception:
            pass

    if not quiet:
        debug(f"Direct download: {url}")

    try:
        start_time = time.time()
        downloaded_bytes = [0]
        transfer_started[0] = False

        def _maybe_begin_transfer(content_length: int) -> None:
            # Lazily open the pipeline transfer row on the first progress tick.
            if pipeline_progress is None or transfer_started[0]:
                return
            try:
                if hasattr(pipeline_progress, "begin_transfer"):
                    pipeline_progress.begin_transfer(
                        label=str(filename or "download"),
                        total=_positive_total(content_length),
                    )
                    transfer_started[0] = True
            except Exception:
                return

        def progress_callback(bytes_downloaded: int, content_length: int) -> None:
            downloaded_bytes[0] = int(bytes_downloaded or 0)

            try:
                if pipeline_progress is not None and hasattr(pipeline_progress, "update_transfer"):
                    _maybe_begin_transfer(content_length)
                    pipeline_progress.update_transfer(
                        label=str(filename or "download"),
                        completed=int(bytes_downloaded or 0),
                        total=_positive_total(content_length),
                    )
            except Exception:
                pass

            if progress_bar is not None:
                progress_bar.update(
                    downloaded=int(bytes_downloaded or 0),
                    total=int(content_length) if content_length and content_length > 0 else None,
                    label=str(filename or "download"),
                    file=sys.stderr,
                )

        with HTTPClient(timeout=30.0) as client:
            client.download(url, str(file_path), progress_callback=progress_callback)

        elapsed = time.time() - start_time
        _finalize_progress()

        if not quiet:
            debug(f"✓ Downloaded in {elapsed:.1f}s")

        try:
            ext_out = Path(str(filename)).suffix.lstrip(".")
        except Exception:
            ext_out = ""

        info: Dict[str, Any] = {
            "id": str(filename).rsplit(".", 1)[0] if "." in str(filename) else str(filename),
            "ext": ext_out,
            "webpage_url": url,
        }

        hash_value = None
        try:
            hash_value = sha256_file(file_path)
        except Exception:
            pass

        tags: List[str] = []
        if extract_ytdlp_tags:
            try:
                tags = extract_ytdlp_tags(info)
            except Exception as exc:
                log(f"Error extracting tags: {exc}", file=sys.stderr)

        # No title tag produced: retry extraction with the filename as title.
        if not any(str(t).startswith("title:") for t in tags):
            info["title"] = str(filename)
            tags = []
            if extract_ytdlp_tags:
                try:
                    tags = extract_ytdlp_tags(info)
                except Exception as exc:
                    log(f"Error extracting tags with filename: {exc}", file=sys.stderr)

        if debug_logger is not None:
            debug_logger.write_record(
                "direct-file-downloaded",
                {"url": url, "path": str(file_path), "hash": hash_value},
            )

        return DownloadMediaResult(
            path=file_path,
            info=info,
            tag=tags,
            source_url=url,
            hash_value=hash_value,
        )

    except httpx.HTTPError as exc:
        # httpx.RequestError subclasses httpx.HTTPError, so one clause covers both.
        _finalize_progress()
        log(f"Download error: {exc}", file=sys.stderr)
        if debug_logger is not None:
            debug_logger.write_record(
                "exception",
                {"phase": "direct-file", "url": url, "error": str(exc)},
            )
        raise DownloadError(f"Failed to download {url}: {exc}") from exc

    except Exception as exc:
        _finalize_progress()
        log(f"Error downloading file: {exc}", file=sys.stderr)
        if debug_logger is not None:
            debug_logger.write_record(
                "exception",
                {
                    "phase": "direct-file",
                    "url": url,
                    "error": str(exc),
                    "traceback": traceback.format_exc(),
                },
            )
        raise DownloadError(f"Error downloading file: {exc}") from exc


# Back-compat alias
_download_direct_file = download_direct_file
|
||||
|
||||
|
||||
class AsyncHTTPClient:
|
||||
"""Unified async HTTP client with asyncio support."""
|
||||
|
||||
|
||||
Reference in New Issue
Block a user