This commit is contained in:
nose
2025-12-22 02:11:53 -08:00
parent d0b821b5dd
commit 16316bb3fd
20 changed files with 4218 additions and 2422 deletions

View File

@@ -584,10 +584,15 @@ def _download_direct_file(
filename = filename.split("?")[0]
# Try to get real filename from Content-Disposition header (HEAD request)
content_type = ""
try:
with HTTPClient(timeout=10.0) as client:
response = client._request("HEAD", url, follow_redirects=True)
content_disposition = response.headers.get("content-disposition", "")
try:
content_type = str(response.headers.get("content-type", "") or "").strip().lower()
except Exception:
content_type = ""
if content_disposition:
# Extract filename from Content-Disposition header
# Format: attachment; filename="filename.pdf" or filename=filename.pdf
@@ -620,9 +625,36 @@ def _download_direct_file(
else:
filename = suggested
# Final fallback if we still don't have a good filename
if not filename or "." not in filename:
filename = "downloaded_file.bin"
# If we still don't have an extension, try to infer one from Content-Type.
# Never fall back to a generic `.bin` extension.
try:
has_ext = bool(filename and Path(str(filename)).suffix)
except Exception:
has_ext = False
if filename and (not has_ext):
ct = (content_type or "").split(";")[0].strip().lower()
ext_by_ct = {
"application/pdf": ".pdf",
"application/epub+zip": ".epub",
"application/x-mobipocket-ebook": ".mobi",
"image/jpeg": ".jpg",
"image/png": ".png",
"image/webp": ".webp",
"image/gif": ".gif",
"text/plain": ".txt",
"application/zip": ".zip",
}
if ct in ext_by_ct:
filename = f"{filename}{ext_by_ct[ct]}"
elif ct.startswith("text/html"):
# Guardrail: HTML landing pages should not be downloaded as opaque files.
raise DownloadError("URL appears to be an HTML page, not a direct file")
# Final guardrail: if filename is empty, refuse rather than inventing `downloaded_file.bin`.
if not filename or not str(filename).strip():
raise DownloadError("Could not determine filename for URL (no Content-Disposition and no path filename)")
file_path = _unique_path(output_dir / filename)
progress_bar = ProgressBar()
@@ -684,9 +716,15 @@ def _download_direct_file(
# For direct file downloads, create minimal info dict without filename as title
# This prevents creating duplicate title: tags when filename gets auto-generated
# We'll add title back later only if we couldn't extract meaningful tags
ext = ""
try:
ext = Path(str(filename)).suffix.lstrip(".")
except Exception:
ext = ""
info = {
"id": filename.rsplit(".", 1)[0],
"ext": filename.rsplit(".", 1)[1] if "." in filename else "bin",
"id": str(filename).rsplit(".", 1)[0] if "." in str(filename) else str(filename),
"ext": ext,
"webpage_url": url,
}