dfd
This commit is contained in:
@@ -584,10 +584,15 @@ def _download_direct_file(
|
||||
filename = filename.split("?")[0]
|
||||
|
||||
# Try to get real filename from Content-Disposition header (HEAD request)
|
||||
content_type = ""
|
||||
try:
|
||||
with HTTPClient(timeout=10.0) as client:
|
||||
response = client._request("HEAD", url, follow_redirects=True)
|
||||
content_disposition = response.headers.get("content-disposition", "")
|
||||
try:
|
||||
content_type = str(response.headers.get("content-type", "") or "").strip().lower()
|
||||
except Exception:
|
||||
content_type = ""
|
||||
if content_disposition:
|
||||
# Extract filename from Content-Disposition header
|
||||
# Format: attachment; filename="filename.pdf" or filename=filename.pdf
|
||||
@@ -620,9 +625,36 @@ def _download_direct_file(
|
||||
else:
|
||||
filename = suggested
|
||||
|
||||
# Final fallback if we still don't have a good filename
|
||||
if not filename or "." not in filename:
|
||||
filename = "downloaded_file.bin"
|
||||
# If we still don't have an extension, try to infer one from Content-Type.
|
||||
# Never fall back to a generic `.bin` extension.
|
||||
try:
|
||||
has_ext = bool(filename and Path(str(filename)).suffix)
|
||||
except Exception:
|
||||
has_ext = False
|
||||
|
||||
if filename and (not has_ext):
|
||||
ct = (content_type or "").split(";")[0].strip().lower()
|
||||
ext_by_ct = {
|
||||
"application/pdf": ".pdf",
|
||||
"application/epub+zip": ".epub",
|
||||
"application/x-mobipocket-ebook": ".mobi",
|
||||
"image/jpeg": ".jpg",
|
||||
"image/png": ".png",
|
||||
"image/webp": ".webp",
|
||||
"image/gif": ".gif",
|
||||
"text/plain": ".txt",
|
||||
"application/zip": ".zip",
|
||||
}
|
||||
|
||||
if ct in ext_by_ct:
|
||||
filename = f"{filename}{ext_by_ct[ct]}"
|
||||
elif ct.startswith("text/html"):
|
||||
# Guardrail: HTML landing pages should not be downloaded as opaque files.
|
||||
raise DownloadError("URL appears to be an HTML page, not a direct file")
|
||||
|
||||
# Final guardrail: if filename is empty, refuse rather than inventing a placeholder name such as `downloaded_file.bin`.
|
||||
if not filename or not str(filename).strip():
|
||||
raise DownloadError("Could not determine filename for URL (no Content-Disposition and no path filename)")
|
||||
|
||||
file_path = _unique_path(output_dir / filename)
|
||||
progress_bar = ProgressBar()
|
||||
@@ -684,9 +716,15 @@ def _download_direct_file(
|
||||
# For direct file downloads, create minimal info dict without filename as title
|
||||
# This prevents creating duplicate title: tags when filename gets auto-generated
|
||||
# We'll add title back later only if we couldn't extract meaningful tags
|
||||
ext = ""
|
||||
try:
|
||||
ext = Path(str(filename)).suffix.lstrip(".")
|
||||
except Exception:
|
||||
ext = ""
|
||||
|
||||
info = {
|
||||
"id": filename.rsplit(".", 1)[0],
|
||||
"ext": filename.rsplit(".", 1)[1] if "." in filename else "bin",
|
||||
"id": str(filename).rsplit(".", 1)[0] if "." in str(filename) else str(filename),
|
||||
"ext": ext,
|
||||
"webpage_url": url,
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user