This commit is contained in:
nose
2025-12-05 03:42:57 -08:00
parent 5e4df11dbf
commit 5482ee5586
20 changed files with 911 additions and 223 deletions

View File

@@ -9,8 +9,8 @@ import logging
import re
import requests
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional
from urllib.parse import quote, urljoin
from typing import Any, Callable, Dict, List, Optional, Tuple
from urllib.parse import quote, urljoin, urlparse, unquote
# Optional dependencies
try:
@@ -405,6 +405,61 @@ def _resolve_download_url(
return None
def _guess_filename_extension(download_url: str, headers: Dict[str, str]) -> Optional[str]:
"""Guess the file extension from headers or the download URL."""
content_disposition = headers.get("content-disposition", "")
if content_disposition:
match = re.search(r'filename\*?=(?:UTF-8\'\'|"?)([^";]+)', content_disposition, flags=re.IGNORECASE)
if match:
filename = unquote(match.group(1).strip('"'))
suffix = Path(filename).suffix
if suffix:
return suffix.lstrip('.')
parsed = urlparse(download_url)
suffix = Path(parsed.path).suffix
if suffix:
return suffix.lstrip('.')
content_type = headers.get('content-type', '').lower()
mime_map = {
'application/pdf': 'pdf',
'application/epub+zip': 'epub',
'application/x-mobipocket-ebook': 'mobi',
'application/x-cbr': 'cbr',
'application/x-cbz': 'cbz',
'application/zip': 'zip',
}
for mime, ext in mime_map.items():
if mime in content_type:
return ext
return None
def _apply_extension(path: Path, extension: Optional[str]) -> Path:
"""Rename the path to match the detected extension, if needed."""
if not extension:
return path
suffix = extension if extension.startswith('.') else f'.{extension}'
if path.suffix.lower() == suffix.lower():
return path
candidate = path.with_suffix(suffix)
base_stem = path.stem
counter = 1
while candidate.exists() and counter < 100:
candidate = path.with_name(f"{base_stem}({counter}){suffix}")
counter += 1
try:
path.replace(candidate)
return candidate
except Exception:
return path
def download_from_mirror(
mirror_url: str,
output_path: Path,
@@ -412,8 +467,9 @@ def download_from_mirror(
log_info: LogFn = None,
log_error: ErrorFn = None,
session: Optional[requests.Session] = None,
) -> bool:
"""Download file from a LibGen mirror URL."""
progress_callback: Optional[Callable[[int, int], None]] = None,
) -> Tuple[bool, Optional[Path]]:
"""Download file from a LibGen mirror URL with optional progress tracking."""
session = session or requests.Session()
output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
@@ -425,33 +481,43 @@ def download_from_mirror(
if not download_url:
_call(log_error, "[download] Could not find direct download link")
return False
return False, None
_call(log_info, f"[download] Downloading from: {download_url}")
# Download the actual file
downloaded = 0
total_size = 0
headers: Dict[str, str] = {}
with session.get(download_url, stream=True, timeout=60) as r:
r.raise_for_status()
headers = dict(r.headers)
# Verify it's not HTML (error page)
ct = r.headers.get("content-type", "").lower()
ct = headers.get("content-type", "").lower()
if "text/html" in ct:
_call(log_error, "[download] Final URL returned HTML, not a file.")
return False
return False, None
total_size = int(r.headers.get("content-length", 0))
downloaded = 0
total_size = int(headers.get("content-length", 0) or 0)
with open(output_path, "wb") as f:
for chunk in r.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
downloaded += len(chunk)
# Optional: progress logging
_call(log_info, f"[download] Saved to {output_path}")
return True
if progress_callback:
progress_callback(downloaded, total_size)
final_extension = _guess_filename_extension(download_url, headers)
final_path = _apply_extension(output_path, final_extension)
if progress_callback and total_size > 0:
progress_callback(downloaded, total_size)
_call(log_info, f"[download] Saved to {final_path}")
return True, final_path
except Exception as e:
_call(log_error, f"[download] Download failed: {e}")
return False
return False, None