AST
This commit is contained in:
706
helper/unified_book_downloader.py
Normal file
706
helper/unified_book_downloader.py
Normal file
@@ -0,0 +1,706 @@
|
||||
"""Unified book downloader - handles Archive.org borrowing and Libgen fallback.
|
||||
|
||||
This module provides a single interface for downloading books from multiple sources:
|
||||
1. Try Archive.org direct download (if available)
|
||||
2. Try Archive.org borrowing (if user has credentials)
|
||||
3. Fallback to Libgen search by ISBN
|
||||
4. Attempt Libgen download
|
||||
|
||||
All sources integrated with proper metadata scraping and error handling.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import asyncio
|
||||
import requests
|
||||
from typing import Optional, Dict, Any, Tuple, List, Callable, cast
|
||||
from pathlib import Path
|
||||
|
||||
from helper.logger import debug
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class UnifiedBookDownloader:
|
||||
"""Unified interface for downloading books from multiple sources."""
|
||||
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None, output_dir: Optional[str] = None):
|
||||
"""Initialize the unified book downloader.
|
||||
|
||||
Args:
|
||||
config: Configuration dict with credentials
|
||||
output_dir: Default output directory
|
||||
"""
|
||||
self.config = config or {}
|
||||
self.output_dir = output_dir
|
||||
self.session = requests.Session()
|
||||
|
||||
# Import download functions from their modules
|
||||
self._init_downloaders()
|
||||
|
||||
def _init_downloaders(self) -> None:
|
||||
"""Initialize downloader functions from their modules."""
|
||||
try:
|
||||
from helper.archive_client import (
|
||||
check_direct_download,
|
||||
get_openlibrary_by_isbn,
|
||||
loan
|
||||
)
|
||||
self.check_direct_download = check_direct_download
|
||||
self.get_openlibrary_by_isbn = get_openlibrary_by_isbn
|
||||
self.loan_func = loan
|
||||
logger.debug("[UnifiedBookDownloader] Loaded archive.org downloaders from archive_client")
|
||||
except Exception as e:
|
||||
logger.warning(f"[UnifiedBookDownloader] Failed to load archive.org functions: {e}")
|
||||
self.check_direct_download = None
|
||||
self.get_openlibrary_by_isbn = None
|
||||
self.loan_func = None
|
||||
|
||||
try:
|
||||
from helper.libgen_service import (
|
||||
DEFAULT_LIMIT as _LIBGEN_DEFAULT_LIMIT,
|
||||
download_from_mirror as _libgen_download,
|
||||
search_libgen as _libgen_search,
|
||||
)
|
||||
|
||||
def _log_info(message: str) -> None:
|
||||
debug(f"[UnifiedBookDownloader] {message}")
|
||||
|
||||
def _log_error(message: str) -> None:
|
||||
logger.error(f"[UnifiedBookDownloader] {message}")
|
||||
|
||||
self.search_libgen = lambda query, limit=_LIBGEN_DEFAULT_LIMIT: _libgen_search(
|
||||
query,
|
||||
limit=limit,
|
||||
log_info=_log_info,
|
||||
log_error=_log_error,
|
||||
)
|
||||
self.download_from_mirror = lambda mirror_url, output_path: _libgen_download(
|
||||
mirror_url,
|
||||
output_path,
|
||||
log_info=_log_info,
|
||||
log_error=_log_error,
|
||||
)
|
||||
logger.debug("[UnifiedBookDownloader] Loaded Libgen helpers")
|
||||
except Exception as e:
|
||||
logger.warning(f"[UnifiedBookDownloader] Failed to load Libgen helpers: {e}")
|
||||
self.search_libgen = None
|
||||
self.download_from_mirror = None
|
||||
|
||||
def get_download_options(self, book_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Get all available download options for a book.
|
||||
|
||||
Checks in priority order:
|
||||
1. Archive.org direct download (public domain)
|
||||
2. Archive.org borrowing (if credentials available and book is borrowable)
|
||||
3. Libgen fallback (by ISBN)
|
||||
|
||||
Args:
|
||||
book_data: Book metadata dict with at least 'openlibrary_id' or 'isbn'
|
||||
|
||||
Returns:
|
||||
Dict with available download methods and metadata
|
||||
"""
|
||||
options = {
|
||||
'book_title': book_data.get('title', 'Unknown'),
|
||||
'book_author': book_data.get('author', 'Unknown'),
|
||||
'isbn': book_data.get('isbn', ''),
|
||||
'openlibrary_id': book_data.get('openlibrary_id', ''),
|
||||
'methods': [], # Will be sorted by priority
|
||||
'metadata': {}
|
||||
}
|
||||
|
||||
# Extract book ID from openlibrary_id (e.g., OL8513721M -> 8513721, OL8513721W -> 8513721)
|
||||
ol_id = book_data.get('openlibrary_id', '')
|
||||
book_id = None
|
||||
|
||||
if ol_id.startswith('OL') and len(ol_id) > 2:
|
||||
# Remove 'OL' prefix (keep everything after it including the suffix letter)
|
||||
# The book_id is all digits after 'OL'
|
||||
book_id = ''.join(c for c in ol_id[2:] if c.isdigit())
|
||||
|
||||
# PRIORITY 1: Check direct download (fastest, no auth needed)
|
||||
if self.check_direct_download:
|
||||
try:
|
||||
can_download, pdf_url = self.check_direct_download(book_id)
|
||||
if can_download:
|
||||
options['methods'].append({
|
||||
'type': 'archive.org_direct',
|
||||
'label': 'Archive.org Direct Download',
|
||||
'requires_auth': False,
|
||||
'pdf_url': pdf_url,
|
||||
'book_id': book_id,
|
||||
'priority': 1 # Highest priority
|
||||
})
|
||||
logger.info(f"[UnifiedBookDownloader] Direct download available for {book_id}")
|
||||
except Exception as e:
|
||||
logger.debug(f"[UnifiedBookDownloader] Direct download check failed: {e}")
|
||||
|
||||
# PRIORITY 2: Check borrowing option (requires auth, 14-day loan)
|
||||
# First verify the book is actually lendable via OpenLibrary API
|
||||
if self._has_archive_credentials():
|
||||
is_lendable, status = self._check_book_lendable_status(ol_id)
|
||||
|
||||
if is_lendable:
|
||||
options['methods'].append({
|
||||
'type': 'archive.org_borrow',
|
||||
'label': 'Archive.org Borrow',
|
||||
'requires_auth': True,
|
||||
'book_id': book_id,
|
||||
'priority': 2 # Second priority
|
||||
})
|
||||
logger.info(f"[UnifiedBookDownloader] Borrow option available for {book_id} (status: {status})")
|
||||
else:
|
||||
logger.debug(f"[UnifiedBookDownloader] Borrow not available for {book_id} (status: {status})")
|
||||
|
||||
# PRIORITY 3: Check Libgen fallback (by ISBN, no auth needed, most reliable)
|
||||
isbn = book_data.get('isbn', '')
|
||||
title = book_data.get('title', '')
|
||||
author = book_data.get('author', '')
|
||||
|
||||
if self.search_libgen:
|
||||
# Can use Libgen if we have ISBN OR title (or both)
|
||||
if isbn or title:
|
||||
options['methods'].append({
|
||||
'type': 'libgen',
|
||||
'label': 'Libgen Search & Download',
|
||||
'requires_auth': False,
|
||||
'isbn': isbn,
|
||||
'title': title,
|
||||
'author': author,
|
||||
'priority': 3 # Third priority (fallback)
|
||||
})
|
||||
logger.info(f"[UnifiedBookDownloader] Libgen fallback available (ISBN: {isbn if isbn else 'N/A'}, Title: {title})")
|
||||
|
||||
# Sort by priority (higher priority first)
|
||||
options['methods'].sort(key=lambda x: x.get('priority', 999))
|
||||
|
||||
return options
|
||||
|
||||
def _has_archive_credentials(self) -> bool:
|
||||
"""Check if Archive.org credentials are available."""
|
||||
try:
|
||||
from helper.archive_client import credential_openlibrary
|
||||
email, password = credential_openlibrary(self.config)
|
||||
return bool(email and password)
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _check_book_lendable_status(self, ol_id: str) -> Tuple[bool, Optional[str]]:
|
||||
"""Check if a book is lendable via OpenLibrary API.
|
||||
|
||||
Queries: https://openlibrary.org/api/volumes/brief/json/OLID:{ol_id}
|
||||
Note: Only works with Edition IDs (OL...M), not Work IDs (OL...W)
|
||||
|
||||
Args:
|
||||
ol_id: OpenLibrary ID (e.g., OL8513721M for Edition or OL4801915W for Work)
|
||||
|
||||
Returns:
|
||||
Tuple of (is_lendable: bool, status_reason: Optional[str])
|
||||
"""
|
||||
try:
|
||||
if not ol_id.startswith('OL'):
|
||||
return False, "Invalid OpenLibrary ID format"
|
||||
|
||||
# If this is a Work ID (ends with W), we can't query Volumes API
|
||||
# Work IDs are abstract umbrella records, not specific editions
|
||||
if ol_id.endswith('W'):
|
||||
logger.debug(f"[UnifiedBookDownloader] Work ID {ol_id} - skipping Volumes API (not lendable)")
|
||||
return False, "Work ID not supported by Volumes API (not a specific edition)"
|
||||
|
||||
# If it ends with M, it's an Edition ID - proceed with query
|
||||
if not ol_id.endswith('M'):
|
||||
logger.debug(f"[UnifiedBookDownloader] Unknown ID type {ol_id} (not M or W)")
|
||||
return False, "Invalid OpenLibrary ID type"
|
||||
|
||||
url = f"https://openlibrary.org/api/volumes/brief/json/OLID:{ol_id}"
|
||||
response = self.session.get(url, timeout=10)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
# Empty response means no records found
|
||||
if not data:
|
||||
logger.debug(f"[UnifiedBookDownloader] Empty response for {ol_id}")
|
||||
return False, "No availability data found"
|
||||
|
||||
# The response is wrapped in OLID key
|
||||
olid_key = f"OLID:{ol_id}"
|
||||
if olid_key not in data:
|
||||
logger.debug(f"[UnifiedBookDownloader] OLID key not found in response")
|
||||
return False, "No availability data found"
|
||||
|
||||
olid_data = data[olid_key]
|
||||
|
||||
# Check items array for lendable status
|
||||
if 'items' in olid_data and olid_data['items'] and len(olid_data['items']) > 0:
|
||||
items = olid_data['items']
|
||||
|
||||
# Check the first item for lending status
|
||||
first_item = items[0]
|
||||
|
||||
# Handle both dict and string representations (PowerShell converts to string)
|
||||
if isinstance(first_item, dict):
|
||||
status = first_item.get('status', '')
|
||||
else:
|
||||
# String representation - check if 'lendable' is in it
|
||||
status = str(first_item).lower()
|
||||
|
||||
is_lendable = 'lendable' in str(status).lower()
|
||||
|
||||
if is_lendable:
|
||||
logger.info(f"[UnifiedBookDownloader] Book {ol_id} is lendable")
|
||||
return True, "LENDABLE"
|
||||
else:
|
||||
status_str = status.get('status', 'NOT_LENDABLE') if isinstance(status, dict) else 'NOT_LENDABLE'
|
||||
logger.debug(f"[UnifiedBookDownloader] Book {ol_id} is not lendable (status: {status_str})")
|
||||
return False, status_str
|
||||
else:
|
||||
# No items array or empty
|
||||
logger.debug(f"[UnifiedBookDownloader] No items found for {ol_id}")
|
||||
return False, "Not available for lending"
|
||||
|
||||
except requests.exceptions.Timeout:
|
||||
logger.warning(f"[UnifiedBookDownloader] OpenLibrary API timeout for {ol_id}")
|
||||
return False, "API timeout"
|
||||
except Exception as e:
|
||||
logger.debug(f"[UnifiedBookDownloader] Failed to check lendable status for {ol_id}: {e}")
|
||||
return False, f"API error"
|
||||
|
||||
|
||||
async def download_book(self, method: Dict[str, Any], output_dir: Optional[str] = None) -> Tuple[bool, str]:
|
||||
"""Download a book using the specified method.
|
||||
|
||||
Args:
|
||||
method: Download method dict from get_download_options()
|
||||
output_dir: Directory to save the book
|
||||
|
||||
Returns:
|
||||
Tuple of (success: bool, message: str)
|
||||
"""
|
||||
output_dir = output_dir or self.output_dir or str(Path.home() / "Downloads")
|
||||
method_type = method.get('type', '')
|
||||
|
||||
logger.info(f"[UnifiedBookDownloader] Starting download with method: {method_type}")
|
||||
|
||||
try:
|
||||
if method_type == 'archive.org_direct':
|
||||
return await self._download_archive_direct(method, output_dir)
|
||||
|
||||
elif method_type == 'archive.org_borrow':
|
||||
return await self._download_archive_borrow(method, output_dir)
|
||||
|
||||
elif method_type == 'libgen':
|
||||
return await self._download_libgen(method, output_dir)
|
||||
|
||||
else:
|
||||
return False, f"Unknown download method: {method_type}"
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[UnifiedBookDownloader] Download error: {e}", exc_info=True)
|
||||
return False, f"Download failed: {str(e)}"
|
||||
|
||||
async def _download_archive_direct(self, method: Dict[str, Any], output_dir: str) -> Tuple[bool, str]:
|
||||
"""Download directly from Archive.org."""
|
||||
try:
|
||||
pdf_url = method.get('pdf_url', '')
|
||||
book_id = method.get('book_id', '')
|
||||
|
||||
if not pdf_url:
|
||||
return False, "No PDF URL available"
|
||||
|
||||
# Determine output filename
|
||||
filename = f"{book_id}.pdf"
|
||||
output_path = Path(output_dir) / filename
|
||||
|
||||
logger.info(f"[UnifiedBookDownloader] Downloading PDF from: {pdf_url}")
|
||||
|
||||
# Download in a thread to avoid blocking
|
||||
loop = asyncio.get_event_loop()
|
||||
success = await loop.run_in_executor(
|
||||
None,
|
||||
self._download_file,
|
||||
pdf_url,
|
||||
str(output_path)
|
||||
)
|
||||
|
||||
if success:
|
||||
logger.info(f"[UnifiedBookDownloader] Successfully downloaded to: {output_path}")
|
||||
return True, f"Downloaded to: {output_path}"
|
||||
else:
|
||||
return False, "Failed to download PDF"
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[UnifiedBookDownloader] Archive direct download error: {e}")
|
||||
return False, f"Archive download failed: {str(e)}"
|
||||
|
||||
async def _download_archive_borrow(self, method: Dict[str, Any], output_dir: str) -> Tuple[bool, str]:
|
||||
"""Download via Archive.org borrowing (requires credentials).
|
||||
|
||||
Process (follows archive_client.py pattern):
|
||||
1. Login to Archive.org with credentials
|
||||
2. Call loan endpoint to borrow the book (14-day loan)
|
||||
3. Get book info (page links, metadata)
|
||||
4. Download all pages as images
|
||||
5. Merge images into PDF
|
||||
|
||||
The loan function from archive_client.py handles:
|
||||
- Checking if book needs borrowing (status 400 = "doesn't need to be borrowed")
|
||||
- Creating borrow token for access
|
||||
- Handling borrow failures
|
||||
|
||||
get_book_infos() extracts page links from the borrowed book viewer
|
||||
download() downloads all pages using thread pool
|
||||
img2pdf merges pages into searchable PDF
|
||||
"""
|
||||
try:
|
||||
from helper.archive_client import credential_openlibrary
|
||||
|
||||
book_id = method.get('book_id', '')
|
||||
|
||||
# Get credentials
|
||||
email, password = credential_openlibrary(self.config)
|
||||
if not email or not password:
|
||||
return False, "Archive.org credentials not configured"
|
||||
|
||||
logger.info(f"[UnifiedBookDownloader] Logging into Archive.org...")
|
||||
|
||||
# Login and borrow (in thread, following download_book.py pattern)
|
||||
loop = asyncio.get_event_loop()
|
||||
borrow_result = await loop.run_in_executor(
|
||||
None,
|
||||
self._archive_borrow_and_download,
|
||||
email,
|
||||
password,
|
||||
book_id,
|
||||
output_dir
|
||||
)
|
||||
|
||||
if borrow_result and isinstance(borrow_result, tuple):
|
||||
success, filepath = borrow_result
|
||||
if success:
|
||||
logger.info(f"[UnifiedBookDownloader] Borrow succeeded: {filepath}")
|
||||
return True, filepath
|
||||
else:
|
||||
logger.warning(f"[UnifiedBookDownloader] Borrow failed: {filepath}")
|
||||
return False, filepath
|
||||
else:
|
||||
return False, "Failed to borrow book from Archive.org"
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[UnifiedBookDownloader] Archive borrow error: {e}")
|
||||
return False, f"Archive borrow failed: {str(e)}"
|
||||
|
||||
async def _download_libgen(self, method: Dict[str, Any], output_dir: str) -> Tuple[bool, str]:
|
||||
"""Download via Libgen search and download with mirror fallback."""
|
||||
try:
|
||||
isbn = method.get('isbn', '')
|
||||
title = method.get('title', '')
|
||||
|
||||
if not isbn and not title:
|
||||
return False, "Need ISBN or title for Libgen search"
|
||||
|
||||
if not self.search_libgen:
|
||||
return False, "Libgen searcher not available"
|
||||
|
||||
# Define wrapper functions to safely call the methods
|
||||
search_func = self.search_libgen
|
||||
if search_func is None:
|
||||
return False, "Search function not available"
|
||||
|
||||
preloaded_results = method.get('results')
|
||||
loop = asyncio.get_event_loop()
|
||||
|
||||
if preloaded_results:
|
||||
results = list(preloaded_results)
|
||||
if not results:
|
||||
results = await loop.run_in_executor(None, lambda: search_func(isbn or title, 10))
|
||||
else:
|
||||
results = await loop.run_in_executor(None, lambda: search_func(isbn or title, 10))
|
||||
|
||||
if not results:
|
||||
logger.warning(f"[UnifiedBookDownloader] No Libgen results for: {isbn or title}")
|
||||
return False, f"No Libgen results found for: {isbn or title}"
|
||||
|
||||
logger.info(f"[UnifiedBookDownloader] Found {len(results)} Libgen results")
|
||||
|
||||
# Determine output filename (use first result for naming)
|
||||
first_result = results[0]
|
||||
filename = f"{first_result.get('title', 'book')}"
|
||||
filename = "".join(c for c in filename if c.isalnum() or c in (' ', '.', '-'))[:100]
|
||||
|
||||
# Try each result's mirror until one succeeds
|
||||
for idx, result in enumerate(results, 1):
|
||||
mirror_url = result.get('mirror_url', '')
|
||||
|
||||
if not mirror_url:
|
||||
logger.debug(f"[UnifiedBookDownloader] Result {idx}: No mirror URL")
|
||||
continue
|
||||
|
||||
# Use extension from this result if available
|
||||
extension = result.get('extension', 'pdf')
|
||||
if extension and not extension.startswith('.'):
|
||||
extension = f".{extension}"
|
||||
elif not extension:
|
||||
extension = '.pdf'
|
||||
|
||||
output_path = Path(output_dir) / (filename + extension)
|
||||
|
||||
logger.info(f"[UnifiedBookDownloader] Trying mirror {idx}/{len(results)}: {mirror_url}")
|
||||
|
||||
download_func = self.download_from_mirror
|
||||
if download_func is None:
|
||||
return False, "Download function not available"
|
||||
|
||||
download_callable = cast(Callable[[str, str], bool], download_func)
|
||||
|
||||
def download_wrapper():
|
||||
return download_callable(mirror_url, str(output_path))
|
||||
|
||||
# Download (in thread)
|
||||
try:
|
||||
success = await loop.run_in_executor(None, download_wrapper)
|
||||
|
||||
if success:
|
||||
# Validate downloaded file is not HTML (common Libgen issue)
|
||||
if output_path.exists():
|
||||
try:
|
||||
with open(output_path, 'rb') as f:
|
||||
file_start = f.read(1024).decode('utf-8', errors='ignore').lower()
|
||||
if '<!doctype' in file_start or '<html' in file_start:
|
||||
logger.warning(f"[UnifiedBookDownloader] Mirror {idx} returned HTML instead of file, trying next mirror...")
|
||||
output_path.unlink() # Delete the HTML file
|
||||
continue
|
||||
except Exception as e:
|
||||
logger.debug(f"[UnifiedBookDownloader] Could not validate file content: {e}")
|
||||
|
||||
logger.info(f"[UnifiedBookDownloader] Successfully downloaded from mirror {idx} to: {output_path}")
|
||||
return True, str(output_path)
|
||||
else:
|
||||
logger.warning(f"[UnifiedBookDownloader] Mirror {idx} download failed, trying next...")
|
||||
except Exception as e:
|
||||
logger.warning(f"[UnifiedBookDownloader] Mirror {idx} error: {e}, trying next...")
|
||||
continue
|
||||
|
||||
return False, f"All {len(results)} mirrors failed"
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[UnifiedBookDownloader] Libgen download error: {e}")
|
||||
return False, f"Libgen download failed: {str(e)}"
|
||||
|
||||
async def download_libgen_selection(
|
||||
self,
|
||||
selected: Dict[str, Any],
|
||||
remaining: Optional[List[Dict[str, Any]]] = None,
|
||||
output_dir: Optional[str] = None,
|
||||
) -> Tuple[bool, str]:
|
||||
"""Download a specific Libgen result with optional fallbacks."""
|
||||
|
||||
if not isinstance(selected, dict):
|
||||
return False, "Selected result must be a dictionary"
|
||||
|
||||
ordered_results: List[Dict[str, Any]] = [selected]
|
||||
if remaining:
|
||||
for item in remaining:
|
||||
if isinstance(item, dict) and item is not selected:
|
||||
ordered_results.append(item)
|
||||
|
||||
method: Dict[str, Any] = {
|
||||
'type': 'libgen',
|
||||
'isbn': selected.get('isbn', '') or '',
|
||||
'title': selected.get('title', '') or '',
|
||||
'author': selected.get('author', '') or '',
|
||||
'results': ordered_results,
|
||||
}
|
||||
|
||||
return await self.download_book(method, output_dir)
|
||||
|
||||
def download_libgen_selection_sync(
|
||||
self,
|
||||
selected: Dict[str, Any],
|
||||
remaining: Optional[List[Dict[str, Any]]] = None,
|
||||
output_dir: Optional[str] = None,
|
||||
) -> Tuple[bool, str]:
|
||||
"""Synchronous helper for downloading a Libgen selection."""
|
||||
|
||||
async def _run() -> Tuple[bool, str]:
|
||||
return await self.download_libgen_selection(selected, remaining, output_dir)
|
||||
|
||||
loop = asyncio.new_event_loop()
|
||||
try:
|
||||
asyncio.set_event_loop(loop)
|
||||
return loop.run_until_complete(_run())
|
||||
finally:
|
||||
loop.close()
|
||||
asyncio.set_event_loop(None)
|
||||
|
||||
def _download_file(self, url: str, output_path: str) -> bool:
|
||||
"""Download a file from URL."""
|
||||
try:
|
||||
response = requests.get(url, stream=True, timeout=30)
|
||||
response.raise_for_status()
|
||||
|
||||
with open(output_path, 'wb') as f:
|
||||
for chunk in response.iter_content(chunk_size=8192):
|
||||
if chunk:
|
||||
f.write(chunk)
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"[UnifiedBookDownloader] File download error: {e}")
|
||||
return False
|
||||
|
||||
def _archive_borrow_and_download(self, email: str, password: str, book_id: str, output_dir: str) -> Tuple[bool, str]:
|
||||
"""Borrow a book from Archive.org and download pages as PDF.
|
||||
|
||||
This follows the exact process from archive_client.py:
|
||||
1. Login with credentials
|
||||
2. Call loan() to create 14-day borrow
|
||||
3. Get book info (extract page URLs)
|
||||
4. Download all pages as images
|
||||
5. Merge images into searchable PDF
|
||||
|
||||
Returns tuple of (success: bool, filepath/message: str)
|
||||
"""
|
||||
try:
|
||||
from helper.archive_client import login, loan, get_book_infos, download
|
||||
import tempfile
|
||||
import shutil
|
||||
|
||||
logger.info(f"[UnifiedBookDownloader] Logging into Archive.org as {email}")
|
||||
session = login(email, password)
|
||||
|
||||
logger.info(f"[UnifiedBookDownloader] Attempting to borrow book: {book_id}")
|
||||
# Call loan to create the 14-day borrow
|
||||
session = loan(session, book_id, verbose=True)
|
||||
|
||||
# If we get here, borrowing succeeded
|
||||
logger.info(f"[UnifiedBookDownloader] Successfully borrowed book: {book_id}")
|
||||
|
||||
# Now get the book info (page URLs and metadata)
|
||||
logger.info(f"[UnifiedBookDownloader] Extracting book page information...")
|
||||
# Try both URL formats: with /borrow and without
|
||||
book_urls = [
|
||||
f"https://archive.org/borrow/{book_id}", # Try borrow page first (for borrowed books)
|
||||
f"https://archive.org/details/{book_id}" # Fallback to details page
|
||||
]
|
||||
|
||||
title = None
|
||||
links = None
|
||||
metadata = None
|
||||
last_error = None
|
||||
|
||||
for book_url in book_urls:
|
||||
try:
|
||||
logger.debug(f"[UnifiedBookDownloader] Trying to get book info from: {book_url}")
|
||||
response = session.get(book_url, timeout=10)
|
||||
|
||||
# Log response status
|
||||
if response.status_code != 200:
|
||||
logger.debug(f"[UnifiedBookDownloader] URL returned {response.status_code}: {book_url}")
|
||||
# Continue to try next URL
|
||||
continue
|
||||
|
||||
# Try to parse the response
|
||||
title, links, metadata = get_book_infos(session, book_url)
|
||||
logger.info(f"[UnifiedBookDownloader] Successfully got info from: {book_url}")
|
||||
logger.info(f"[UnifiedBookDownloader] Found {len(links)} pages to download")
|
||||
break
|
||||
except Exception as e:
|
||||
logger.debug(f"[UnifiedBookDownloader] Failed with {book_url}: {e}")
|
||||
last_error = e
|
||||
continue
|
||||
|
||||
if links is None:
|
||||
logger.error(f"[UnifiedBookDownloader] Failed to get book info from all URLs: {last_error}")
|
||||
# Borrow extraction failed - return False
|
||||
return False, "Could not extract borrowed book pages"
|
||||
|
||||
# Create temporary directory for images
|
||||
temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=output_dir)
|
||||
logger.info(f"[UnifiedBookDownloader] Downloading {len(links)} pages to temporary directory...")
|
||||
|
||||
try:
|
||||
# Download all pages (uses thread pool)
|
||||
images = download(
|
||||
session=session,
|
||||
n_threads=10,
|
||||
directory=temp_dir,
|
||||
links=links,
|
||||
scale=3, # Default resolution
|
||||
book_id=book_id
|
||||
)
|
||||
|
||||
logger.info(f"[UnifiedBookDownloader] Downloaded {len(images)} pages")
|
||||
|
||||
# Try to merge pages into PDF
|
||||
try:
|
||||
import img2pdf
|
||||
logger.info(f"[UnifiedBookDownloader] Merging pages into PDF...")
|
||||
|
||||
# Prepare PDF metadata
|
||||
pdfmeta = {}
|
||||
if metadata:
|
||||
if "title" in metadata:
|
||||
pdfmeta["title"] = metadata["title"]
|
||||
if "creator" in metadata:
|
||||
pdfmeta["author"] = metadata["creator"]
|
||||
pdfmeta["keywords"] = [f"https://archive.org/details/{book_id}"]
|
||||
pdfmeta["creationdate"] = None # Avoid timezone issues
|
||||
|
||||
# Convert images to PDF
|
||||
pdf_content = img2pdf.convert(images, **pdfmeta) if images else None
|
||||
if not pdf_content:
|
||||
logger.error(f"[UnifiedBookDownloader] PDF conversion failed")
|
||||
return False, "Failed to convert pages to PDF"
|
||||
|
||||
# Save the PDF
|
||||
pdf_filename = f"{title}.pdf" if title else "book.pdf"
|
||||
pdf_path = Path(output_dir) / pdf_filename
|
||||
|
||||
# Handle duplicate filenames
|
||||
i = 1
|
||||
while pdf_path.exists():
|
||||
pdf_path = Path(output_dir) / f"{title or 'book'}({i}).pdf"
|
||||
i += 1
|
||||
|
||||
with open(pdf_path, 'wb') as f:
|
||||
f.write(pdf_content)
|
||||
|
||||
logger.info(f"[UnifiedBookDownloader] Successfully created PDF: {pdf_path}")
|
||||
|
||||
return True, str(pdf_path)
|
||||
|
||||
except ImportError:
|
||||
logger.warning(f"[UnifiedBookDownloader] img2pdf not available, saving as JPG collection instead")
|
||||
|
||||
# Create JPG collection directory
|
||||
if not title:
|
||||
title = f"book_{book_id}"
|
||||
jpg_dir = Path(output_dir) / title
|
||||
i = 1
|
||||
while jpg_dir.exists():
|
||||
jpg_dir = Path(output_dir) / f"{title}({i})"
|
||||
i += 1
|
||||
|
||||
# Move temporary directory to final location
|
||||
shutil.move(temp_dir, str(jpg_dir))
|
||||
temp_dir = None # Mark as already moved
|
||||
|
||||
logger.info(f"[UnifiedBookDownloader] Saved as JPG collection: {jpg_dir}")
|
||||
return True, str(jpg_dir)
|
||||
|
||||
finally:
|
||||
# Clean up temporary directory if it still exists
|
||||
if temp_dir and Path(temp_dir).exists():
|
||||
shutil.rmtree(temp_dir)
|
||||
|
||||
except SystemExit:
|
||||
# loan() function calls sys.exit on failure - catch it
|
||||
logger.error(f"[UnifiedBookDownloader] Borrow process exited (book may not be borrowable)")
|
||||
return False, "Book could not be borrowed (may not be available for borrowing)"
|
||||
except Exception as e:
|
||||
logger.error(f"[UnifiedBookDownloader] Archive borrow error: {e}")
|
||||
return False, f"Borrow failed: {str(e)}"
|
||||
|
||||
def close(self) -> None:
|
||||
"""Close the session."""
|
||||
self.session.close()
|
||||
Reference in New Issue
Block a user