Files
Medios-Macina/helper/unified_book_downloader.py

707 lines
31 KiB
Python
Raw Normal View History

2025-11-25 20:09:33 -08:00
"""Unified book downloader - handles Archive.org borrowing and Libgen fallback.
This module provides a single interface for downloading books from multiple sources:
1. Try Archive.org direct download (if available)
2. Try Archive.org borrowing (if user has credentials)
3. Fallback to Libgen search by ISBN
4. Attempt Libgen download
All sources integrated with proper metadata scraping and error handling.
"""
import logging
import asyncio
import requests
from typing import Optional, Dict, Any, Tuple, List, Callable, cast
from pathlib import Path
from helper.logger import debug
logger = logging.getLogger(__name__)
class UnifiedBookDownloader:
    """Unified interface for downloading books from multiple sources.

    Sources are offered in priority order: Archive.org direct download,
    Archive.org borrowing (needs credentials), then Libgen search/download.
    The source-specific helpers are imported lazily by ``_init_downloaders``;
    when an import fails the matching attributes are set to ``None`` and that
    source is skipped.

    Instances can be used as context managers; leaving the ``with`` block
    closes the HTTP session created in ``__init__``.
    """

    # Characters stripped from remote titles before they become file names.
    _FORBIDDEN_FILENAME_CHARS = frozenset('<>:"/\\|?*')
    # Non-alphanumeric characters kept when naming Libgen downloads.
    _LIBGEN_FILENAME_EXTRAS = frozenset(' .-')

    def __init__(self, config: Optional[Dict[str, Any]] = None, output_dir: Optional[str] = None):
        """Initialize the unified book downloader.

        Args:
            config: Configuration dict with credentials.
            output_dir: Default output directory.
        """
        self.config = config or {}
        self.output_dir = output_dir
        self.session = requests.Session()
        # Import download functions from their modules
        self._init_downloaders()

    def __enter__(self) -> "UnifiedBookDownloader":
        """Return the downloader itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Close the HTTP session when the ``with`` block ends."""
        self.close()

    def _init_downloaders(self) -> None:
        """Bind the Archive.org and Libgen helper functions onto the instance.

        Each group of helpers is imported independently; if a group cannot be
        imported a warning is logged and its attributes are set to ``None``.
        """
        try:
            from helper.archive_client import (
                check_direct_download,
                get_openlibrary_by_isbn,
                loan,
            )
            self.check_direct_download = check_direct_download
            self.get_openlibrary_by_isbn = get_openlibrary_by_isbn
            self.loan_func = loan
            logger.debug("[UnifiedBookDownloader] Loaded archive.org downloaders from archive_client")
        except Exception as e:
            logger.warning(f"[UnifiedBookDownloader] Failed to load archive.org functions: {e}")
            self.check_direct_download = None
            self.get_openlibrary_by_isbn = None
            self.loan_func = None

        try:
            from helper.libgen_service import (
                DEFAULT_LIMIT as _LIBGEN_DEFAULT_LIMIT,
                download_from_mirror as _libgen_download,
                search_libgen as _libgen_search,
            )

            def _log_info(message: str) -> None:
                debug(f"[UnifiedBookDownloader] {message}")

            def _log_error(message: str) -> None:
                logger.error(f"[UnifiedBookDownloader] {message}")

            # Wrap the Libgen helpers so they always log through this module.
            def _search(query, limit=_LIBGEN_DEFAULT_LIMIT):
                return _libgen_search(
                    query,
                    limit=limit,
                    log_info=_log_info,
                    log_error=_log_error,
                )

            def _download(mirror_url, output_path):
                return _libgen_download(
                    mirror_url,
                    output_path,
                    log_info=_log_info,
                    log_error=_log_error,
                )

            self.search_libgen = _search
            self.download_from_mirror = _download
            logger.debug("[UnifiedBookDownloader] Loaded Libgen helpers")
        except Exception as e:
            logger.warning(f"[UnifiedBookDownloader] Failed to load Libgen helpers: {e}")
            self.search_libgen = None
            self.download_from_mirror = None

    def get_download_options(self, book_data: Dict[str, Any]) -> Dict[str, Any]:
        """Get all available download options for a book.

        Checks in priority order:
            1. Archive.org direct download
            2. Archive.org borrowing (if credentials are available and the
               book is reported lendable)
            3. Libgen fallback (by ISBN or title)

        Args:
            book_data: Book metadata dict with at least 'openlibrary_id',
                'isbn' or 'title'. Missing or ``None`` values are tolerated.

        Returns:
            Dict with the book's identifying fields, a 'methods' list sorted
            by ascending 'priority' (1 is tried first) and an empty
            'metadata' dict.
        """
        options: Dict[str, Any] = {
            'book_title': book_data.get('title', 'Unknown'),
            'book_author': book_data.get('author', 'Unknown'),
            'isbn': book_data.get('isbn', ''),
            'openlibrary_id': book_data.get('openlibrary_id', ''),
            'methods': [],  # Will be sorted by priority
            'metadata': {},
        }
        methods: List[Dict[str, Any]] = options['methods']

        # Normalise to str so a None/absent ID cannot crash the prefix checks.
        ol_id = str(book_data.get('openlibrary_id') or '')

        # The book ID is the run of digits after 'OL'
        # (e.g. OL8513721M -> 8513721, OL8513721W -> 8513721).
        book_id: Optional[str] = None
        if ol_id.startswith('OL') and len(ol_id) > 2:
            book_id = ''.join(c for c in ol_id[2:] if c.isdigit()) or None

        # PRIORITY 1: direct download (no auth needed). Skipped when there is
        # no usable book ID, since the check would only be handed None.
        if self.check_direct_download and book_id:
            try:
                can_download, pdf_url = self.check_direct_download(book_id)
                if can_download:
                    methods.append({
                        'type': 'archive.org_direct',
                        'label': 'Archive.org Direct Download',
                        'requires_auth': False,
                        'pdf_url': pdf_url,
                        'book_id': book_id,
                        'priority': 1,  # Highest priority
                    })
                    logger.info(f"[UnifiedBookDownloader] Direct download available for {book_id}")
            except Exception as e:
                logger.debug(f"[UnifiedBookDownloader] Direct download check failed: {e}")

        # PRIORITY 2: borrowing (requires auth). The book must be reported as
        # lendable by the OpenLibrary API first.
        if self._has_archive_credentials():
            is_lendable, status = self._check_book_lendable_status(ol_id)
            if is_lendable:
                methods.append({
                    'type': 'archive.org_borrow',
                    'label': 'Archive.org Borrow',
                    'requires_auth': True,
                    'book_id': book_id,
                    'priority': 2,  # Second priority
                })
                logger.info(f"[UnifiedBookDownloader] Borrow option available for {book_id} (status: {status})")
            else:
                logger.debug(f"[UnifiedBookDownloader] Borrow not available for {book_id} (status: {status})")

        # PRIORITY 3: Libgen fallback; usable with an ISBN OR a title.
        isbn = book_data.get('isbn') or ''
        title = book_data.get('title') or ''
        author = book_data.get('author') or ''
        if self.search_libgen and (isbn or title):
            methods.append({
                'type': 'libgen',
                'label': 'Libgen Search & Download',
                'requires_auth': False,
                'isbn': isbn,
                'title': title,
                'author': author,
                'priority': 3,  # Third priority (fallback)
            })
            logger.info(f"[UnifiedBookDownloader] Libgen fallback available (ISBN: {isbn if isbn else 'N/A'}, Title: {title})")

        # Lowest priority number first (1 = most preferred).
        methods.sort(key=lambda x: x.get('priority', 999))
        return options

    def _has_archive_credentials(self) -> bool:
        """Return True when both an Archive.org email and password are configured.

        Any failure while importing or calling the credential helper is
        treated as "no credentials".
        """
        try:
            from helper.archive_client import credential_openlibrary
            email, password = credential_openlibrary(self.config)
            return bool(email and password)
        except Exception:
            return False

    @staticmethod
    def _lending_status_of(item: Any) -> Tuple[bool, str]:
        """Interpret one entry of the Volumes API 'items' array.

        Args:
            item: Either a dict carrying a 'status' key, or any other object
                whose string form is searched for the word 'lendable'.

        Returns:
            ``(True, "LENDABLE")`` when the status mentions 'lendable';
            otherwise ``(False, status)`` where status is the reported status
            text for dict items, or 'NOT_LENDABLE' when none is available.
        """
        if isinstance(item, dict):
            status = str(item.get('status') or '')
            not_lendable_reason = status.strip() or 'NOT_LENDABLE'
        else:
            # Non-dict entries: fall back to searching the string form.
            status = str(item)
            not_lendable_reason = 'NOT_LENDABLE'
        if 'lendable' in status.lower():
            return True, "LENDABLE"
        return False, not_lendable_reason

    def _check_book_lendable_status(self, ol_id: str) -> Tuple[bool, Optional[str]]:
        """Check if a book is lendable via the OpenLibrary API.

        Queries: https://openlibrary.org/api/volumes/brief/json/OLID:{ol_id}
        Only Edition IDs (OL...M) are queried; Work IDs (OL...W) and any
        other ID shape are rejected without a request.

        Args:
            ol_id: OpenLibrary ID (e.g. OL8513721M for an Edition or
                OL4801915W for a Work).

        Returns:
            Tuple of (is_lendable, status_reason). Network and parsing errors
            are reported as ``(False, "API timeout")`` or
            ``(False, "API error")`` rather than raised.
        """
        try:
            if not ol_id.startswith('OL'):
                return False, "Invalid OpenLibrary ID format"
            # Work IDs are umbrella records, not specific editions, so the
            # Volumes API is not queried for them.
            if ol_id.endswith('W'):
                logger.debug(f"[UnifiedBookDownloader] Work ID {ol_id} - skipping Volumes API (not lendable)")
                return False, "Work ID not supported by Volumes API (not a specific edition)"
            if not ol_id.endswith('M'):
                logger.debug(f"[UnifiedBookDownloader] Unknown ID type {ol_id} (not M or W)")
                return False, "Invalid OpenLibrary ID type"

            url = f"https://openlibrary.org/api/volumes/brief/json/OLID:{ol_id}"
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            data = response.json()

            # Empty response means no records found
            if not data:
                logger.debug(f"[UnifiedBookDownloader] Empty response for {ol_id}")
                return False, "No availability data found"

            # The response is wrapped in an OLID key
            olid_key = f"OLID:{ol_id}"
            if olid_key not in data:
                logger.debug("[UnifiedBookDownloader] OLID key not found in response")
                return False, "No availability data found"

            olid_data = data[olid_key]
            items = olid_data.get('items') if isinstance(olid_data, dict) else None
            if not items:
                logger.debug(f"[UnifiedBookDownloader] No items found for {ol_id}")
                return False, "Not available for lending"

            # Only the first item is consulted for the lending status.
            is_lendable, status = self._lending_status_of(items[0])
            if is_lendable:
                logger.info(f"[UnifiedBookDownloader] Book {ol_id} is lendable")
            else:
                logger.debug(f"[UnifiedBookDownloader] Book {ol_id} is not lendable (status: {status})")
            return is_lendable, status
        except requests.exceptions.Timeout:
            logger.warning(f"[UnifiedBookDownloader] OpenLibrary API timeout for {ol_id}")
            return False, "API timeout"
        except Exception as e:
            logger.debug(f"[UnifiedBookDownloader] Failed to check lendable status for {ol_id}: {e}")
            return False, "API error"

    async def download_book(self, method: Dict[str, Any], output_dir: Optional[str] = None) -> Tuple[bool, str]:
        """Download a book using the specified method.

        Args:
            method: Download method dict from get_download_options().
            output_dir: Directory to save the book. Falls back to the
                instance default, then to ``~/Downloads``. The directory is
                created if it does not exist.

        Returns:
            Tuple of (success, message). Failures are reported through the
            tuple; exceptions are logged, not raised.
        """
        output_dir = output_dir or self.output_dir or str(Path.home() / "Downloads")
        method_type = method.get('type', '')
        logger.info(f"[UnifiedBookDownloader] Starting download with method: {method_type}")

        handlers = {
            'archive.org_direct': self._download_archive_direct,
            'archive.org_borrow': self._download_archive_borrow,
            'libgen': self._download_libgen,
        }
        handler = handlers.get(method_type)
        if handler is None:
            return False, f"Unknown download method: {method_type}"

        try:
            # Every handler writes below output_dir, so make sure it exists.
            Path(output_dir).mkdir(parents=True, exist_ok=True)
            return await handler(method, output_dir)
        except Exception as e:
            logger.error(f"[UnifiedBookDownloader] Download error: {e}", exc_info=True)
            return False, f"Download failed: {str(e)}"

    async def _download_archive_direct(self, method: Dict[str, Any], output_dir: str) -> Tuple[bool, str]:
        """Download the PDF named by ``method['pdf_url']`` into ``output_dir``.

        The file is named ``<book_id>.pdf`` (``book.pdf`` when no book ID is
        present). The blocking transfer runs in the default executor.
        """
        try:
            pdf_url = method.get('pdf_url', '')
            book_id = method.get('book_id', '')
            if not pdf_url:
                return False, "No PDF URL available"

            output_path = Path(output_dir) / f"{book_id or 'book'}.pdf"
            logger.info(f"[UnifiedBookDownloader] Downloading PDF from: {pdf_url}")

            # Download in a thread to avoid blocking the event loop
            loop = asyncio.get_running_loop()
            success = await loop.run_in_executor(
                None,
                self._download_file,
                pdf_url,
                str(output_path),
            )
            if not success:
                return False, "Failed to download PDF"
            logger.info(f"[UnifiedBookDownloader] Successfully downloaded to: {output_path}")
            return True, f"Downloaded to: {output_path}"
        except Exception as e:
            logger.error(f"[UnifiedBookDownloader] Archive direct download error: {e}")
            return False, f"Archive download failed: {str(e)}"

    async def _download_archive_borrow(self, method: Dict[str, Any], output_dir: str) -> Tuple[bool, str]:
        """Download via Archive.org borrowing (requires credentials).

        Looks up the configured credentials and then runs
        ``_archive_borrow_and_download`` in the default executor; that method
        performs the login, loan, page download and PDF merge.

        Returns:
            ``(True, filepath)`` on success, ``(False, message)`` otherwise.
        """
        try:
            from helper.archive_client import credential_openlibrary
            book_id = method.get('book_id', '')

            email, password = credential_openlibrary(self.config)
            if not email or not password:
                return False, "Archive.org credentials not configured"

            logger.info("[UnifiedBookDownloader] Logging into Archive.org...")
            # Login and borrow in a thread; the helper is fully blocking.
            loop = asyncio.get_running_loop()
            borrow_result = await loop.run_in_executor(
                None,
                self._archive_borrow_and_download,
                email,
                password,
                book_id,
                output_dir,
            )
            if not (borrow_result and isinstance(borrow_result, tuple)):
                return False, "Failed to borrow book from Archive.org"

            success, filepath = borrow_result
            if success:
                logger.info(f"[UnifiedBookDownloader] Borrow succeeded: {filepath}")
                return True, filepath
            logger.warning(f"[UnifiedBookDownloader] Borrow failed: {filepath}")
            return False, filepath
        except Exception as e:
            logger.error(f"[UnifiedBookDownloader] Archive borrow error: {e}")
            return False, f"Archive borrow failed: {str(e)}"

    @staticmethod
    def _looks_like_html(path: Path) -> bool:
        """Return True when the first 1 KiB of ``path`` contains an HTML marker.

        Read errors are logged at debug level and treated as "not HTML".
        """
        try:
            with open(path, 'rb') as f:
                head = f.read(1024).decode('utf-8', errors='ignore').lower()
        except Exception as e:
            logger.debug(f"[UnifiedBookDownloader] Could not validate file content: {e}")
            return False
        return '<!doctype' in head or '<html' in head

    async def _download_libgen(self, method: Dict[str, Any], output_dir: str) -> Tuple[bool, str]:
        """Download via Libgen search and download with mirror fallback.

        Uses ``method['results']`` when provided, otherwise searches by ISBN
        (preferred) or title. Each result's mirror is tried in order until
        one yields a file that does not look like an HTML page.

        Returns:
            ``(True, output_path)`` on success, ``(False, message)`` otherwise.
        """
        try:
            isbn = method.get('isbn', '')
            title = method.get('title', '')
            if not isbn and not title:
                return False, "Need ISBN or title for Libgen search"

            search_func = self.search_libgen
            if not search_func:
                return False, "Libgen searcher not available"

            query = isbn or title
            loop = asyncio.get_running_loop()

            # Prefer caller-supplied results; search only when there are none.
            results = list(method.get('results') or [])
            if not results:
                results = await loop.run_in_executor(None, search_func, query, 10)
            if not results:
                logger.warning(f"[UnifiedBookDownloader] No Libgen results for: {query}")
                return False, f"No Libgen results found for: {query}"
            logger.info(f"[UnifiedBookDownloader] Found {len(results)} Libgen results")

            download_func = self.download_from_mirror
            if download_func is None:
                return False, "Download function not available"

            # The first result names the file for every mirror attempt.
            raw_title = str(results[0].get('title') or 'book')
            filename = "".join(
                c for c in raw_title
                if c.isalnum() or c in self._LIBGEN_FILENAME_EXTRAS
            )[:100]
            if not filename.strip(' .'):
                # Nothing usable survived sanitising; avoid a bare ".pdf".
                filename = 'book'

            for idx, result in enumerate(results, 1):
                mirror_url = result.get('mirror_url', '')
                if not mirror_url:
                    logger.debug(f"[UnifiedBookDownloader] Result {idx}: No mirror URL")
                    continue

                # Use the extension from this result if available
                extension = str(result.get('extension') or 'pdf')
                if not extension.startswith('.'):
                    extension = f".{extension}"
                output_path = Path(output_dir) / (filename + extension)
                logger.info(f"[UnifiedBookDownloader] Trying mirror {idx}/{len(results)}: {mirror_url}")

                try:
                    success = await loop.run_in_executor(
                        None, download_func, mirror_url, str(output_path)
                    )
                except Exception as e:
                    logger.warning(f"[UnifiedBookDownloader] Mirror {idx} error: {e}, trying next...")
                    continue

                if not success:
                    logger.warning(f"[UnifiedBookDownloader] Mirror {idx} download failed, trying next...")
                    continue

                # Mirrors sometimes answer with an HTML page instead of the
                # book; treat that as a failed mirror.
                if output_path.exists() and self._looks_like_html(output_path):
                    logger.warning(f"[UnifiedBookDownloader] Mirror {idx} returned HTML instead of file, trying next mirror...")
                    try:
                        output_path.unlink()
                    except OSError as e:
                        logger.debug(f"[UnifiedBookDownloader] Could not validate file content: {e}")
                    continue

                logger.info(f"[UnifiedBookDownloader] Successfully downloaded from mirror {idx} to: {output_path}")
                return True, str(output_path)

            return False, f"All {len(results)} mirrors failed"
        except Exception as e:
            logger.error(f"[UnifiedBookDownloader] Libgen download error: {e}")
            return False, f"Libgen download failed: {str(e)}"

    async def download_libgen_selection(
        self,
        selected: Dict[str, Any],
        remaining: Optional[List[Dict[str, Any]]] = None,
        output_dir: Optional[str] = None,
    ) -> Tuple[bool, str]:
        """Download a specific Libgen result with optional fallbacks.

        Args:
            selected: The Libgen result to try first.
            remaining: Further results to try, in order, if the first fails.
                Non-dict entries and the ``selected`` object itself are
                skipped.
            output_dir: Directory to save the book.

        Returns:
            Tuple of (success, message or output path).
        """
        if not isinstance(selected, dict):
            return False, "Selected result must be a dictionary"

        ordered_results: List[Dict[str, Any]] = [selected]
        ordered_results.extend(
            item for item in (remaining or [])
            if isinstance(item, dict) and item is not selected
        )
        method: Dict[str, Any] = {
            'type': 'libgen',
            'isbn': selected.get('isbn', '') or '',
            'title': selected.get('title', '') or '',
            'author': selected.get('author', '') or '',
            'results': ordered_results,
        }
        return await self.download_book(method, output_dir)

    def download_libgen_selection_sync(
        self,
        selected: Dict[str, Any],
        remaining: Optional[List[Dict[str, Any]]] = None,
        output_dir: Optional[str] = None,
    ) -> Tuple[bool, str]:
        """Synchronous helper for downloading a Libgen selection.

        Runs ``download_libgen_selection`` to completion on a fresh event
        loop, so it must not be called from inside a running loop.
        """
        return asyncio.run(
            self.download_libgen_selection(selected, remaining, output_dir)
        )

    def _download_file(self, url: str, output_path: str) -> bool:
        """Stream ``url`` to ``output_path``.

        Returns:
            True on success. On any error the failure is logged, a partially
            written file is removed, and False is returned.
        """
        target = Path(output_path)
        opened = False
        try:
            # The context manager releases the streamed connection.
            with requests.get(url, stream=True, timeout=30) as response:
                response.raise_for_status()
                with open(target, 'wb') as f:
                    opened = True
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            return True
        except Exception as e:
            logger.error(f"[UnifiedBookDownloader] File download error: {e}")
            if opened:
                # Only remove what this call created/truncated.
                try:
                    target.unlink()
                except OSError:
                    pass
            return False

    @classmethod
    def _safe_filename(cls, name: Any, fallback: str = 'book') -> str:
        """Return ``name`` reduced to a single safe path component.

        Path separators, reserved characters and non-printable characters
        are removed and surrounding spaces/dots trimmed; ``fallback`` is
        returned when nothing is left.
        """
        text = '' if name is None else str(name)
        cleaned = ''.join(
            c for c in text
            if c.isprintable() and c not in cls._FORBIDDEN_FILENAME_CHARS
        ).strip(' .')
        return cleaned or fallback

    @staticmethod
    def _unique_path(directory: Path, stem: str, suffix: str = '') -> Path:
        """Return ``directory/stem+suffix``, adding ``(1)``, ``(2)``... if taken."""
        candidate = directory / f"{stem}{suffix}"
        counter = 1
        while candidate.exists():
            candidate = directory / f"{stem}({counter}){suffix}"
            counter += 1
        return candidate

    def _archive_borrow_and_download(self, email: str, password: str, book_id: str, output_dir: str) -> Tuple[bool, str]:
        """Borrow a book from Archive.org and download its pages as a PDF.

        Steps:
            1. Login with credentials
            2. Call loan() to borrow the book
            3. Get book info (title, page links, metadata) from the borrow
               page, falling back to the details page
            4. Download all pages as images into a temporary directory
            5. Merge the images into a PDF with img2pdf; if img2pdf is not
               installed, keep the images as a directory instead

        Returns:
            Tuple of (success, filepath or error message).
        """
        try:
            from helper.archive_client import login, loan, get_book_infos, download
            import tempfile
            import shutil

            logger.info(f"[UnifiedBookDownloader] Logging into Archive.org as {email}")
            session = login(email, password)

            logger.info(f"[UnifiedBookDownloader] Attempting to borrow book: {book_id}")
            session = loan(session, book_id, verbose=True)
            # If we get here, borrowing succeeded
            logger.info(f"[UnifiedBookDownloader] Successfully borrowed book: {book_id}")

            logger.info("[UnifiedBookDownloader] Extracting book page information...")
            book_urls = [
                f"https://archive.org/borrow/{book_id}",   # Borrow page first
                f"https://archive.org/details/{book_id}",  # Fallback to details page
            ]
            title = None
            links = None
            metadata = None
            last_error = None
            for book_url in book_urls:
                try:
                    logger.debug(f"[UnifiedBookDownloader] Trying to get book info from: {book_url}")
                    response = session.get(book_url, timeout=10)
                    if response.status_code != 200:
                        logger.debug(f"[UnifiedBookDownloader] URL returned {response.status_code}: {book_url}")
                        continue
                    title, links, metadata = get_book_infos(session, book_url)
                    logger.info(f"[UnifiedBookDownloader] Successfully got info from: {book_url}")
                    logger.info(f"[UnifiedBookDownloader] Found {len(links)} pages to download")
                    break
                except Exception as e:
                    logger.debug(f"[UnifiedBookDownloader] Failed with {book_url}: {e}")
                    last_error = e
                    continue

            if links is None:
                logger.error(f"[UnifiedBookDownloader] Failed to get book info from all URLs: {last_error}")
                return False, "Could not extract borrowed book pages"

            # The title comes from a remote page; make it safe before using
            # it in any path. Empty means "no usable title".
            safe_title = self._safe_filename(title, '')
            out_dir = Path(output_dir)
            out_dir.mkdir(parents=True, exist_ok=True)

            temp_dir: Optional[str] = tempfile.mkdtemp(
                prefix=f"{safe_title or 'book'}_", dir=str(out_dir)
            )
            logger.info(f"[UnifiedBookDownloader] Downloading {len(links)} pages to temporary directory...")
            try:
                # Download all pages (the helper is given a thread count)
                images = download(
                    session=session,
                    n_threads=10,
                    directory=temp_dir,
                    links=links,
                    scale=3,  # Default resolution
                    book_id=book_id,
                )
                logger.info(f"[UnifiedBookDownloader] Downloaded {len(images)} pages")

                try:
                    import img2pdf
                except ImportError:
                    img2pdf = None

                if img2pdf is None:
                    logger.warning("[UnifiedBookDownloader] img2pdf not available, saving as JPG collection instead")
                    jpg_dir = self._unique_path(out_dir, safe_title or f"book_{book_id}")
                    # Promote the temporary directory to the final location.
                    shutil.move(temp_dir, str(jpg_dir))
                    temp_dir = None  # Mark as already moved
                    logger.info(f"[UnifiedBookDownloader] Saved as JPG collection: {jpg_dir}")
                    return True, str(jpg_dir)

                logger.info("[UnifiedBookDownloader] Merging pages into PDF...")
                pdfmeta: Dict[str, Any] = {}
                if metadata:
                    if "title" in metadata:
                        pdfmeta["title"] = metadata["title"]
                    if "creator" in metadata:
                        pdfmeta["author"] = metadata["creator"]
                pdfmeta["keywords"] = [f"https://archive.org/details/{book_id}"]
                pdfmeta["creationdate"] = None  # Avoid timezone issues

                pdf_content = img2pdf.convert(images, **pdfmeta) if images else None
                if not pdf_content:
                    logger.error("[UnifiedBookDownloader] PDF conversion failed")
                    return False, "Failed to convert pages to PDF"

                pdf_path = self._unique_path(out_dir, safe_title or 'book', '.pdf')
                with open(pdf_path, 'wb') as f:
                    f.write(pdf_content)
                logger.info(f"[UnifiedBookDownloader] Successfully created PDF: {pdf_path}")
                return True, str(pdf_path)
            finally:
                # Best-effort cleanup; must never mask the real outcome.
                if temp_dir and Path(temp_dir).exists():
                    shutil.rmtree(temp_dir, ignore_errors=True)
        except SystemExit:
            # Caught so a sys.exit() raised during the borrow steps cannot
            # terminate the host process.
            logger.error("[UnifiedBookDownloader] Borrow process exited (book may not be borrowable)")
            return False, "Book could not be borrowed (may not be available for borrowing)"
        except Exception as e:
            logger.error(f"[UnifiedBookDownloader] Archive borrow error: {e}")
            return False, f"Borrow failed: {str(e)}"

    def close(self) -> None:
        """Close the HTTP session."""
        self.session.close()