Medios-Macina/helper/unified_book_downloader.py

"""Unified book downloader - handles Archive.org borrowing and Libgen fallback.

This module provides a single interface for downloading books from multiple sources:
1. Try Archive.org direct download (if available)
2. Try Archive.org borrowing (if user has credentials)
3. Fallback to Libgen search by ISBN (or by title when no ISBN is available)
4. Attempt Libgen download

All sources integrated with proper metadata scraping and error handling.
"""

import logging
import asyncio
import requests
from typing import Optional, Dict, Any, Tuple, List, Callable, cast
from pathlib import Path

from helper.logger import debug

logger = logging.getLogger(__name__)


class UnifiedBookDownloader:
    """Unified interface for downloading books from multiple sources."""
    
    def __init__(self, config: Optional[Dict[str, Any]] = None, output_dir: Optional[str] = None):
        """Set up configuration, the shared HTTP session and the source helpers.

        Args:
            config: Settings mapping that is later handed to the credential
                lookup; a falsy value is replaced by an empty dict.
            output_dir: Directory used by ``download_book`` when the caller
                does not pass one explicitly.
        """
        self.session = requests.Session()
        self.output_dir = output_dir
        self.config = config if config else {}

        # Bind the Archive.org / Libgen helper callables (None when unavailable).
        self._init_downloaders()
    
    def _init_downloaders(self) -> None:
        """Bind the Archive.org and Libgen helper callables onto the instance.

        The two sources are loaded independently. When loading one of them
        raises, a warning is logged and that source's attributes are set to
        ``None`` so the rest of the class can feature-test them.
        """
        # --- Archive.org helpers ---
        try:
            from helper.archive_client import (
                check_direct_download,
                get_openlibrary_by_isbn,
                loan,
            )
        except Exception as exc:
            logger.warning(f"[UnifiedBookDownloader] Failed to load archive.org functions: {exc}")
            self.check_direct_download = None
            self.get_openlibrary_by_isbn = None
            self.loan_func = None
        else:
            self.check_direct_download = check_direct_download
            self.get_openlibrary_by_isbn = get_openlibrary_by_isbn
            self.loan_func = loan
            logger.debug("[UnifiedBookDownloader] Loaded archive.org downloaders from archive_client")

        # --- Libgen helpers ---
        try:
            from helper.libgen_service import (
                DEFAULT_LIMIT as _LIBGEN_DEFAULT_LIMIT,
                download_from_mirror as _libgen_download,
                search_libgen as _libgen_search,
            )
        except Exception as exc:
            logger.warning(f"[UnifiedBookDownloader] Failed to load Libgen helpers: {exc}")
            self.search_libgen = None
            self.download_from_mirror = None
            return

        # Route the Libgen service's log callbacks through this module's loggers.
        def _log_info(message: str) -> None:
            debug(f"[UnifiedBookDownloader] {message}")

        def _log_error(message: str) -> None:
            logger.error(f"[UnifiedBookDownloader] {message}")

        def _search(query, limit=_LIBGEN_DEFAULT_LIMIT):
            return _libgen_search(
                query,
                limit=limit,
                log_info=_log_info,
                log_error=_log_error,
            )

        def _download(mirror_url, output_path):
            return _libgen_download(
                mirror_url,
                output_path,
                log_info=_log_info,
                log_error=_log_error,
            )

        self.search_libgen = _search
        self.download_from_mirror = _download
        logger.debug("[UnifiedBookDownloader] Loaded Libgen helpers")
    
    def get_download_options(self, book_data: Dict[str, Any]) -> Dict[str, Any]:
        """Collect the download methods that appear usable for a book.

        Sources are probed in the order below, and the returned ``methods``
        list is sorted the same way (ascending ``priority`` number, 1 first):

        1. Archive.org direct download - offered when ``check_direct_download``
           reports the item as downloadable.
        2. Archive.org borrowing - offered when credentials are configured and
           the edition is reported lendable.
        3. Libgen - offered when the Libgen helpers loaded and the book has an
           ISBN or a title.

        Args:
            book_data: Book metadata. Recognised keys are ``title``,
                ``author``, ``isbn`` and ``openlibrary_id``; all are optional.
                A missing, ``None`` or non-string ``openlibrary_id`` simply
                disables the two Archive.org checks.

        Returns:
            Dict with ``book_title``, ``book_author``, ``isbn``,
            ``openlibrary_id``, ``methods`` (list of method dicts accepted by
            ``download_book``) and an empty ``metadata`` dict.
        """
        options = {
            'book_title': book_data.get('title', 'Unknown'),
            'book_author': book_data.get('author', 'Unknown'),
            'isbn': book_data.get('isbn', ''),
            'openlibrary_id': book_data.get('openlibrary_id', ''),
            'methods': [],  # Sorted by priority before returning
            'metadata': {}
        }

        # dict.get() only falls back to the default when the key is absent, so
        # an explicit None (or any non-string) must be normalised here before
        # the string methods below are used on it.
        raw_ol_id = book_data.get('openlibrary_id', '')
        ol_id = raw_ol_id if isinstance(raw_ol_id, str) else ''

        if ol_id.startswith('OL') and len(ol_id) > 2:
            # The book id is every digit after the 'OL' prefix
            # (e.g. OL8513721M -> 8513721, OL8513721W -> 8513721).
            book_id = ''.join(c for c in ol_id[2:] if c.isdigit())

            # PRIORITY 1: direct download (no auth needed)
            if self.check_direct_download:
                try:
                    can_download, pdf_url = self.check_direct_download(book_id)
                    if can_download:
                        options['methods'].append({
                            'type': 'archive.org_direct',
                            'label': 'Archive.org Direct Download',
                            'requires_auth': False,
                            'pdf_url': pdf_url,
                            'book_id': book_id,
                            'priority': 1  # Highest priority
                        })
                        logger.info(f"[UnifiedBookDownloader] Direct download available for {book_id}")
                except Exception as e:
                    # Best effort: a failing probe just means this method is not offered.
                    logger.debug(f"[UnifiedBookDownloader] Direct download check failed: {e}")

            # PRIORITY 2: borrowing (requires auth); only offered when the
            # OpenLibrary lookup reports the edition as lendable.
            if self._has_archive_credentials():
                is_lendable, status = self._check_book_lendable_status(ol_id)

                if is_lendable:
                    options['methods'].append({
                        'type': 'archive.org_borrow',
                        'label': 'Archive.org Borrow',
                        'requires_auth': True,
                        'book_id': book_id,
                        'priority': 2  # Second priority
                    })
                    logger.info(f"[UnifiedBookDownloader] Borrow option available for {book_id} (status: {status})")
                else:
                    logger.debug(f"[UnifiedBookDownloader] Borrow not available for {book_id} (status: {status})")

        # PRIORITY 3: Libgen fallback (searchable by ISBN or title, no auth needed)
        isbn = book_data.get('isbn', '')
        title = book_data.get('title', '')
        author = book_data.get('author', '')

        if self.search_libgen and (isbn or title):
            options['methods'].append({
                'type': 'libgen',
                'label': 'Libgen Search & Download',
                'requires_auth': False,
                'isbn': isbn,
                'title': title,
                'author': author,
                'priority': 3  # Third priority (fallback)
            })
            logger.info(f"[UnifiedBookDownloader] Libgen fallback available (ISBN: {isbn if isbn else 'N/A'}, Title: {title})")

        # Ascending sort: the lowest priority number (most preferred) comes first.
        options['methods'].sort(key=lambda x: x.get('priority', 999))

        return options
    
    def _has_archive_credentials(self) -> bool:
        """Report whether both an Archive.org e-mail and password are configured.

        Any failure while importing or calling the credential helper is
        treated as "no credentials".
        """
        try:
            from helper.archive_client import credential_openlibrary
            email, password = credential_openlibrary(self.config)
            if not email:
                return False
            return bool(password)
        except Exception:
            return False
    
    def _check_book_lendable_status(self, ol_id: str) -> Tuple[bool, Optional[str]]:
        """Check whether an OpenLibrary edition is reported as lendable.

        Queries ``https://openlibrary.org/api/volumes/brief/json/OLID:{ol_id}``
        through ``self.session`` and inspects the ``status`` of the first entry
        in the record's ``items`` list.

        Args:
            ol_id: OpenLibrary ID. Only Edition IDs (``OL...M``) are queried;
                Work IDs (``OL...W``) and any other form are rejected without
                a network call.

        Returns:
            ``(True, "LENDABLE")`` when the first item's status contains
            ``lendable``; otherwise ``(False, reason)``, where ``reason`` is
            the status string reported by the API when the item carries one,
            or a short fixed explanation. The method never raises: request,
            decoding and unexpected-shape errors are reported as
            ``(False, "API timeout")`` or ``(False, "API error")``.
        """
        try:
            if not ol_id.startswith('OL'):
                return False, "Invalid OpenLibrary ID format"

            # Work IDs are umbrella records, not specific editions, so they are
            # not looked up in the Volumes API.
            if ol_id.endswith('W'):
                logger.debug(f"[UnifiedBookDownloader] Work ID {ol_id} - skipping Volumes API (not lendable)")
                return False, "Work ID not supported by Volumes API (not a specific edition)"

            if not ol_id.endswith('M'):
                logger.debug(f"[UnifiedBookDownloader] Unknown ID type {ol_id} (not M or W)")
                return False, "Invalid OpenLibrary ID type"

            url = f"https://openlibrary.org/api/volumes/brief/json/OLID:{ol_id}"
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            data = response.json()

            # Empty response means no records found
            if not data:
                logger.debug(f"[UnifiedBookDownloader] Empty response for {ol_id}")
                return False, "No availability data found"

            # The payload is keyed by the same "OLID:<id>" string that was requested.
            olid_key = f"OLID:{ol_id}"
            if olid_key not in data:
                logger.debug("[UnifiedBookDownloader] OLID key not found in response")
                return False, "No availability data found"

            items = data[olid_key].get('items')
            if not items:
                logger.debug(f"[UnifiedBookDownloader] No items found for {ol_id}")
                return False, "Not available for lending"

            # Only the first item is consulted.
            first_item = items[0]
            if isinstance(first_item, dict):
                reported_status = str(first_item.get('status') or '')
                searchable = reported_status
            else:
                # Non-dict items are matched on their string form and carry no
                # separate status worth reporting.
                reported_status = ''
                searchable = str(first_item)

            if 'lendable' in searchable.lower():
                logger.info(f"[UnifiedBookDownloader] Book {ol_id} is lendable")
                return True, "LENDABLE"

            # Surface the status the API actually reported (previously this was
            # always the generic 'NOT_LENDABLE' because the dict check could
            # never succeed on a string).
            status_str = reported_status or 'NOT_LENDABLE'
            logger.debug(f"[UnifiedBookDownloader] Book {ol_id} is not lendable (status: {status_str})")
            return False, status_str

        except requests.exceptions.Timeout:
            logger.warning(f"[UnifiedBookDownloader] OpenLibrary API timeout for {ol_id}")
            return False, "API timeout"
        except Exception as e:
            logger.debug(f"[UnifiedBookDownloader] Failed to check lendable status for {ol_id}: {e}")
            return False, "API error"
    
    
    async def download_book(self, method: Dict[str, Any], output_dir: Optional[str] = None) -> Tuple[bool, str]:
        """Dispatch a download to the handler matching ``method['type']``.

        Args:
            method: One of the method dicts produced by
                ``get_download_options()``.
            output_dir: Destination directory. Falls back to the instance's
                ``output_dir`` and then to ``~/Downloads``.

        Returns:
            ``(success, message)``. Exceptions raised by a handler are logged
            and converted into a ``(False, "Download failed: ...")`` result.
        """
        target_dir = output_dir or self.output_dir or str(Path.home() / "Downloads")
        method_type = method.get('type', '')

        logger.info(f"[UnifiedBookDownloader] Starting download with method: {method_type}")

        try:
            # Ordered (type, handler) pairs, compared with == in sequence.
            routes = (
                ('archive.org_direct', self._download_archive_direct),
                ('archive.org_borrow', self._download_archive_borrow),
                ('libgen', self._download_libgen),
            )
            handler = next((fn for key, fn in routes if method_type == key), None)
            if handler is None:
                return False, f"Unknown download method: {method_type}"
            return await handler(method, target_dir)

        except Exception as exc:
            logger.error(f"[UnifiedBookDownloader] Download error: {exc}", exc_info=True)
            return False, f"Download failed: {str(exc)}"
    
    async def _download_archive_direct(self, method: Dict[str, Any], output_dir: str) -> Tuple[bool, str]:
        """Download the PDF referenced by an ``archive.org_direct`` method dict.

        Args:
            method: Method dict; ``pdf_url`` is required and ``book_id`` is
                used as the file name stem (``book`` when it is empty).
            output_dir: Destination directory. It is created, including any
                missing parents, before the download starts.

        Returns:
            ``(True, "Downloaded to: <path>")`` on success, otherwise
            ``(False, <reason>)``. Exceptions are logged and reported in the
            message rather than raised.
        """
        try:
            pdf_url = method.get('pdf_url', '')
            book_id = method.get('book_id', '')

            if not pdf_url:
                return False, "No PDF URL available"

            # Without this a fresh destination (e.g. a missing ~/Downloads)
            # makes the file write fail.
            target_dir = Path(output_dir)
            target_dir.mkdir(parents=True, exist_ok=True)

            # An empty id would otherwise produce the hidden file ".pdf".
            output_path = target_dir / f"{book_id or 'book'}.pdf"

            logger.info(f"[UnifiedBookDownloader] Downloading PDF from: {pdf_url}")

            # The blocking HTTP transfer runs in the default executor so the
            # event loop stays responsive.
            loop = asyncio.get_running_loop()
            success = await loop.run_in_executor(
                None,
                self._download_file,
                pdf_url,
                str(output_path)
            )

            if not success:
                return False, "Failed to download PDF"

            logger.info(f"[UnifiedBookDownloader] Successfully downloaded to: {output_path}")
            return True, f"Downloaded to: {output_path}"

        except Exception as e:
            logger.error(f"[UnifiedBookDownloader] Archive direct download error: {e}")
            return False, f"Archive download failed: {str(e)}"
    
    async def _download_archive_borrow(self, method: Dict[str, Any], output_dir: str) -> Tuple[bool, str]:
        """Borrow a book from Archive.org and download it (requires credentials).

        Resolves the e-mail/password pair with ``credential_openlibrary`` and
        then runs the blocking ``_archive_borrow_and_download`` worker in the
        default executor, which performs the login, loan, page download and
        PDF assembly.

        Args:
            method: Method dict; ``book_id`` identifies the item to borrow.
            output_dir: Directory the worker writes its result into.

        Returns:
            The worker's ``(success, path_or_message)`` tuple, or
            ``(False, <reason>)`` when credentials are missing, the worker
            returns something unexpected, or an exception is raised.
        """
        try:
            from helper.archive_client import credential_openlibrary

            book_id = method.get('book_id', '')

            email, password = credential_openlibrary(self.config)
            if not (email and password):
                return False, "Archive.org credentials not configured"

            logger.info("[UnifiedBookDownloader] Logging into Archive.org...")

            # Login, loan and page download are all blocking; keep them off the loop.
            borrow_result = await asyncio.get_event_loop().run_in_executor(
                None,
                self._archive_borrow_and_download,
                email,
                password,
                book_id,
                output_dir,
            )

            if not (borrow_result and isinstance(borrow_result, tuple)):
                return False, "Failed to borrow book from Archive.org"

            success, filepath = borrow_result
            if not success:
                logger.warning(f"[UnifiedBookDownloader] Borrow failed: {filepath}")
                return False, filepath

            logger.info(f"[UnifiedBookDownloader] Borrow succeeded: {filepath}")
            return True, filepath

        except Exception as exc:
            logger.error(f"[UnifiedBookDownloader] Archive borrow error: {exc}")
            return False, f"Archive borrow failed: {str(exc)}"
    
    async def _download_libgen(self, method: Dict[str, Any], output_dir: str) -> Tuple[bool, str]:
        """Download a book from Libgen, trying each result's mirror in turn.

        Results supplied by the caller under ``method['results']`` are used
        as-is; otherwise Libgen is searched with the ISBN (preferred) or the
        title. The file name comes from the first result's title, reduced to
        alphanumerics, spaces, dots and dashes (``book`` when nothing usable
        remains); the extension comes from whichever result is being tried.

        Args:
            method: Method dict with ``isbn`` and/or ``title`` and optionally
                ``results`` (list of result dicts with ``mirror_url``,
                ``title`` and ``extension`` keys).
            output_dir: Destination directory; created if missing.

        Returns:
            ``(True, <path>)`` for the first mirror that yields a non-HTML
            file, otherwise ``(False, <reason>)``. Exceptions are logged and
            reported in the message rather than raised.
        """
        try:
            isbn = method.get('isbn', '')
            title = method.get('title', '')

            if not isbn and not title:
                return False, "Need ISBN or title for Libgen search"

            search_func = self.search_libgen
            if not search_func:
                return False, "Libgen searcher not available"

            query = isbn or title
            loop = asyncio.get_running_loop()

            # Prefer results the caller already holds; search only when none were given.
            results = list(method.get('results') or [])
            if not results:
                results = await loop.run_in_executor(None, lambda: search_func(query, 10))

            if not results:
                logger.warning(f"[UnifiedBookDownloader] No Libgen results for: {query}")
                return False, f"No Libgen results found for: {query}"

            logger.info(f"[UnifiedBookDownloader] Found {len(results)} Libgen results")

            # File name stem (the first result is used for naming every attempt).
            raw_title = str(results[0].get('title') or '')
            filename = "".join(c for c in raw_title if c.isalnum() or c in (' ', '.', '-'))[:100]
            if not filename.strip(' .'):
                # Nothing usable survived sanitising; avoid names like ".pdf".
                filename = 'book'

            target_dir = Path(output_dir)
            target_dir.mkdir(parents=True, exist_ok=True)

            # Try each result's mirror until one succeeds
            for idx, result in enumerate(results, 1):
                mirror_url = result.get('mirror_url', '')

                if not mirror_url:
                    logger.debug(f"[UnifiedBookDownloader] Result {idx}: No mirror URL")
                    continue

                # Use this result's extension, defaulting to PDF.
                extension = str(result.get('extension') or 'pdf')
                if not extension.startswith('.'):
                    extension = f".{extension}"

                output_path = target_dir / (filename + extension)

                logger.info(f"[UnifiedBookDownloader] Trying mirror {idx}/{len(results)}: {mirror_url}")

                download_func = self.download_from_mirror
                if download_func is None:
                    return False, "Download function not available"

                try:
                    # Blocking download runs in the default executor.
                    success = await loop.run_in_executor(
                        None, download_func, mirror_url, str(output_path)
                    )

                    if not success:
                        logger.warning(f"[UnifiedBookDownloader] Mirror {idx} download failed, trying next...")
                        continue

                    # Mirrors sometimes answer with an HTML page instead of the file.
                    got_html = False
                    if output_path.exists():
                        try:
                            with open(output_path, 'rb') as f:
                                file_start = f.read(1024).decode('utf-8', errors='ignore').lower()
                            got_html = '<!doctype' in file_start or '<html' in file_start
                        except Exception as e:
                            logger.debug(f"[UnifiedBookDownloader] Could not validate file content: {e}")

                    if got_html:
                        logger.warning(f"[UnifiedBookDownloader] Mirror {idx} returned HTML instead of file, trying next mirror...")
                        # The file is closed by now, so removal also works on
                        # platforms that refuse to delete open files. A failed
                        # removal must not turn the HTML page into a "success".
                        try:
                            output_path.unlink()
                        except OSError as e:
                            logger.debug(f"[UnifiedBookDownloader] Could not remove HTML file {output_path}: {e}")
                        continue

                    logger.info(f"[UnifiedBookDownloader] Successfully downloaded from mirror {idx} to: {output_path}")
                    return True, str(output_path)

                except Exception as e:
                    logger.warning(f"[UnifiedBookDownloader] Mirror {idx} error: {e}, trying next...")
                    continue

            return False, f"All {len(results)} mirrors failed"

        except Exception as e:
            logger.error(f"[UnifiedBookDownloader] Libgen download error: {e}")
            return False, f"Libgen download failed: {str(e)}"

    async def download_libgen_selection(
        self,
        selected: Dict[str, Any],
        remaining: Optional[List[Dict[str, Any]]] = None,
        output_dir: Optional[str] = None,
    ) -> Tuple[bool, str]:
        """Download a chosen Libgen result, with the other results as fallbacks.

        Builds a ``libgen`` method dict whose ``results`` list starts with
        ``selected`` and continues with every dict in ``remaining`` that is
        not the very same object, then hands it to ``download_book``.

        Returns:
            ``(False, "Selected result must be a dictionary")`` for a non-dict
            selection, otherwise whatever ``download_book`` returns.
        """
        if not isinstance(selected, dict):
            return False, "Selected result must be a dictionary"

        # Identity (not equality) decides whether an entry is the selection itself.
        fallbacks: List[Dict[str, Any]] = [
            candidate
            for candidate in (remaining or [])
            if isinstance(candidate, dict) and candidate is not selected
        ]

        request: Dict[str, Any] = {
            'type': 'libgen',
            'isbn': selected.get('isbn') or '',
            'title': selected.get('title') or '',
            'author': selected.get('author') or '',
            'results': [selected, *fallbacks],
        }
        return await self.download_book(request, output_dir)

    def download_libgen_selection_sync(
        self,
        selected: Dict[str, Any],
        remaining: Optional[List[Dict[str, Any]]] = None,
        output_dir: Optional[str] = None,
    ) -> Tuple[bool, str]:
        """Blocking wrapper around ``download_libgen_selection``.

        Runs the coroutine to completion on a private event loop that is
        installed as the current loop for the duration of the call; afterwards
        the loop is closed and the current loop is reset to ``None``. Intended
        for callers that are not already inside a running event loop.
        """
        private_loop = asyncio.new_event_loop()
        try:
            asyncio.set_event_loop(private_loop)
            pending = self.download_libgen_selection(selected, remaining, output_dir)
            return private_loop.run_until_complete(pending)
        finally:
            private_loop.close()
            asyncio.set_event_loop(None)
    
    def _download_file(self, url: str, output_path: str) -> bool:
        """Stream ``url`` to ``output_path``.

        The body is written to ``<output_path>.part`` first and moved into
        place only after the whole transfer succeeded, so a failed download
        neither leaves a truncated file at ``output_path`` nor destroys a file
        that was already there.

        Args:
            url: Address to fetch (30 second timeout).
            output_path: Final location of the downloaded file.

        Returns:
            True on success; False when the request, the write or the final
            rename fails (the error is logged, never raised).
        """
        partial = Path(str(output_path) + ".part")
        try:
            # The context manager releases the streamed connection even when
            # the transfer is interrupted.
            with requests.get(url, stream=True, timeout=30) as response:
                response.raise_for_status()

                with open(partial, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

            partial.replace(output_path)
            return True
        except Exception as e:
            logger.error(f"[UnifiedBookDownloader] File download error: {e}")
            # Best-effort cleanup of the incomplete temporary file.
            try:
                if partial.exists():
                    partial.unlink()
            except OSError as cleanup_error:
                logger.debug(f"[UnifiedBookDownloader] Could not remove partial file {partial}: {cleanup_error}")
            return False
    
    def _archive_borrow_and_download(self, email: str, password: str, book_id: str, output_dir: str) -> Tuple[bool, str]:
        """Borrow a book from Archive.org and save its pages as a PDF.

        Blocking worker (it is run in an executor by
        ``_download_archive_borrow``). Steps, all delegated to helpers from
        ``helper.archive_client``:

        1. ``login(email, password)`` to obtain a session.
        2. ``loan(session, book_id, verbose=True)`` to borrow the book.
        3. ``get_book_infos(session, url)`` to obtain the title, page links
           and metadata, trying the ``/borrow/`` URL first and the
           ``/details/`` URL second.
        4. ``download(...)`` to fetch every page image into a temporary
           directory created inside ``output_dir``.
        5. ``img2pdf.convert`` to merge the images into a single PDF. img2pdf
           only embeds the images (no OCR/text layer is added). When img2pdf
           is not installed, the image directory itself is kept instead.

        Args:
            email: Archive.org account e-mail.
            password: Archive.org account password.
            book_id: Identifier passed to ``loan``/``download`` and used to
                build the archive.org URLs.
            output_dir: Directory that receives the PDF (or image folder).

        Returns:
            ``(True, <path>)`` with the PDF path or the image directory path,
            or ``(False, <message>)`` on any failure. ``SystemExit`` and other
            exceptions are caught and reported in the message.
        """
        try:
            from helper.archive_client import login, loan, get_book_infos, download
            import tempfile
            import shutil
            
            # NOTE(review): the account e-mail is written to the log at INFO level.
            logger.info(f"[UnifiedBookDownloader] Logging into Archive.org as {email}")
            session = login(email, password)
            
            logger.info(f"[UnifiedBookDownloader] Attempting to borrow book: {book_id}")
            # Call loan to create the borrow; it returns the session to keep using
            session = loan(session, book_id, verbose=True)
            
            # If we get here, loan() neither raised nor exited
            logger.info(f"[UnifiedBookDownloader] Successfully borrowed book: {book_id}")
            
            # Now get the book info (page URLs and metadata)
            logger.info(f"[UnifiedBookDownloader] Extracting book page information...")
            # Try both URL formats: with /borrow and without
            book_urls = [
                f"https://archive.org/borrow/{book_id}",  # Try borrow page first (for borrowed books)
                f"https://archive.org/details/{book_id}"   # Fallback to details page
            ]
            
            # Filled in by the first URL that get_book_infos() can parse
            title = None
            links = None
            metadata = None
            last_error = None
            
            for book_url in book_urls:
                try:
                    logger.debug(f"[UnifiedBookDownloader] Trying to get book info from: {book_url}")
                    # Cheap reachability probe before handing the URL to get_book_infos()
                    response = session.get(book_url, timeout=10)
                    
                    # Log response status
                    if response.status_code != 200:
                        logger.debug(f"[UnifiedBookDownloader] URL returned {response.status_code}: {book_url}")
                        # Continue to try next URL
                        continue
                    
                    # Try to parse the response
                    title, links, metadata = get_book_infos(session, book_url)
                    logger.info(f"[UnifiedBookDownloader] Successfully got info from: {book_url}")
                    logger.info(f"[UnifiedBookDownloader] Found {len(links)} pages to download")
                    break
                except Exception as e:
                    # Remember the failure for the final error log and try the next URL
                    logger.debug(f"[UnifiedBookDownloader] Failed with {book_url}: {e}")
                    last_error = e
                    continue
            
            if links is None:
                logger.error(f"[UnifiedBookDownloader] Failed to get book info from all URLs: {last_error}")
                # Borrow extraction failed - return False
                return False, "Could not extract borrowed book pages"
            
            # Create temporary directory for images (inside output_dir, which must already exist)
            # NOTE(review): title is used in the prefix as returned by get_book_infos();
            # presumably that helper already strips path separators - confirm.
            temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=output_dir)
            logger.info(f"[UnifiedBookDownloader] Downloading {len(links)} pages to temporary directory...")
            
            try:
                # Download all pages (n_threads=10 is passed to the helper)
                images = download(
                    session=session,
                    n_threads=10,
                    directory=temp_dir,
                    links=links,
                    scale=3,  # Default resolution
                    book_id=book_id
                )
                
                logger.info(f"[UnifiedBookDownloader] Downloaded {len(images)} pages")
                
                # Try to merge pages into PDF
                try:
                    import img2pdf
                    logger.info(f"[UnifiedBookDownloader] Merging pages into PDF...")
                    
                    # Prepare PDF metadata (passed to img2pdf.convert as keyword arguments)
                    pdfmeta = {}
                    if metadata:
                        if "title" in metadata:
                            pdfmeta["title"] = metadata["title"]
                        if "creator" in metadata:
                            pdfmeta["author"] = metadata["creator"]
                    pdfmeta["keywords"] = [f"https://archive.org/details/{book_id}"]
                    pdfmeta["creationdate"] = None  # Avoid timezone issues
                    
                    # Convert images to PDF (skipped when no images were downloaded)
                    pdf_content = img2pdf.convert(images, **pdfmeta) if images else None
                    if not pdf_content:
                        logger.error(f"[UnifiedBookDownloader] PDF conversion failed")
                        return False, "Failed to convert pages to PDF"
                    
                    # Save the PDF
                    pdf_filename = f"{title}.pdf" if title else "book.pdf"
                    pdf_path = Path(output_dir) / pdf_filename
                    
                    # Handle duplicate filenames by appending (1), (2), ...
                    i = 1
                    while pdf_path.exists():
                        pdf_path = Path(output_dir) / f"{title or 'book'}({i}).pdf"
                        i += 1
                    
                    with open(pdf_path, 'wb') as f:
                        f.write(pdf_content)
                    
                    logger.info(f"[UnifiedBookDownloader] Successfully created PDF: {pdf_path}")
                    
                    return True, str(pdf_path)
                
                # NOTE(review): this catches any ImportError raised inside the
                # try body above, not only a missing img2pdf.
                except ImportError:
                    logger.warning(f"[UnifiedBookDownloader] img2pdf not available, saving as JPG collection instead")
                    
                    # Create JPG collection directory
                    if not title:
                        title = f"book_{book_id}"
                    jpg_dir = Path(output_dir) / title
                    i = 1
                    while jpg_dir.exists():
                        jpg_dir = Path(output_dir) / f"{title}({i})"
                        i += 1
                    
                    # Move temporary directory to final location
                    shutil.move(temp_dir, str(jpg_dir))
                    temp_dir = None  # Mark as already moved so the finally block leaves it alone
                    
                    logger.info(f"[UnifiedBookDownloader] Saved as JPG collection: {jpg_dir}")
                    return True, str(jpg_dir)
            
            finally:
                # Clean up temporary directory if it still exists
                # (runs on success, on failure and on the early returns above)
                if temp_dir and Path(temp_dir).exists():
                    shutil.rmtree(temp_dir)
        
        except SystemExit:
            # Presumably loan() calls sys.exit on failure (per the original
            # author's note); catch it so the worker thread reports a result
            logger.error(f"[UnifiedBookDownloader] Borrow process exited (book may not be borrowable)")
            return False, "Book could not be borrowed (may not be available for borrowing)"
        except Exception as e:
            logger.error(f"[UnifiedBookDownloader] Archive borrow error: {e}")
            return False, f"Borrow failed: {str(e)}"
    
    def close(self) -> None:
        """Release the HTTP session held by this downloader."""
        http_session = self.session
        http_session.close()