from __future__ import annotations

import shutil
import sys
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import requests

from ProviderCore.base import SearchProvider, SearchResult
from ProviderCore.download import download_file, sanitize_filename
from cli_syntax import get_field, get_free_text, parse_query
from SYS.logger import log
from SYS.utils import unique_path


def _looks_like_isbn(text: str) -> bool:
    """Return True when *text* has the shape of an ISBN-10 or ISBN-13.

    Hyphens are ignored. An ISBN-13 must be 13 digits; an ISBN-10 must be
    9 digits followed by a digit or the ``X`` check character (either case).
    Only the shape is checked, not the checksum.
    """
    t = (text or "").replace("-", "").strip().upper()
    if len(t) == 13:
        return t.isdigit()
    if len(t) == 10:
        # The ISBN-10 check digit may legitimately be "X" (value 10).
        return t[:9].isdigit() and (t[9].isdigit() or t[9] == "X")
    return False


def _first_str(value: Any) -> Optional[str]:
    """Return a stripped string from *value*, or None.

    *value* may be a string or a non-empty list, in which case only its first
    element is considered. Blank strings yield None; a non-string first
    element is converted with ``str()`` unless it is None.
    """
    if isinstance(value, str):
        v = value.strip()
        return v if v else None
    if isinstance(value, list) and value:
        first = value[0]
        if isinstance(first, str):
            v = first.strip()
            return v if v else None
        return str(first) if first is not None else None
    return None


def _as_str_list(value: Any) -> List[str]:
    """Coerce *value* into a list of strings, dropping falsy entries.

    A bare string becomes a one-element list; anything that is not a string
    or a list yields an empty list.
    """
    if isinstance(value, str):
        value = [value]
    if not isinstance(value, list):
        return []
    return [str(item) for item in value if item]


def _resolve_edition_id(doc: Dict[str, Any]) -> str:
    """Extract an edition identifier from a search result *doc*, or return ""."""
    # OpenLibrary Search API typically provides edition_key: ["OL...M", ...]
    edition_key = doc.get("edition_key")
    if isinstance(edition_key, list) and edition_key:
        return str(edition_key[0]).strip()

    # Fallback: sometimes key can be /books/OL...M
    key = doc.get("key")
    if isinstance(key, str) and key.startswith("/books/"):
        return key.split("/books/", 1)[1].strip("/")

    return ""


def _check_lendable(session: requests.Session, edition_id: str) -> Tuple[bool, str]:
    """Return (lendable, status_text) using OpenLibrary volumes API.

    Never raises: a timeout yields ``(False, "api-timeout")`` and any other
    failure yields ``(False, "api-error")``. Only the first entry of the
    response's ``items`` list is inspected.
    """
    try:
        if not edition_id or not edition_id.startswith("OL") or not edition_id.endswith("M"):
            return False, "not-an-edition"

        url = f"https://openlibrary.org/api/volumes/brief/json/OLID:{edition_id}"
        resp = session.get(url, timeout=10)
        resp.raise_for_status()
        data = resp.json() or {}

        wrapped = data.get(f"OLID:{edition_id}")
        if not isinstance(wrapped, dict):
            return False, "no-availability"

        items = wrapped.get("items")
        if not isinstance(items, list) or not items:
            return False, "no-items"

        first = items[0]
        status_val = str(first.get("status", "")) if isinstance(first, dict) else str(first)
        return ("lendable" in status_val.lower()), status_val
    except requests.exceptions.Timeout:
        return False, "api-timeout"
    except Exception:
        return False, "api-error"


def _resolve_archive_id(session: requests.Session, edition_id: str, ia_candidates: List[str]) -> str:
    """Return an Archive.org identifier for the edition, or "" if none is found.

    Identifiers already present in *ia_candidates* win (first non-blank one);
    otherwise the edition JSON is fetched and its ``ocaid`` field, then
    ``identifiers["internet_archive"]``, are consulted. Lookup failures are
    logged and result in "".
    """
    # Prefer IA identifiers already present in search results.
    for candidate in ia_candidates or []:
        cleaned = str(candidate).strip()
        if cleaned:
            return cleaned

    # Otherwise query the edition JSON.
    try:
        resp = session.get(f"https://openlibrary.org/books/{edition_id}.json", timeout=10)
        resp.raise_for_status()
        data = resp.json() or {}

        ocaid = data.get("ocaid")
        if isinstance(ocaid, str) and ocaid.strip():
            return ocaid.strip()

        identifiers = data.get("identifiers")
        if isinstance(identifiers, dict):
            ia_id = _first_str(identifiers.get("internet_archive"))
            if ia_id:
                return ia_id
    except Exception as exc:
        # Best-effort lookup: report the failure but fall through to "".
        log(f"[openlibrary] Archive id lookup failed: {exc}", file=sys.stderr)

    return ""


class OpenLibrary(SearchProvider):
    """Search provider for OpenLibrary books + Archive.org direct/borrow download."""

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)
        # One HTTP session shared by every OpenLibrary request of this provider.
        self._session = requests.Session()

    def search(
        self,
        query: str,
        limit: int = 50,
        filters: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[SearchResult]:
        """Search openlibrary.org and return up to *limit* results.

        The query is parsed for ``isbn``, ``title`` and ``author`` fields; the
        first non-empty one (in that order), then the free text, then the raw
        query is sent. ISBN-shaped terms are sent as ``isbn:<digits>``.
        *filters* and extra keyword arguments are accepted for interface
        compatibility but are not used. Request failures are logged and
        produce an empty list.
        """
        parsed = parse_query(query)
        isbn = get_field(parsed, "isbn")
        author = get_field(parsed, "author")
        title = get_field(parsed, "title")
        free_text = get_free_text(parsed)

        q = (isbn or title or author or free_text or query or "").strip()
        if not q:
            return []

        if _looks_like_isbn(q):
            # Upper-case so an ISBN-10 "x" check digit is sent as "X".
            q = f"isbn:{q.replace('-', '').upper()}"

        try:
            resp = self._session.get(
                "https://openlibrary.org/search.json",
                params={"q": q, "limit": int(limit)},
                timeout=10,
            )
            resp.raise_for_status()
            data = resp.json() or {}
        except Exception as exc:
            log(f"[openlibrary] Search failed: {exc}", file=sys.stderr)
            return []

        results: List[SearchResult] = []
        docs = data.get("docs") or []
        if not isinstance(docs, list):
            return []

        for doc in docs[: int(limit)]:
            if not isinstance(doc, dict):
                continue

            book_title = str(doc.get("title") or "").strip() or "Unknown"
            authors_list = _as_str_list(doc.get("author_name"))

            year_val = doc.get("first_publish_year")
            year = str(year_val) if year_val is not None else ""

            edition_id = _resolve_edition_id(doc)
            ia_ids = _as_str_list(doc.get("ia"))

            # ISBNs are classified purely by length (13 vs 10 characters).
            isbn_list = _as_str_list(doc.get("isbn"))
            isbn_13 = next((i for i in isbn_list if len(i) == 13), "")
            isbn_10 = next((i for i in isbn_list if len(i) == 10), "")

            columns = [
                ("Title", book_title),
                ("Author", ", ".join(authors_list)),
                ("Year", year),
                ("OLID", edition_id),
            ]

            annotations: List[str] = []
            if isbn_13:
                annotations.append(f"isbn_13:{isbn_13}")
            elif isbn_10:
                annotations.append(f"isbn_10:{isbn_10}")
            if ia_ids:
                annotations.append("archive")

            results.append(
                SearchResult(
                    table="openlibrary",
                    title=book_title,
                    path=(
                        f"https://openlibrary.org/books/{edition_id}"
                        if edition_id
                        else "https://openlibrary.org"
                    ),
                    detail=(
                        (f"By: {', '.join(authors_list)}" if authors_list else "")
                        + (f" ({year})" if year else "")
                    ).strip(),
                    annotations=annotations,
                    media_kind="book",
                    columns=columns,
                    full_metadata={
                        "openlibrary_id": edition_id,
                        "authors": authors_list,
                        "year": year,
                        "isbn_10": isbn_10,
                        "isbn_13": isbn_13,
                        "ia": ia_ids,
                        "raw": doc,
                    },
                )
            )

        return results

    def download(self, result: SearchResult, output_dir: Path) -> Optional[Path]:
        """Download the book described by *result* into *output_dir*.

        A direct PDF download is attempted first. If that is not offered, the
        borrow flow runs: credentials are read from the provider config, the
        edition is checked for lendability, the book is loaned and its page
        images are fetched into a temporary folder inside *output_dir*, then
        converted to a single PDF with ``img2pdf``.

        Returns the PDF path on success, the folder of page images when
        ``img2pdf`` cannot be imported, or None on any failure (failures are
        logged to stderr rather than raised).
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        meta = result.full_metadata or {}
        edition_id = str(meta.get("openlibrary_id") or "").strip()
        if not edition_id:
            log("[openlibrary] Missing openlibrary_id; cannot download", file=sys.stderr)
            return None

        ia_candidates = _as_str_list(meta.get("ia"))
        archive_id = _resolve_archive_id(self._session, edition_id, ia_candidates)
        if not archive_id:
            log("[openlibrary] No archive identifier available; cannot download", file=sys.stderr)
            return None

        safe_title = sanitize_filename(result.title)

        # 1) Direct download if available.
        try:
            from API.archive_client import check_direct_download

            can_direct, pdf_url = check_direct_download(archive_id)
        except Exception:
            # Treat any failure of the check as "no direct download".
            can_direct, pdf_url = False, ""

        if can_direct and pdf_url:
            out_path = unique_path(output_dir / f"{safe_title}.pdf")
            ok = download_file(pdf_url, out_path, session=self._session)
            if ok:
                return out_path
            log("[openlibrary] Direct download failed", file=sys.stderr)
            return None

        # 2) Borrow flow (credentials required).
        try:
            from API.archive_client import BookNotAvailableError, credential_openlibrary, download as archive_download
            from API.archive_client import get_book_infos, loan, login

            email, password = credential_openlibrary(self.config or {})
            if not email or not password:
                log("[openlibrary] Archive credentials missing; cannot borrow", file=sys.stderr)
                return None

            lendable, reason = _check_lendable(self._session, edition_id)
            if not lendable:
                log(f"[openlibrary] Not lendable: {reason}", file=sys.stderr)
                return None

            session = login(email, password)
            try:
                session = loan(session, archive_id, verbose=False)
            except BookNotAvailableError:
                log("[openlibrary] Book not available to borrow", file=sys.stderr)
                return None
            except SystemExit:
                # SystemExit is not an Exception subclass, so it needs its own
                # handler to keep it from terminating the caller.
                log("[openlibrary] Borrow failed", file=sys.stderr)
                return None

            urls = [f"https://archive.org/borrow/{archive_id}", f"https://archive.org/details/{archive_id}"]
            title = safe_title
            links: Optional[List[str]] = None
            last_exc: Optional[Exception] = None
            for u in urls:
                try:
                    title_raw, links, _metadata = get_book_infos(session, u)
                except Exception as exc:
                    last_exc = exc
                    continue
                if title_raw:
                    title = sanitize_filename(title_raw)
                if links:
                    # Stop only once page links were obtained; an empty result
                    # falls through to the next candidate URL.
                    break

            if not links:
                log(f"[openlibrary] Failed to extract pages: {last_exc}", file=sys.stderr)
                return None

            temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=str(output_dir))
            try:
                images = archive_download(
                    session=session,
                    n_threads=10,
                    directory=temp_dir,
                    links=links,
                    scale=3,
                    book_id=archive_id,
                )

                try:
                    import img2pdf  # type: ignore

                    pdf_bytes = img2pdf.convert(images) if images else None
                    if not pdf_bytes:
                        log("[openlibrary] PDF conversion failed", file=sys.stderr)
                        shutil.rmtree(temp_dir, ignore_errors=True)
                        return None

                    pdf_path = unique_path(output_dir / f"{title}.pdf")
                    with open(pdf_path, "wb") as f:
                        f.write(pdf_bytes)

                    shutil.rmtree(temp_dir, ignore_errors=True)
                    return pdf_path
                except ImportError:
                    # Keep images folder.
                    return Path(temp_dir)
            except Exception:
                # Remove the partial page folder, then let the outer handler log.
                shutil.rmtree(temp_dir, ignore_errors=True)
                raise
        except Exception as exc:
            log(f"[openlibrary] Borrow workflow error: {exc}", file=sys.stderr)
            return None

    def validate(self) -> bool:
        """Always report the provider as usable."""
        return True