Medios-Macina/Provider/openlibrary.py

from __future__ import annotations

import base64
import io
from concurrent import futures
import hashlib
import json as json_module
import re
import shutil
import sys
import tempfile
import time
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple
from urllib.parse import urlparse

import requests

from API.HTTP import HTTPClient
from ProviderCore.base import Provider, SearchResult
from ProviderCore.download import download_file, sanitize_filename
from SYS.cli_syntax import get_field, get_free_text, parse_query
from SYS.logger import debug, log
from SYS.utils import unique_path

try:
    from Crypto.Cipher import AES  # type: ignore
    from Crypto.Util import Counter  # type: ignore
except ImportError:
    AES = None  # type: ignore
    Counter = None  # type: ignore

try:
    from tqdm import tqdm  # type: ignore
except ImportError:
    tqdm = None  # type: ignore


def _image_paths_to_pdf_bytes(images: List[str]) -> Optional[bytes]:
    """Combine image files into one in-memory PDF, one page per image.

    Args:
        images: Paths of the page images, in page order. Entries that do not
            point to an existing file are skipped.

    Returns:
        The PDF document as bytes, or ``None`` when ``images`` is empty,
        Pillow cannot be imported, none of the paths is an existing file, or
        loading an image / writing the PDF fails.
    """
    if not images:
        return None
    try:
        from PIL import Image  # type: ignore
    except Exception:
        # Pillow is optional; without it PDF assembly is unavailable.
        return None

    pil_images: List[Any] = []
    try:
        for p in images:
            img_path = Path(p)
            if not img_path.is_file():
                continue
            with Image.open(img_path) as im:  # type: ignore[attr-defined]
                # convert() returns a new, fully loaded image, so it stays
                # usable after the source file is closed and needs no extra
                # copy. Every page is normalised to RGB for the PDF writer.
                pil_images.append(im.convert("RGB"))

        if not pil_images:
            return None

        buf = io.BytesIO()
        first, rest = pil_images[0], pil_images[1:]
        first.save(buf, format="PDF", save_all=True, append_images=rest)
        return buf.getvalue()
    except Exception:
        # Best-effort helper: any load/save failure means "no PDF".
        return None
    finally:
        # Release every page image on all exit paths.
        for im in pil_images:
            try:
                im.close()
            except Exception:
                pass


def _looks_like_isbn(text: str) -> bool:
    """Return True when ``text`` has the shape of an ISBN-10 or ISBN-13.

    Hyphens and surrounding whitespace are ignored. Only the shape is checked
    (length and allowed characters); the check digit itself is not verified.

    An ISBN-13 is 13 ASCII digits. An ISBN-10 is 9 ASCII digits followed by a
    check character that is a digit or ``X``/``x``.
    """
    t = (text or "").replace("-", "").strip()
    # str.isdigit() also accepts non-ASCII digits, which never occur in ISBNs.
    if not t.isascii():
        return False
    if len(t) == 13:
        return t.isdigit()
    if len(t) == 10:
        return t[:9].isdigit() and (t[9].isdigit() or t[9] in "xX")
    return False


def _first_str(value: Any) -> Optional[str]:
    """Return a usable string from ``value`` or from the head of a list.

    A string (or a string at index 0 of a list) is stripped and returned, or
    ``None`` if nothing is left. A non-string, non-``None`` list head is
    returned as ``str(head)`` without stripping. Anything else gives ``None``.
    """
    if isinstance(value, str):
        candidate = value
    elif isinstance(value, list):
        if not value:
            return None
        candidate = value[0]
        if candidate is None:
            return None
        if not isinstance(candidate, str):
            return str(candidate)
    else:
        return None

    stripped = candidate.strip()
    return stripped or None


def _resolve_edition_id(doc: Dict[str, Any]) -> str:
    """Pick an edition identifier from an OpenLibrary search document.

    Sources are tried in this order:

    1. ``edition_key`` (a string or a list): the first entry that is not
       ``None`` and not blank after stripping.
    2. ``cover_edition_key`` when it is a non-blank string.
    3. ``key`` when it has the form ``/books/<id>``.

    Args:
        doc: One search result document.

    Returns:
        The identifier, or ``""`` when none of the sources yields one.
    """
    # OpenLibrary Search API typically provides edition_key: ["OL...M", ...]
    edition_key = doc.get("edition_key")
    if isinstance(edition_key, str):
        candidates: List[Any] = [edition_key]
    elif isinstance(edition_key, list):
        candidates = edition_key
    else:
        candidates = []
    for candidate in candidates:
        # Skip blank/None entries instead of returning "" or "None" so the
        # later entries and the fallbacks below still get a chance.
        if candidate is None:
            continue
        cleaned = str(candidate).strip()
        if cleaned:
            return cleaned

    # Often present even when edition_key is missing.
    cover_edition_key = doc.get("cover_edition_key")
    if isinstance(cover_edition_key, str) and cover_edition_key.strip():
        return cover_edition_key.strip()

    # Fallback: sometimes key can be /books/OL...M
    key = doc.get("key")
    if isinstance(key, str) and key.startswith("/books/"):
        return key.split("/books/", 1)[1].strip("/")

    return ""


def _check_lendable(session: requests.Session, edition_id: str) -> Tuple[bool, str]:
    """Ask the OpenLibrary volumes API whether an edition can be borrowed.

    Args:
        session: HTTP session used for the request.
        edition_id: Edition identifier; must start with ``OL`` and end with
            ``M``, otherwise no request is made.

    Returns:
        ``(lendable, status_text)``. ``status_text`` is the status of the first
        item in the API answer, or one of the fixed markers
        ``not-an-edition``, ``no-availability``, ``no-items``, ``api-timeout``
        and ``api-error``. ``lendable`` is True only when the status text
        contains "lendable" (case-insensitive).
    """
    try:
        is_edition = (
            bool(edition_id)
            and edition_id.startswith("OL")
            and edition_id.endswith("M")
        )
        if not is_edition:
            return False, "not-an-edition"

        url = f"https://openlibrary.org/api/volumes/brief/json/OLID:{edition_id}"
        reply = session.get(url, timeout=6)
        reply.raise_for_status()
        payload = reply.json() or {}

        # The answer is keyed by the bibkey that was requested.
        record = payload.get(f"OLID:{edition_id}")
        if not isinstance(record, dict):
            return False, "no-availability"

        items = record.get("items")
        if not (isinstance(items, list) and items):
            return False, "no-items"

        head = items[0]
        if isinstance(head, dict):
            status_val = str(head.get("status", ""))
        else:
            status_val = str(head)

        return ("lendable" in status_val.lower()), status_val
    except requests.exceptions.Timeout:
        return False, "api-timeout"
    except Exception:
        # Any other failure (HTTP error, bad JSON, unexpected shape).
        return False, "api-error"


def _resolve_archive_id(
    session: requests.Session,
    edition_id: str,
    ia_candidates: List[str]
) -> str:
    """Find the Archive.org identifier that belongs to an edition.

    Args:
        session: HTTP session used when the edition JSON must be fetched.
        edition_id: OpenLibrary edition id used for the lookup request.
        ia_candidates: Archive.org identifiers already known (for example
            from search results); the first non-blank one wins.

    Returns:
        The identifier, or ``""`` when none is known and the lookup yields
        nothing or fails.
    """
    # Prefer IA identifiers already present in search results. Blank entries
    # are skipped so a usable later candidate avoids a network round trip.
    for candidate in ia_candidates or []:
        cleaned = str(candidate or "").strip()
        if cleaned:
            return cleaned

    # Without an edition id the lookup URL cannot resolve; skip the request.
    if not str(edition_id or "").strip():
        return ""

    # Otherwise query the edition JSON.
    try:
        resp = session.get(
            f"https://openlibrary.org/books/{edition_id}.json",
            timeout=6
        )
        resp.raise_for_status()
        data = resp.json() or {}

        ocaid = data.get("ocaid")
        if isinstance(ocaid, str) and ocaid.strip():
            return ocaid.strip()

        identifiers = data.get("identifiers")
        if isinstance(identifiers, dict):
            ia = identifiers.get("internet_archive")
            ia_id = _first_str(ia)
            if ia_id:
                return ia_id

    except Exception:
        # Best-effort lookup: network/JSON problems mean "unknown".
        pass

    return ""


def _archive_id_from_url(url: str) -> str:
    """Best-effort extraction of an Archive.org item identifier from a URL.

    Only URLs whose host is ``archive.org`` or one of its subdomains are
    considered. Recognised shapes:

    - ``/details/<id>/...``, ``/borrow/<id>``, ``/download/<id>/...``,
      ``/stream/<id>/...`` -> ``<id>``
    - ``/<id>`` -> ``<id>``, unless the segment is a known site route
      (``account``, ``services``, ``search``, ``advancedsearch.php``)

    Returns:
        The identifier, or ``""`` when the URL is empty, malformed, on another
        host, or carries no identifier.
    """
    u = str(url or "").strip()
    if not u:
        return ""

    try:
        p = urlparse(u)
    except ValueError:
        return ""
    host = (p.hostname or "").lower().strip()
    # Exact host or real subdomain only: a plain suffix test would also accept
    # unrelated hosts such as "notarchive.org".
    if host != "archive.org" and not host.endswith(".archive.org"):
        return ""
    parts = [x for x in (p.path or "").split("/") if x]
    if not parts:
        return ""

    item_routes = {"details", "borrow", "download", "stream"}
    site_routes = {"account", "services", "search", "advancedsearch.php"}

    first = str(parts[0]).strip()
    if first.lower() in item_routes:
        # A bare "/details" has no identifier; never return the route itself.
        return str(parts[1]).strip() if len(parts) >= 2 else ""

    # Sometimes the identifier is the first segment.
    if first and first.lower() not in site_routes:
        return first

    return ""


def _coerce_archive_field_list(value: Any) -> List[str]:
    """Coerce an Archive.org metadata field to a list of strings.

    ``None`` gives an empty list. A list, tuple or set is converted element by
    element; any other value is treated as a single element. Each element is
    passed through ``str()`` and stripped; elements that are blank afterwards,
    or whose conversion raises, are dropped.
    """
    if value is None:
        return []

    # One code path for containers and scalars: a scalar is a 1-item sequence.
    items = value if isinstance(value, (list, tuple, set)) else [value]

    out: List[str] = []
    for item in items:
        try:
            text = str(item).strip()
        except Exception:
            # Objects with a failing __str__ are skipped, not fatal.
            continue
        if text:
            out.append(text)
    return out


def _archive_item_metadata_to_tags(archive_id: str,
                                   item_metadata: Dict[str, Any]) -> List[str]:
    """Turn an Archive.org ``metadata`` object into a list of tag strings.

    Tags are emitted in a fixed order (archive id, title, authors, publisher,
    dates, language, subjects, ISBNs, other identifiers) and de-duplicated
    case-insensitively, keeping the first spelling seen. Subjects are added
    without a namespace; everything else is ``namespace:value``.

    Args:
        archive_id: Archive.org item identifier; emitted as
            ``internet_archive:<id>`` and excluded from generic identifiers.
        item_metadata: The item's ``metadata`` mapping. Anything that is not a
            dict is treated as empty.

    Returns:
        The ordered, de-duplicated tag list.
    """
    own_id = str(archive_id or "").strip()
    fields = item_metadata if isinstance(item_metadata, dict) else {}

    collected: List[str] = []
    seen_lower: set[str] = set()

    def _push(tag: str) -> None:
        # Append a tag unless it is blank or already present (ignoring case).
        try:
            text = str(tag).strip()
        except Exception:
            return
        key = text.lower()
        if text and key not in seen_lower:
            seen_lower.add(key)
            collected.append(text)

    def _values(field: str, limit: Optional[int] = None) -> List[str]:
        # Read one metadata field as a list of strings, optionally capped.
        found = _coerce_archive_field_list(fields.get(field))
        return found if limit is None else found[:limit]

    def _strip_isbn(raw: str) -> str:
        return str(raw or "").replace("-", "").strip()

    if own_id:
        _push(f"internet_archive:{own_id}")

    # Title
    for title in _values("title", 1):
        _push(f"title:{title}")

    # Authors/creators: at most three across both fields, "creator" first.
    for creator in (_values("creator") + _values("author"))[:3]:
        _push(f"author:{creator}")

    # Publisher, publish date/year, language: (field, namespace, cap).
    for field, namespace, cap in (
        ("publisher", "publisher", 3),
        ("date", "publish_date", 1),
        ("year", "publish_date", 1),
        ("language", "language", 3),
    ):
        for entry in _values(field, cap):
            _push(f"{namespace}:{entry}")

    # Topics/subjects: follow existing OpenLibrary behavior (un-namespaced tags)
    for subject in _values("subject", 15):
        _push(subject[:200])

    # ISBNs and identifiers
    for isbn in _values("isbn", 10):
        isbn_value = _strip_isbn(isbn)
        if isbn_value:
            _push(f"isbn:{isbn_value}")

    # (lower-case prefix, tag namespace, value is an ISBN)
    known_prefixes = (
        ("urn:isbn:", "isbn", True),
        ("isbn:", "isbn", True),
        ("urn:oclc:", "oclc", False),
        ("oclc:", "oclc", False),
        ("urn:lccn:", "lccn", False),
        ("lccn:", "lccn", False),
        ("doi:", "doi", False),
    )
    generic_budget = 5
    for raw_ident in _values("identifier") + _values("external-identifier"):
        ident_text = str(raw_ident or "").strip()
        if not ident_text:
            continue
        lowered = ident_text.lower()

        matched = next(
            (row for row in known_prefixes if lowered.startswith(row[0])),
            None,
        )
        if matched is not None:
            prefix, namespace, is_isbn = matched
            # Split on exactly as many colons as the prefix contains, so the
            # value keeps any further colons of its own.
            tail = ident_text.split(":", prefix.count(":"))[-1]
            value = _strip_isbn(tail) if is_isbn else tail.strip()
            if value:
                _push(f"{namespace}:{value}")
            continue

        # The item's own id is already covered by the internet_archive tag.
        if own_id and lowered == own_id.lower():
            continue
        # At most five unrecognised identifiers are attempted.
        if generic_budget <= 0:
            continue
        _push(f"identifier:{ident_text[:200]}")
        generic_budget -= 1

    return collected


def _fetch_archive_item_metadata(archive_id: str,
                                 *,
                                 timeout: int = 8) -> Dict[str, Any]:
    """Download the ``metadata`` object of an Archive.org item.

    Args:
        archive_id: Item identifier; blank or ``None`` skips the request.
        timeout: Request timeout in seconds (coerced with ``int``).

    Returns:
        The ``metadata`` mapping from ``https://archive.org/metadata/<id>``,
        or ``{}`` when the identifier is blank or the answer lacks a dict
        under ``metadata``.

    Raises:
        Whatever ``requests.get`` / ``raise_for_status`` / ``json`` raise;
        errors are not caught here.
    """
    item_id = str(archive_id or "").strip()
    if item_id == "":
        return {}

    response = requests.get(
        f"https://archive.org/metadata/{item_id}",
        timeout=int(timeout),
    )
    response.raise_for_status()

    payload = {} if response is None else response.json()
    if isinstance(payload, dict):
        section = payload.get("metadata")
        if isinstance(section, dict):
            return section
    return {}


class OpenLibrary(Provider):
    # Domains that should be routed to this provider when the user supplies a URL.
    # (Used by ProviderCore.registry.match_provider_name_for_url)
    URL_DOMAINS = (
        "openlibrary.org",
        "archive.org",
    )
    """Search provider for OpenLibrary books + Archive.org direct/borrow download."""

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialise the provider and create its HTTP session.

        Args:
            config: Optional configuration mapping, passed unchanged to the
                ``Provider`` base-class initialiser.
        """
        super().__init__(config)
        # A single requests.Session is stored on the instance for later use.
        self._session = requests.Session()

    class BookNotAvailableError(Exception):
        """Raised when a book is not available for borrowing (waitlisted/in use).

        ``_archive_loan`` raises it when the loan service answers HTTP 400 with
        its "not available to borrow at this time" error message.
        """

    @staticmethod
    def _credential_archive(config: Dict[str,
                                         Any]) -> Tuple[Optional[str],
                                                        Optional[str]]:
        """Get Archive.org email/password from config.

        Supports:
        - New: {"provider": {"openlibrary": {"email": "...", "password": "..."}}}
        - Old: {"Archive": {"email": "...", "password": "..."}}
               {"archive_org_email": "...", "archive_org_password": "..."}
        """
        if not isinstance(config, dict):
            return None, None

        provider_config = config.get("provider",
                                     {})
        if isinstance(provider_config, dict):
            openlibrary_config = provider_config.get("openlibrary",
                                                     {})
            if isinstance(openlibrary_config, dict):
                email = openlibrary_config.get("email")
                password = openlibrary_config.get("password")
                if email or password:
                    return str(email) if email is not None else None, (
                        str(password) if password is not None else None
                    )

        archive_config = config.get("Archive")
        if isinstance(archive_config, dict):
            email = archive_config.get("email")
            password = archive_config.get("password")
            if email or password:
                return str(email) if email is not None else None, (
                    str(password) if password is not None else None
                )

        email = config.get("archive_org_email")
        password = config.get("archive_org_password")
        return str(email) if email is not None else None, (
            str(password) if password is not None else None
        )

    @staticmethod
    def _archive_error_body(response: requests.Response) -> str:
        try:
            body = response.text or ""
        except Exception:
            return ""
        if len(body) > 2000:
            return body[:1200] + "\n... (truncated) ...\n" + body[-400:]
        return body

    @classmethod
    def _archive_login(cls, email: str, password: str) -> requests.Session:
        """Login to archive.org using the token-based services endpoint (matches test-login.py).

        A fresh session first GETs the login endpoint to obtain a token, then
        POSTs the credentials together with that token to the same endpoint.

        Args:
            email: Account email, sent as ``username``.
            password: Account password.

        Returns:
            The ``requests.Session`` used for both requests.

        Raises:
            RuntimeError: If either response is not JSON, the token request is
                not successful, the token is missing, or the login answer
                carries ``success: false`` (with a dedicated message for
                ``bad_login``).
        """
        session = requests.Session()

        # Step 1: fetch the login token.
        token_resp = session.get(
            "https://archive.org/services/account/login/",
            timeout=30
        )
        try:
            token_json = token_resp.json()
        except Exception as exc:
            raise RuntimeError(
                f"Archive login token parse failed: {exc}\n{cls._archive_error_body(token_resp)}"
            )

        if not token_json.get("success"):
            raise RuntimeError(
                f"Archive login token fetch failed\n{cls._archive_error_body(token_resp)}"
            )

        token = (token_json.get("value") or {}).get("token")
        if not token:
            raise RuntimeError("Archive login token missing")

        # Step 2: post the credentials plus token.
        # NOTE(review): the body is JSON-encoded while the Content-Type header
        # says form-urlencoded; presumably this mirrors what the endpoint
        # accepts -- confirm before "fixing" either side.
        headers = {
            "Content-Type": "application/x-www-form-urlencoded"
        }
        payload = {
            "username": email,
            "password": password,
            "t": token
        }

        login_resp = session.post(
            "https://archive.org/services/account/login/",
            headers=headers,
            data=json_module.dumps(payload),
            timeout=30,
        )

        try:
            login_json = login_resp.json()
        except Exception as exc:
            raise RuntimeError(
                f"Archive login parse failed: {exc}\n{cls._archive_error_body(login_resp)}"
            )

        # Only an explicit ``success: false`` is treated as failure; an answer
        # without a "success" key is accepted.
        if login_json.get("success") is False:
            if login_json.get("value") == "bad_login":
                raise RuntimeError("Invalid Archive.org credentials")
            raise RuntimeError(f"Archive login failed: {login_json}")

        return session

    @classmethod
    def _archive_loan(
        cls,
        session: requests.Session,
        book_id: str,
        *,
        verbose: bool = True
    ) -> requests.Session:
        data = {
            "action": "grant_access",
            "identifier": book_id
        }
        session.post(
            "https://archive.org/services/loans/loan/searchInside.php",
            data=data,
            timeout=30
        )
        data["action"] = "browse_book"
        response = session.post(
            "https://archive.org/services/loans/loan/",
            data=data,
            timeout=30
        )

        if response.status_code == 400:
            try:
                err = (response.json() or {}).get("error")
                if (err ==
                        "This book is not available to borrow at this time. Please try again later."
                    ):
                    raise cls.BookNotAvailableError("Book is waitlisted or in use")
                raise RuntimeError(f"Borrow failed: {err or response.text}")
            except cls.BookNotAvailableError:
                raise
            except Exception:
                raise RuntimeError("The book cannot be borrowed")

        data["action"] = "create_token"
        response = session.post(
            "https://archive.org/services/loans/loan/",
            data=data,
            timeout=30
        )
        if "token" in (response.text or ""):
            return session
        raise RuntimeError("Something went wrong when trying to borrow the book")

    @staticmethod
    def _archive_return_loan(session: requests.Session, book_id: str) -> None:
        data = {
            "action": "return_loan",
            "identifier": book_id
        }
        response = session.post(
            "https://archive.org/services/loans/loan/",
            data=data,
            timeout=30
        )
        if response.status_code == 200:
            try:
                if (response.json() or {}).get("success"):
                    return
            except Exception:
                pass
        raise RuntimeError("Something went wrong when trying to return the book")

    @staticmethod
    def _archive_logout(session: requests.Session) -> None:
        """Best-effort logout from archive.org.

        Archive sessions are cookie-based; returning the loan is the critical step.
        Logout is attempted for cleanliness but failures should not abort the workflow.
        """

        if session is None:
            return
        for url in (
                "https://archive.org/account/logout",
                "https://archive.org/account/logout.php",
        ):
            try:
                resp = session.get(url, timeout=15, allow_redirects=True)
                code = int(getattr(resp, "status_code", 0) or 0)
                if code and code < 500:
                    return
            except Exception:
                continue

    @staticmethod
    def _archive_is_lendable(book_id: str) -> tuple[bool, str]:
        """Heuristic lendable check using Archive.org item metadata.

        Some lendable items do not map cleanly to an OpenLibrary edition id.
        In practice, Archive metadata collections often include markers like:
        - inlibrary
        - printdisabled
        """

        ident = str(book_id or "").strip()
        if not ident:
            return False, "no-archive-id"
        try:
            resp = requests.get(f"https://archive.org/metadata/{ident}", timeout=8)
            resp.raise_for_status()
            data = resp.json() if resp is not None else {}
            meta = data.get("metadata",
                            {}) if isinstance(data,
                                              dict) else {}
            collection = meta.get("collection") if isinstance(meta, dict) else None

            values: List[str] = []
            if isinstance(collection, list):
                values = [str(x).strip().lower() for x in collection if str(x).strip()]
            elif isinstance(collection, str):
                values = [collection.strip().lower()]

            if any(v in {"inlibrary",
                         "printdisabled",
                         "lendinglibrary"} for v in values):
                return True, "archive-collection"
            return False, "archive-not-lendable"
        except Exception:
            return False, "archive-metadata-error"

    @staticmethod
    def _archive_get_book_infos(session: requests.Session,
                                url: str) -> Tuple[str,
                                                   List[str],
                                                   Dict[str,
                                                        Any]]:
        """Extract page links from Archive.org book reader."""
        r = session.get(url, timeout=30).text

        # Matches: "url":"//archive.org/..." (allow whitespace)
        match = re.search(r'"url"\s*:\s*"([^"]+)"', r)
        if not match:
            raise RuntimeError("Failed to extract book info URL from response")

        url_path = match.group(1)
        infos_url = ("https:" + url_path) if url_path.startswith("//") else url_path
        infos_url = infos_url.replace("\\u0026", "&")

        response = session.get(infos_url, timeout=30)
        payload = response.json()
        data = payload["data"]

        title = str(data["brOptions"]["bookTitle"]).strip().replace(" ", "_")
        title = "".join(c for c in title if c not in '<>:"/\\|?*')
        title = title[:150]

        metadata = data.get("metadata") or {}
        links: List[str] = []
        br_data = (data.get("brOptions") or {}).get("data",
                                                    [])
        if isinstance(br_data, list):
            for item in br_data:
                if isinstance(item, list):
                    for page in item:
                        if isinstance(page, dict) and "uri" in page:
                            links.append(page["uri"])
                elif isinstance(item, dict) and "uri" in item:
                    links.append(item["uri"])

        if not links:
            raise RuntimeError("No pages found in book data")
        return title, links, metadata if isinstance(metadata, dict) else {}

    @staticmethod
    def _archive_image_name(pages: int, page: int, directory: str) -> str:
        return f"{directory}/{(len(str(pages)) - len(str(page))) * '0'}{page}.jpg"

    @staticmethod
    def _archive_deobfuscate_image(
        image_data: bytes,
        link: str,
        obf_header: str
    ) -> bytes:
        """Undo the obfuscation applied to a downloaded page image.

        Args:
            image_data: Raw bytes of the response body.
            link: URL the image was fetched from; the AES key is derived from
                it.
            obf_header: Value of the ``X-Obfuscate`` header, expected as
                ``"<version>|<base64 counter>"``.

        Returns:
            ``image_data`` with its first 1024 bytes decrypted; the remainder
            is passed through unchanged.

        Raises:
            RuntimeError: If the Crypto library could not be imported.
            ValueError: If the header is malformed, the version is not ``"1"``
                or the decoded counter is not 16 bytes long.
        """
        if not AES or not Counter:
            raise RuntimeError("Crypto library not available")

        try:
            version, counter_b64 = obf_header.split("|")
        except Exception as exc:
            raise ValueError("Invalid X-Obfuscate header format") from exc

        if version != "1":
            raise ValueError("Unsupported obfuscation version: " + version)

        # Key material is the URL with scheme and host removed (path onwards);
        # the key itself is the first 16 bytes of that string's SHA-1 digest.
        aes_key = re.sub(r"^https?:\/\/.*?\/", "/", link)
        sha1_digest = hashlib.sha1(aes_key.encode("utf-8")).digest()
        key = sha1_digest[:16]

        counter_bytes = base64.b64decode(counter_b64)
        if len(counter_bytes) != 16:
            raise ValueError(
                f"Expected counter to be 16 bytes, got {len(counter_bytes)}"
            )

        # The 16-byte counter block is split into an 8-byte fixed prefix and a
        # 64-bit big-endian starting value.
        prefix = counter_bytes[:8]
        initial_value = int.from_bytes(counter_bytes[8:], byteorder="big")
        ctr = Counter.new(
            64,
            prefix=prefix,
            initial_value=initial_value,
            little_endian=False
        )  # type: ignore
        cipher = AES.new(key, AES.MODE_CTR, counter=ctr)  # type: ignore

        # Only the first 1024 bytes are run through the cipher.
        decrypted_part = cipher.decrypt(image_data[:1024])
        return decrypted_part + image_data[1024:]

    @classmethod
    def _archive_download_one_image(
        cls,
        session: requests.Session,
        link: str,
        i: int,
        directory: str,
        book_id: str,
        pages: int,
    ) -> None:
        headers = {
            "Referer": "https://archive.org/",
            "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
            "Sec-Fetch-Site": "same-site",
            "Sec-Fetch-Mode": "no-cors",
            "Sec-Fetch-Dest": "image",
        }

        while True:
            try:
                response = session.get(link, headers=headers, timeout=30)
                if response.status_code == 403:
                    cls._archive_loan(session, book_id, verbose=False)
                    raise RuntimeError("Borrow again")
                if response.status_code == 200:
                    break
            except Exception:
                time.sleep(1)

        image = cls._archive_image_name(pages, i, directory)
        obf_header = response.headers.get("X-Obfuscate")
        if obf_header:
            image_content = cls._archive_deobfuscate_image(
                response.content,
                link,
                obf_header
            )
        else:
            image_content = response.content

        with open(image, "wb") as f:
            f.write(image_content)

    @classmethod
    def _archive_download(
        cls,
        session: requests.Session,
        n_threads: int,
        directory: str,
        links: List[str],
        scale: int,
        book_id: str,
        progress_callback: Optional[Callable[[int,
                                              int],
                                             None]] = None,
    ) -> List[str]:
        links_scaled = [f"{link}&rotate=0&scale={scale}" for link in links]
        pages = len(links_scaled)

        tasks = []
        with futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
            for i, link in enumerate(links_scaled):
                tasks.append(
                    executor.submit(
                        cls._archive_download_one_image,
                        session=session,
                        link=link,
                        i=i,
                        directory=directory,
                        book_id=book_id,
                        pages=pages,
                    )
                )
            if progress_callback is not None:
                done = 0
                total = len(tasks)
                for fut in futures.as_completed(tasks):
                    try:
                        _ = fut.result()
                    except Exception:
                        pass
                    done += 1
                    try:
                        progress_callback(done, total)
                    except Exception:
                        pass
            elif tqdm:
                for _ in tqdm(futures.as_completed(tasks),
                              total=len(tasks)):  # type: ignore
                    pass
            else:
                for _ in futures.as_completed(tasks):
                    pass

        return [cls._archive_image_name(pages, i, directory) for i in range(pages)]

    @staticmethod
    def _archive_check_direct_download(book_id: str) -> Tuple[bool, str]:
        """Check for a directly downloadable original PDF in Archive.org metadata.

        The item's file list is scanned for entries whose name ends in
        ``.pdf`` and whose ``source`` is ``original``. Each candidate URL is
        confirmed with a HEAD request.

        Returns:
            ``(True, url)`` for the first candidate answering HTTP 200,
            otherwise ``(False, "")``, also when any request or parsing step
            fails.
        """
        from urllib.parse import quote

        try:
            metadata_url = f"https://archive.org/metadata/{book_id}"
            response = requests.get(metadata_url, timeout=6)
            response.raise_for_status()
            metadata = response.json()
            files = metadata.get("files") if isinstance(metadata, dict) else None
            if not isinstance(files, list):
                return False, ""

            for file_info in files:
                if not isinstance(file_info, dict):
                    continue
                filename = str(file_info.get("name", ""))
                if not filename.endswith(".pdf") or file_info.get("source") != "original":
                    continue
                # Percent-encode the whole name (keeping "/"): escaping only
                # spaces breaks names containing "#", "?" or "%".
                pdf_url = f"https://archive.org/download/{book_id}/{quote(filename)}"
                check_response = requests.head(
                    pdf_url,
                    timeout=4,
                    allow_redirects=True
                )
                if check_response.status_code == 200:
                    return True, pdf_url
            return False, ""
        except Exception:
            # Best-effort probe: any failure means "no direct download".
            return False, ""

    @staticmethod
    def scrape_isbn_metadata(isbn: str) -> List[str]:
        """Scrape tags for an ISBN using Open Library API.

        Returns tags such as:
        - title:<...>, author:<...>, publish_date:<...>, publisher:<...>, description:<...>, pages:<...>
        - identifiers: openlibrary:<...>, lccn:<...>, oclc:<...>, goodreads:<...>, librarything:<...>, doi:<...>, internet_archive:<...>
        """
        new_tags: List[str] = []

        isbn_clean = str(isbn or "").replace("isbn:", "").replace("-", "").strip()
        if not isbn_clean:
            return []

        url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json"
        try:
            with HTTPClient() as client:
                response = client.get(url)
                response.raise_for_status()
                data = json_module.loads(response.content.decode("utf-8"))
        except Exception as exc:
            log(f"Failed to fetch ISBN metadata: {exc}", file=sys.stderr)
            return []

        if not data:
            log(f"No ISBN metadata found for: {isbn}")
            return []

        book_data = next(iter(data.values()), None)
        if not isinstance(book_data, dict):
            return []

        if "title" in book_data:
            new_tags.append(f"title:{book_data['title']}")

        authors = book_data.get("authors")
        if isinstance(authors, list):
            for author in authors[:3]:
                if isinstance(author, dict) and author.get("name"):
                    new_tags.append(f"author:{author['name']}")

        if book_data.get("publish_date"):
            new_tags.append(f"publish_date:{book_data['publish_date']}")

        publishers = book_data.get("publishers")
        if isinstance(publishers, list) and publishers:
            pub = publishers[0]
            if isinstance(pub, dict) and pub.get("name"):
                new_tags.append(f"publisher:{pub['name']}")

        if "description" in book_data:
            desc = book_data.get("description")
            if isinstance(desc, dict) and "value" in desc:
                desc = desc.get("value")
            if desc:
                desc_str = str(desc).strip()
                if desc_str:
                    new_tags.append(f"description:{desc_str[:200]}")

        page_count = book_data.get("number_of_pages")
        if isinstance(page_count, int) and page_count > 0:
            new_tags.append(f"pages:{page_count}")

        identifiers = book_data.get("identifiers")
        if isinstance(identifiers, dict):

            def _first(value: Any) -> Any:
                if isinstance(value, list) and value:
                    return value[0]
                return value

            for key, ns in (
                ("openlibrary", "openlibrary"),
                ("lccn", "lccn"),
                ("oclc", "oclc"),
                ("goodreads", "goodreads"),
                ("librarything", "librarything"),
                ("doi", "doi"),
                ("internet_archive", "internet_archive"),
            ):
                val = _first(identifiers.get(key))
                if val:
                    new_tags.append(f"{ns}:{val}")

        debug(f"Found {len(new_tags)} tag(s) from ISBN lookup")
        return new_tags

    @staticmethod
    def scrape_openlibrary_metadata(olid: str) -> List[str]:
        """Scrape tags for an OpenLibrary ID using the .json API endpoint."""
        new_tags: List[str] = []

        olid_text = str(olid or "").strip()
        if not olid_text:
            return []

        # Normalize OLID to the common "OL<digits>M" form when possible.
        olid_norm = olid_text
        try:
            if not olid_norm.startswith("OL"):
                olid_norm = f"OL{olid_norm}"
            if not olid_norm.endswith("M"):
                olid_norm = f"{olid_norm}M"
        except Exception:
            olid_norm = olid_text

        # Ensure we always include a scrapeable identifier tag.
        new_tags.append(f"openlibrary:{olid_norm}")

        # Accept OL9674499M, 9674499M, or just digits.
        olid_clean = olid_text.replace("OL", "").replace("M", "")
        if not olid_clean.isdigit():
            olid_clean = olid_text

        if not olid_text.startswith("OL"):
            url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
        else:
            url = f"https://openlibrary.org/books/{olid_text}.json"

        try:
            with HTTPClient() as client:
                response = client.get(url)
                response.raise_for_status()
                data = json_module.loads(response.content.decode("utf-8"))
        except Exception as exc:
            log(f"Failed to fetch OpenLibrary metadata: {exc}", file=sys.stderr)
            return []

        if not isinstance(data, dict) or not data:
            log(f"No OpenLibrary metadata found for: {olid_text}")
            return []

        if "title" in data:
            new_tags.append(f"title:{data['title']}")

        authors = data.get("authors")
        if isinstance(authors, list):
            for author in authors[:3]:
                if isinstance(author, dict) and author.get("name"):
                    new_tags.append(f"author:{author['name']}")
                    continue

                # Common OL shape: {"key": "/authors/OL...A"} or {"author": {"key": ...}}
                author_key = None
                if isinstance(author, dict):
                    if isinstance(author.get("author"), dict):
                        author_key = author.get("author",
                                                {}).get("key")
                    if not author_key:
                        author_key = author.get("key")

                if isinstance(author_key, str) and author_key.startswith("/"):
                    try:
                        author_url = f"https://openlibrary.org{author_key}.json"
                        with HTTPClient(timeout=10) as client:
                            author_resp = client.get(author_url)
                            author_resp.raise_for_status()
                            author_data = json_module.loads(
                                author_resp.content.decode("utf-8")
                            )
                        if isinstance(author_data, dict) and author_data.get("name"):
                            new_tags.append(f"author:{author_data['name']}")
                            continue
                    except Exception:
                        pass

                if isinstance(author, str) and author:
                    new_tags.append(f"author:{author}")

        if data.get("publish_date"):
            new_tags.append(f"publish_date:{data['publish_date']}")

        publishers = data.get("publishers")
        if isinstance(publishers, list) and publishers:
            pub = publishers[0]
            if isinstance(pub, dict) and pub.get("name"):
                new_tags.append(f"publisher:{pub['name']}")
            elif isinstance(pub, str) and pub:
                new_tags.append(f"publisher:{pub}")

        if "description" in data:
            desc = data.get("description")
            if isinstance(desc, dict) and "value" in desc:
                desc = desc.get("value")
            if desc:
                desc_str = str(desc).strip()
                if desc_str:
                    new_tags.append(f"description:{desc_str[:200]}")

        page_count = data.get("number_of_pages")
        if isinstance(page_count, int) and page_count > 0:
            new_tags.append(f"pages:{page_count}")

        subjects = data.get("subjects")
        if isinstance(subjects, list):
            for subject in subjects[:10]:
                if isinstance(subject, str):
                    subject_clean = subject.strip()
                    if subject_clean and subject_clean not in new_tags:
                        new_tags.append(subject_clean)

        identifiers = data.get("identifiers")
        if isinstance(identifiers, dict):

            def _first(value: Any) -> Any:
                if isinstance(value, list) and value:
                    return value[0]
                return value

            for key, ns in (
                ("isbn_10", "isbn_10"),
                ("isbn_13", "isbn_13"),
                ("lccn", "lccn"),
                ("oclc_numbers", "oclc"),
                ("goodreads", "goodreads"),
                ("internet_archive", "internet_archive"),
            ):
                val = _first(identifiers.get(key))
                if val:
                    new_tags.append(f"{ns}:{val}")

        # Some editions expose a direct Archive.org identifier as "ocaid".
        ocaid = data.get("ocaid")
        if isinstance(ocaid, str) and ocaid.strip():
            new_tags.append(f"internet_archive:{ocaid.strip()}")

        debug(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup")
        return new_tags

    def search(
        self,
        query: str,
        limit: int = 50,
        filters: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[SearchResult]:
        """Search OpenLibrary and enrich each hit with availability information.

        The query is parsed for ``isbn``/``title``/``author`` fields and free
        text; the first non-empty one (in that order, then the raw query) is
        sent to ``https://openlibrary.org/search.json``. Each returned document
        is checked concurrently for availability.

        Args:
            query: Raw query string.
            limit: Maximum number of documents requested and returned.
            filters: Accepted for interface compatibility; not used here.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            One ``SearchResult`` per dict-shaped document. An empty list is
            returned when the query is blank, the request fails, or ``docs``
            is not a list. The ``availability`` metadata value is one of
            ``no-olid``, ``no-archive``, ``borrow``, ``download``,
            ``unavailable`` or ``unknown``.
        """

        def _as_list(value: Any) -> List[Any]:
            """Coerce a str-or-list field into a list; anything else becomes []."""
            value = value or []
            if isinstance(value, str):
                return [value]
            return value if isinstance(value, list) else []

        parsed = parse_query(query)
        isbn = get_field(parsed, "isbn")
        author = get_field(parsed, "author")
        title = get_field(parsed, "title")
        free_text = get_free_text(parsed)

        q = (isbn or title or author or free_text or query or "").strip()
        if not q:
            return []

        if _looks_like_isbn(q):
            q = f"isbn:{q.replace('-', '')}"

        try:
            resp = self._session.get(
                "https://openlibrary.org/search.json",
                params={"q": q, "limit": int(limit)},
                timeout=10,
            )
            resp.raise_for_status()
            data = resp.json() or {}
        except Exception as exc:
            log(f"[openlibrary] Search failed: {exc}", file=sys.stderr)
            return []

        docs = data.get("docs") or []
        if not isinstance(docs, list):
            return []

        # Availability enrichment can be slow if done sequentially (it may require multiple
        # network calls per row). Do it concurrently to keep the pipeline responsive.
        docs = docs[:int(limit)]

        def _compute_availability(doc_dict: Dict[str, Any]) -> Tuple[str, str, str, str]:
            """Return (availability, reason, archive_id, direct_url) for one document."""
            edition_id_local = _resolve_edition_id(doc_dict)
            if not edition_id_local:
                return "no-olid", "", "", ""

            ia_ids_local = [str(x) for x in _as_list(doc_dict.get("ia")) if x]

            # Each worker uses its own session; the context manager closes it
            # so pooled connections are not leaked once the row is resolved.
            with requests.Session() as session_local:
                try:
                    archive_id_local = _resolve_archive_id(
                        session_local,
                        edition_id_local,
                        ia_ids_local,
                    )
                except Exception:
                    archive_id_local = ""

                if not archive_id_local:
                    return "no-archive", "", "", ""

                # Prefer the fastest signal first: OpenLibrary lendable status.
                lendable_local, reason_local = _check_lendable(
                    session_local,
                    edition_id_local,
                )

            if lendable_local:
                return "borrow", reason_local, archive_id_local, ""

            # Not lendable: check whether it's directly downloadable (public domain uploads, etc.).
            try:
                can_direct, pdf_url = self._archive_check_direct_download(archive_id_local)
                if can_direct and pdf_url:
                    return "download", reason_local, archive_id_local, str(pdf_url)
            except Exception:
                pass

            return "unavailable", reason_local, archive_id_local, ""

        # Rows default to "unknown" so non-dict docs and failed workers stay defined.
        availability_rows: List[Tuple[str, str, str, str]] = [
            ("unknown", "", "", "") for _ in docs
        ]
        if docs:
            max_workers = min(8, max(1, len(docs)))
            with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                future_to_index = {
                    executor.submit(_compute_availability, doc_dict): i
                    for i, doc_dict in enumerate(docs)
                    if isinstance(doc_dict, dict)
                }
                for fut in futures.as_completed(future_to_index):
                    row_index = future_to_index[fut]
                    try:
                        availability_rows[row_index] = fut.result()
                    except Exception:
                        availability_rows[row_index] = ("unknown", "", "", "")

        results: List[SearchResult] = []
        for idx, doc in enumerate(docs):
            if not isinstance(doc, dict):
                continue

            book_title = str(doc.get("title") or "").strip() or "Unknown"
            authors_list = [str(a) for a in _as_list(doc.get("author_name")) if a]

            year_val = doc.get("first_publish_year")
            year = str(year_val) if year_val is not None else ""

            edition_id = _resolve_edition_id(doc)
            raw_key = doc.get("key")
            work_key = raw_key if isinstance(raw_key, str) else ""

            ia_ids = [str(x) for x in _as_list(doc.get("ia")) if x]

            isbn_list = _as_list(doc.get("isbn"))
            isbn_13 = next((str(i) for i in isbn_list if len(str(i)) == 13), "")
            isbn_10 = next((str(i) for i in isbn_list if len(str(i)) == 10), "")

            # Availability comes from the concurrently computed enrichment.
            availability, availability_reason, archive_id, direct_url = availability_rows[idx]

            columns = [
                ("Title", book_title),
                ("Author", ", ".join(authors_list)),
                ("Year", year),
                ("Avail", availability),
                ("OLID", edition_id),
            ]

            annotations: List[str] = []
            if isbn_13:
                annotations.append(f"isbn_13:{isbn_13}")
            elif isbn_10:
                annotations.append(f"isbn_10:{isbn_10}")
            if ia_ids:
                annotations.append("archive")
            if availability in {"download", "borrow"}:
                annotations.append(availability)

            # Prefer the edition page, then the work page, then the site root.
            if edition_id:
                path = f"https://openlibrary.org/books/{edition_id}"
            elif work_key.startswith("/"):
                path = f"https://openlibrary.org{work_key}"
            else:
                path = "https://openlibrary.org"

            detail = (
                (f"By: {', '.join(authors_list)}" if authors_list else "") +
                (f" ({year})" if year else "")
            ).strip()

            results.append(
                SearchResult(
                    table="openlibrary",
                    title=book_title,
                    path=path,
                    detail=detail,
                    annotations=annotations,
                    media_kind="book",
                    columns=columns,
                    full_metadata={
                        "openlibrary_id": edition_id,
                        "openlibrary_key": work_key,
                        "authors": authors_list,
                        "year": year,
                        "isbn_10": isbn_10,
                        "isbn_13": isbn_13,
                        "ia": ia_ids,
                        "availability": availability,
                        "availability_reason": availability_reason,
                        "archive_id": archive_id,
                        "direct_url": direct_url,
                        "raw": doc,
                    },
                )
            )

        return results

    def download(
        self,
        result: SearchResult,
        output_dir: Path,
        progress_callback: Optional[Callable[[str, int, Optional[int], str], None]] = None,
    ) -> Optional[Path]:
        """Download a book as a PDF via direct download or the Archive.org borrow flow.

        The archive identifier is taken, in order, from the result metadata
        (``archive_id``), its ``ia`` list, a lookup by OpenLibrary edition id,
        and finally the result's ``path`` URL.

        Args:
            result: Search result to download; its ``full_metadata`` is updated
                with ``archive_id`` (and ``archive_metadata`` when available).
            output_dir: Destination directory; created if missing.
            progress_callback: Optional callable invoked as
                ``(kind, done, total, label)`` where ``kind`` is ``"step"``,
                ``"bytes"`` or ``"pages"``.

        Returns:
            Path of the written PDF. When page images were downloaded but could
            not be stitched into a PDF, the path of the folder holding the
            images is returned instead. ``None`` is returned when no archive
            identifier is found, credentials are missing, the book is not
            lendable, or a download/borrow step fails.
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        def _notify_step(label: str) -> None:
            """Report a workflow step; callback errors never interrupt the download."""
            if progress_callback is None:
                return
            try:
                progress_callback("step", 0, None, label)
            except Exception:
                pass

        meta = result.full_metadata or {}
        edition_id = str(meta.get("openlibrary_id") or "").strip()

        # Accept direct Archive.org URLs too (details/borrow/download) even when no OL edition id is known.
        archive_id = str(meta.get("archive_id") or "").strip()

        ia_ids = meta.get("ia") or []
        if isinstance(ia_ids, str):
            ia_ids = [ia_ids]
        if not isinstance(ia_ids, list):
            ia_ids = []
        ia_candidates = [str(x) for x in ia_ids if x]

        if not archive_id:
            archive_id = _first_str(ia_candidates) or ""

        if not archive_id and edition_id:
            # A resolver failure must not abort the download: fall through to
            # the URL-based extraction below, as search() does for its rows.
            try:
                archive_id = _resolve_archive_id(self._session, edition_id, ia_candidates)
            except Exception as exc:
                log(
                    f"[openlibrary] Failed to resolve archive identifier: {exc}",
                    file=sys.stderr
                )
                archive_id = ""

        if not archive_id:
            # Try to extract identifier from the SearchResult path (URL).
            archive_id = _archive_id_from_url(str(getattr(result, "path", "") or ""))

        if not archive_id:
            log(
                "[openlibrary] No archive identifier available; cannot download",
                file=sys.stderr
            )
            return None

        # Best-effort metadata scrape to attach bibliographic tags for downstream cmdlets.
        try:
            archive_meta = _fetch_archive_item_metadata(archive_id)
            tags = _archive_item_metadata_to_tags(archive_id, archive_meta)
            if tags:
                try:
                    result.tag.update(tags)
                except Exception:
                    # Fallback for callers that pass plain dicts.
                    pass
            if isinstance(meta, dict):
                meta["archive_id"] = archive_id
                if archive_meta:
                    meta["archive_metadata"] = archive_meta
                result.full_metadata = meta
        except Exception:
            # Never block downloads on metadata fetch.
            pass

        safe_title = sanitize_filename(result.title)
        if not safe_title or "http" in safe_title.lower():
            safe_title = sanitize_filename(archive_id) or "archive"

        # 1) Direct download if available.
        try:
            can_direct, pdf_url = self._archive_check_direct_download(archive_id)
        except Exception:
            can_direct, pdf_url = False, ""

        if can_direct and pdf_url:
            _notify_step("direct download")
            out_path = unique_path(output_dir / f"{safe_title}.pdf")
            ok = download_file(
                pdf_url,
                out_path,
                session=self._session,
                progress_callback=(
                    (
                        lambda downloaded, total, label:
                        progress_callback("bytes", downloaded, total, label)
                    ) if progress_callback is not None else None
                ),
            )
            if ok:
                return out_path
            log("[openlibrary] Direct download failed", file=sys.stderr)
            return None

        # 2) Borrow flow (credentials required).
        try:
            email, password = self._credential_archive(self.config or {})
            if not email or not password:
                log(
                    "[openlibrary] Archive credentials missing; cannot borrow",
                    file=sys.stderr
                )
                return None

            if edition_id:
                lendable, reason = _check_lendable(self._session, edition_id)
                if not lendable:
                    # OpenLibrary API can be a false-negative; fall back to Archive metadata.
                    lendable2, reason2 = self._archive_is_lendable(archive_id)
                    if lendable2:
                        lendable, reason = True, reason2
            else:
                lendable, reason = self._archive_is_lendable(archive_id)

            if not lendable:
                log(f"[openlibrary] Not lendable: {reason}", file=sys.stderr)
                return None

            session = self._archive_login(email, password)
            loaned = False
            try:
                _notify_step("login")

                try:
                    session = self._archive_loan(session, archive_id, verbose=False)
                    loaned = True
                except self.BookNotAvailableError:
                    log("[openlibrary] Book not available to borrow", file=sys.stderr)
                    return None
                except Exception:
                    log("[openlibrary] Borrow failed", file=sys.stderr)
                    return None

                _notify_step("borrow")

                urls = [
                    f"https://archive.org/borrow/{archive_id}",
                    f"https://archive.org/details/{archive_id}",
                ]
                title = safe_title
                links: Optional[List[str]] = None
                last_exc: Optional[Exception] = None
                for u in urls:
                    try:
                        title_raw, page_links, _metadata = self._archive_get_book_infos(session, u)
                        if title_raw:
                            title = sanitize_filename(title_raw)
                    except Exception as exc:
                        last_exc = exc
                        continue
                    # Only stop once page links were actually found; an empty
                    # answer from the first URL still lets the fallback URL run.
                    if page_links:
                        links = page_links
                        break

                if not links:
                    log(
                        f"[openlibrary] Failed to extract pages: {last_exc}",
                        file=sys.stderr
                    )
                    return None

                _notify_step("download pages")

                # Page images are staged in a temp folder inside the output directory.
                temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=str(output_dir))
                try:
                    images = self._archive_download(
                        session=session,
                        n_threads=10,
                        directory=temp_dir,
                        links=links,
                        scale=3,
                        book_id=archive_id,
                        progress_callback=(
                            (
                                lambda done, total:
                                progress_callback("pages", done, total, "pages")
                            ) if progress_callback is not None else None
                        ),
                    )

                    pdf_bytes = _image_paths_to_pdf_bytes(images)
                    if not pdf_bytes:
                        # Keep images folder for manual conversion.
                        log(
                            "[openlibrary] PDF conversion failed; keeping images folder",
                            file=sys.stderr,
                        )
                        return Path(temp_dir)

                    _notify_step("stitch pdf")

                    pdf_path = unique_path(output_dir / f"{title}.pdf")
                    with open(pdf_path, "wb") as f:
                        f.write(pdf_bytes)

                    try:
                        shutil.rmtree(temp_dir)
                    except Exception:
                        pass
                    return pdf_path

                except Exception:
                    # Drop the partial images, then let the outer handler report the error.
                    try:
                        shutil.rmtree(temp_dir)
                    except Exception:
                        pass
                    raise
            finally:
                # Always return the loan after a successful borrow, even if download/stitch fails.
                if loaned:
                    _notify_step("return book")
                    try:
                        self._archive_return_loan(session, archive_id)
                    except Exception as exc:
                        log(
                            f"[openlibrary] Warning: failed to return loan: {exc}",
                            file=sys.stderr
                        )
                try:
                    self._archive_logout(session)
                except Exception:
                    pass

        except Exception as exc:
            log(f"[openlibrary] Borrow workflow error: {exc}", file=sys.stderr)
            return None

    def validate(self) -> bool:
        """Return ``True`` unconditionally; no configuration check is performed here."""
        return True