openlibrary: inline Archive.org login/borrow/download helpers and add concurrent availability enrichment
@@ -1,19 +1,38 @@
from __future__ import annotations

import base64
from concurrent import futures
import hashlib
import json as json_module
import re
import shutil
import sys
import tempfile
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import requests

from API.HTTP import HTTPClient
from ProviderCore.base import SearchProvider, SearchResult
from ProviderCore.download import download_file, sanitize_filename
from cli_syntax import get_field, get_free_text, parse_query
from SYS.logger import log
from SYS.utils import unique_path

try:
    from Crypto.Cipher import AES  # type: ignore
    from Crypto.Util import Counter  # type: ignore
except ImportError:
    AES = None  # type: ignore
    Counter = None  # type: ignore

try:
    from tqdm import tqdm  # type: ignore
except ImportError:
    tqdm = None  # type: ignore


def _looks_like_isbn(text: str) -> bool:
    t = (text or "").replace("-", "").strip()
@@ -38,6 +57,13 @@ def _resolve_edition_id(doc: Dict[str, Any]) -> str:
    edition_key = doc.get("edition_key")
    if isinstance(edition_key, list) and edition_key:
        return str(edition_key[0]).strip()
    if isinstance(edition_key, str) and edition_key.strip():
        return edition_key.strip()

    # Often present even when edition_key is missing.
    cover_edition_key = doc.get("cover_edition_key")
    if isinstance(cover_edition_key, str) and cover_edition_key.strip():
        return cover_edition_key.strip()

    # Fallback: sometimes key can be /books/OL...M
    key = doc.get("key")
@@ -54,7 +80,7 @@ def _check_lendable(session: requests.Session, edition_id: str) -> Tuple[bool, s
        return False, "not-an-edition"

    url = f"https://openlibrary.org/api/volumes/brief/json/OLID:{edition_id}"
-    resp = session.get(url, timeout=10)
    resp = session.get(url, timeout=6)
    resp.raise_for_status()
    data = resp.json() or {}
    wrapped = data.get(f"OLID:{edition_id}")
@@ -88,7 +114,7 @@ def _resolve_archive_id(session: requests.Session, edition_id: str, ia_candidate

    # Otherwise query the edition JSON.
    try:
-        resp = session.get(f"https://openlibrary.org/books/{edition_id}.json", timeout=10)
        resp = session.get(f"https://openlibrary.org/books/{edition_id}.json", timeout=6)
        resp.raise_for_status()
        data = resp.json() or {}
@@ -116,6 +142,522 @@ class OpenLibrary(SearchProvider):
        super().__init__(config)
        self._session = requests.Session()

    class BookNotAvailableError(Exception):
        """Raised when a book is not available for borrowing (waitlisted/in use)."""

    @staticmethod
    def _credential_archive(config: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
        """Get Archive.org email/password from config.

        Supports:
        - New: {"provider": {"openlibrary": {"email": "...", "password": "..."}}}
        - Old: {"Archive": {"email": "...", "password": "..."}}
        - Legacy flat keys: {"archive_org_email": "...", "archive_org_password": "..."}
        """
        if not isinstance(config, dict):
            return None, None

        provider_config = config.get("provider", {})
        if isinstance(provider_config, dict):
            openlibrary_config = provider_config.get("openlibrary", {})
            if isinstance(openlibrary_config, dict):
                email = openlibrary_config.get("email")
                password = openlibrary_config.get("password")
                if email or password:
                    return str(email) if email is not None else None, str(password) if password is not None else None

        archive_config = config.get("Archive")
        if isinstance(archive_config, dict):
            email = archive_config.get("email")
            password = archive_config.get("password")
            if email or password:
                return str(email) if email is not None else None, str(password) if password is not None else None

        email = config.get("archive_org_email")
        password = config.get("archive_org_password")
        return str(email) if email is not None else None, str(password) if password is not None else None
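
    # Illustrative resolution order (hypothetical config values): given
    #   {"provider": {"openlibrary": {"email": "a@example.com", "password": "x"}},
    #    "Archive": {"email": "b@example.com", "password": "y"}}
    # the provider.openlibrary block wins and ("a@example.com", "x") is returned;
    # the flat archive_org_* keys are only consulted last.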

    @staticmethod
    def _archive_error_body(response: requests.Response) -> str:
        try:
            body = response.text or ""
        except Exception:
            return ""
        if len(body) > 2000:
            return body[:1200] + "\n... (truncated) ...\n" + body[-400:]
        return body

    @classmethod
    def _archive_login(cls, email: str, password: str) -> requests.Session:
        """Login to archive.org using the token-based services endpoint (matches test-login.py)."""
        session = requests.Session()

        token_resp = session.get("https://archive.org/services/account/login/", timeout=30)
        try:
            token_json = token_resp.json()
        except Exception as exc:
            raise RuntimeError(f"Archive login token parse failed: {exc}\n{cls._archive_error_body(token_resp)}")

        if not token_json.get("success"):
            raise RuntimeError(f"Archive login token fetch failed\n{cls._archive_error_body(token_resp)}")

        token = (token_json.get("value") or {}).get("token")
        if not token:
            raise RuntimeError("Archive login token missing")

        headers = {"Content-Type": "application/x-www-form-urlencoded"}
        payload = {"username": email, "password": password, "t": token}

        # NOTE: the body is JSON-encoded even though the Content-Type says
        # form-encoded; this mirrors the working test-login.py flow.
        login_resp = session.post(
            "https://archive.org/services/account/login/",
            headers=headers,
            data=json_module.dumps(payload),
            timeout=30,
        )

        try:
            login_json = login_resp.json()
        except Exception as exc:
            raise RuntimeError(f"Archive login parse failed: {exc}\n{cls._archive_error_body(login_resp)}")

        if login_json.get("success") is False:
            if login_json.get("value") == "bad_login":
                raise RuntimeError("Invalid Archive.org credentials")
            raise RuntimeError(f"Archive login failed: {login_json}")

        return session

    @classmethod
    def _archive_loan(cls, session: requests.Session, book_id: str, *, verbose: bool = True) -> requests.Session:
        data = {"action": "grant_access", "identifier": book_id}
        session.post("https://archive.org/services/loans/loan/searchInside.php", data=data, timeout=30)
        data["action"] = "browse_book"
        response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)

        if response.status_code == 400:
            try:
                err = (response.json() or {}).get("error")
            except Exception:
                # Unparseable error body: fall back to a generic message.
                raise RuntimeError("The book cannot be borrowed")
            if err == "This book is not available to borrow at this time. Please try again later.":
                raise cls.BookNotAvailableError("Book is waitlisted or in use")
            raise RuntimeError(f"Borrow failed: {err or response.text}")

        data["action"] = "create_token"
        response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
        if "token" in (response.text or ""):
            return session
        raise RuntimeError("Something went wrong when trying to borrow the book")

    @staticmethod
    def _archive_return_loan(session: requests.Session, book_id: str) -> None:
        data = {"action": "return_loan", "identifier": book_id}
        response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
        if response.status_code == 200:
            try:
                if (response.json() or {}).get("success"):
                    return
            except Exception:
                pass
        raise RuntimeError("Something went wrong when trying to return the book")

    @staticmethod
    def _archive_get_book_infos(session: requests.Session, url: str) -> Tuple[str, List[str], Dict[str, Any]]:
        """Extract page links from Archive.org book reader."""
        r = session.get(url, timeout=30).text

        # Matches: "url":"//archive.org/..." (allow whitespace)
        match = re.search(r'"url"\s*:\s*"([^"]+)"', r)
        if not match:
            raise RuntimeError("Failed to extract book info URL from response")

        url_path = match.group(1)
        infos_url = ("https:" + url_path) if url_path.startswith("//") else url_path
        infos_url = infos_url.replace("\\u0026", "&")

        response = session.get(infos_url, timeout=30)
        payload = response.json()
        data = payload["data"]

        title = str(data["brOptions"]["bookTitle"]).strip().replace(" ", "_")
        title = "".join(c for c in title if c not in '<>:"/\\|?*')
        title = title[:150]

        metadata = data.get("metadata") or {}
        links: List[str] = []
        br_data = (data.get("brOptions") or {}).get("data", [])
        if isinstance(br_data, list):
            for item in br_data:
                if isinstance(item, list):
                    for page in item:
                        if isinstance(page, dict) and "uri" in page:
                            links.append(page["uri"])
                elif isinstance(item, dict) and "uri" in item:
                    links.append(item["uri"])

        if not links:
            raise RuntimeError("No pages found in book data")
        return title, links, metadata if isinstance(metadata, dict) else {}

    @staticmethod
    def _archive_image_name(pages: int, page: int, directory: str) -> str:
        return f"{directory}/{(len(str(pages)) - len(str(page))) * '0'}{page}.jpg"

    @staticmethod
    def _archive_deobfuscate_image(image_data: bytes, link: str, obf_header: str) -> bytes:
        if not AES or not Counter:
            raise RuntimeError("Crypto library not available")

        try:
            version, counter_b64 = obf_header.split("|")
        except Exception as exc:
            raise ValueError("Invalid X-Obfuscate header format") from exc

        if version != "1":
            raise ValueError("Unsupported obfuscation version: " + version)

        # Key: first 16 bytes of SHA-1 over the URL path (scheme and host stripped).
        aes_key = re.sub(r"^https?://.*?/", "/", link)
        sha1_digest = hashlib.sha1(aes_key.encode("utf-8")).digest()
        key = sha1_digest[:16]

        counter_bytes = base64.b64decode(counter_b64)
        if len(counter_bytes) != 16:
            raise ValueError(f"Expected counter to be 16 bytes, got {len(counter_bytes)}")

        prefix = counter_bytes[:8]
        initial_value = int.from_bytes(counter_bytes[8:], byteorder="big")
        ctr = Counter.new(64, prefix=prefix, initial_value=initial_value, little_endian=False)  # type: ignore
        cipher = AES.new(key, AES.MODE_CTR, counter=ctr)  # type: ignore

        decrypted_part = cipher.decrypt(image_data[:1024])
        return decrypted_part + image_data[1024:]
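
    # In short (as implemented above): the AES-128-CTR key is the first 16
    # bytes of SHA-1 over the URL path, the 16-byte counter block arrives
    # base64-encoded in the X-Obfuscate header (8-byte prefix plus 8-byte
    # big-endian initial value), and only the first 1 KiB of each image is
    # obfuscated; the remainder is plain JPEG data.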

    @classmethod
    def _archive_download_one_image(
        cls,
        session: requests.Session,
        link: str,
        i: int,
        directory: str,
        book_id: str,
        pages: int,
    ) -> None:
        headers = {
            "Referer": "https://archive.org/",
            "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
            "Sec-Fetch-Site": "same-site",
            "Sec-Fetch-Mode": "no-cors",
            "Sec-Fetch-Dest": "image",
        }

        while True:
            try:
                response = session.get(link, headers=headers, timeout=30)
                if response.status_code == 403:
                    # The loan likely lapsed; renew it, then retry the fetch.
                    cls._archive_loan(session, book_id, verbose=False)
                    raise RuntimeError("Borrow again")
                if response.status_code == 200:
                    break
                time.sleep(1)  # Unexpected status: back off briefly before retrying.
            except Exception:
                time.sleep(1)

        image = cls._archive_image_name(pages, i, directory)
        obf_header = response.headers.get("X-Obfuscate")
        if obf_header:
            image_content = cls._archive_deobfuscate_image(response.content, link, obf_header)
        else:
            image_content = response.content

        with open(image, "wb") as f:
            f.write(image_content)

    @classmethod
    def _archive_download(
        cls,
        session: requests.Session,
        n_threads: int,
        directory: str,
        links: List[str],
        scale: int,
        book_id: str,
    ) -> List[str]:
        links_scaled = [f"{link}&rotate=0&scale={scale}" for link in links]
        pages = len(links_scaled)

        tasks = []
        with futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
            for i, link in enumerate(links_scaled):
                tasks.append(
                    executor.submit(
                        cls._archive_download_one_image,
                        session=session,
                        link=link,
                        i=i,
                        directory=directory,
                        book_id=book_id,
                        pages=pages,
                    )
                )
            if tqdm:
                for _ in tqdm(futures.as_completed(tasks), total=len(tasks)):  # type: ignore
                    pass
            else:
                for _ in futures.as_completed(tasks):
                    pass

        return [cls._archive_image_name(pages, i, directory) for i in range(pages)]

    @staticmethod
    def _archive_check_direct_download(book_id: str) -> Tuple[bool, str]:
        """Check for a directly downloadable original PDF in Archive.org metadata."""
        try:
            metadata_url = f"https://archive.org/metadata/{book_id}"
            response = requests.get(metadata_url, timeout=6)
            response.raise_for_status()
            metadata = response.json()
            files = metadata.get("files") if isinstance(metadata, dict) else None
            if isinstance(files, list):
                for file_info in files:
                    if not isinstance(file_info, dict):
                        continue
                    filename = str(file_info.get("name", ""))
                    if filename.endswith(".pdf") and file_info.get("source") == "original":
                        pdf_url = f"https://archive.org/download/{book_id}/{filename.replace(' ', '%20')}"
                        check_response = requests.head(pdf_url, timeout=4, allow_redirects=True)
                        if check_response.status_code == 200:
                            return True, pdf_url
            return False, ""
        except Exception:
            return False, ""

    @staticmethod
    def scrape_isbn_metadata(isbn: str) -> List[str]:
        """Scrape tags for an ISBN using the Open Library API.

        Returns tags such as:
        - title:<...>, author:<...>, publish_date:<...>, publisher:<...>, description:<...>, pages:<...>
        - identifiers: openlibrary:<...>, lccn:<...>, oclc:<...>, goodreads:<...>, librarything:<...>, doi:<...>, internet_archive:<...>
        """
        new_tags: List[str] = []

        isbn_clean = str(isbn or "").replace("isbn:", "").replace("-", "").strip()
        if not isbn_clean:
            return []

        url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json"
        try:
            with HTTPClient() as client:
                response = client.get(url)
                response.raise_for_status()
                data = json_module.loads(response.content.decode("utf-8"))
        except Exception as exc:
            log(f"Failed to fetch ISBN metadata: {exc}", file=sys.stderr)
            return []

        if not data:
            log(f"No ISBN metadata found for: {isbn}")
            return []

        book_data = next(iter(data.values()), None)
        if not isinstance(book_data, dict):
            return []

        if "title" in book_data:
            new_tags.append(f"title:{book_data['title']}")

        authors = book_data.get("authors")
        if isinstance(authors, list):
            for author in authors[:3]:
                if isinstance(author, dict) and author.get("name"):
                    new_tags.append(f"author:{author['name']}")

        if book_data.get("publish_date"):
            new_tags.append(f"publish_date:{book_data['publish_date']}")

        publishers = book_data.get("publishers")
        if isinstance(publishers, list) and publishers:
            pub = publishers[0]
            if isinstance(pub, dict) and pub.get("name"):
                new_tags.append(f"publisher:{pub['name']}")

        if "description" in book_data:
            desc = book_data.get("description")
            if isinstance(desc, dict) and "value" in desc:
                desc = desc.get("value")
            if desc:
                desc_str = str(desc).strip()
                if desc_str:
                    new_tags.append(f"description:{desc_str[:200]}")

        page_count = book_data.get("number_of_pages")
        if isinstance(page_count, int) and page_count > 0:
            new_tags.append(f"pages:{page_count}")

        identifiers = book_data.get("identifiers")
        if isinstance(identifiers, dict):

            def _first(value: Any) -> Any:
                if isinstance(value, list) and value:
                    return value[0]
                return value

            for key, ns in (
                ("openlibrary", "openlibrary"),
                ("lccn", "lccn"),
                ("oclc", "oclc"),
                ("goodreads", "goodreads"),
                ("librarything", "librarything"),
                ("doi", "doi"),
                ("internet_archive", "internet_archive"),
            ):
                val = _first(identifiers.get(key))
                if val:
                    new_tags.append(f"{ns}:{val}")

        log(f"Found {len(new_tags)} tag(s) from ISBN lookup")
        return new_tags
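
    # Hypothetical output shape (illustrative only; actual values depend on
    # what Open Library returns for the ISBN):
    #   scrape_isbn_metadata("978-0-00-000000-2")
    #   -> ["title:Some Title", "author:Some Author", "publish_date:2001",
    #       "publisher:Some Press", "pages:320", "openlibrary:OL12345M"]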

    @staticmethod
    def scrape_openlibrary_metadata(olid: str) -> List[str]:
        """Scrape tags for an OpenLibrary ID using the .json API endpoint."""
        new_tags: List[str] = []

        olid_text = str(olid or "").strip()
        if not olid_text:
            return []

        # Normalize OLID to the common "OL<digits>M" form when possible.
        olid_norm = olid_text
        try:
            if not olid_norm.startswith("OL"):
                olid_norm = f"OL{olid_norm}"
            if not olid_norm.endswith("M"):
                olid_norm = f"{olid_norm}M"
        except Exception:
            olid_norm = olid_text

        # Ensure we always include a scrapeable identifier tag.
        new_tags.append(f"openlibrary:{olid_norm}")

        # Accept OL9674499M, 9674499M, or just digits.
        olid_clean = olid_text.replace("OL", "").replace("M", "")
        if not olid_clean.isdigit():
            olid_clean = olid_text

        if not olid_text.startswith("OL"):
            url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
        else:
            url = f"https://openlibrary.org/books/{olid_text}.json"

        try:
            with HTTPClient() as client:
                response = client.get(url)
                response.raise_for_status()
                data = json_module.loads(response.content.decode("utf-8"))
        except Exception as exc:
            log(f"Failed to fetch OpenLibrary metadata: {exc}", file=sys.stderr)
            return []

        if not isinstance(data, dict) or not data:
            log(f"No OpenLibrary metadata found for: {olid_text}")
            return []

        if "title" in data:
            new_tags.append(f"title:{data['title']}")

        authors = data.get("authors")
        if isinstance(authors, list):
            for author in authors[:3]:
                if isinstance(author, dict) and author.get("name"):
                    new_tags.append(f"author:{author['name']}")
                    continue

                # Common OL shape: {"key": "/authors/OL...A"} or {"author": {"key": ...}}
                author_key = None
                if isinstance(author, dict):
                    if isinstance(author.get("author"), dict):
                        author_key = author.get("author", {}).get("key")
                    if not author_key:
                        author_key = author.get("key")

                if isinstance(author_key, str) and author_key.startswith("/"):
                    try:
                        author_url = f"https://openlibrary.org{author_key}.json"
                        with HTTPClient(timeout=10) as client:
                            author_resp = client.get(author_url)
                            author_resp.raise_for_status()
                            author_data = json_module.loads(author_resp.content.decode("utf-8"))
                        if isinstance(author_data, dict) and author_data.get("name"):
                            new_tags.append(f"author:{author_data['name']}")
                            continue
                    except Exception:
                        pass

                if isinstance(author, str) and author:
                    new_tags.append(f"author:{author}")

        if data.get("publish_date"):
            new_tags.append(f"publish_date:{data['publish_date']}")

        publishers = data.get("publishers")
        if isinstance(publishers, list) and publishers:
            pub = publishers[0]
            if isinstance(pub, dict) and pub.get("name"):
                new_tags.append(f"publisher:{pub['name']}")
            elif isinstance(pub, str) and pub:
                new_tags.append(f"publisher:{pub}")

        if "description" in data:
            desc = data.get("description")
            if isinstance(desc, dict) and "value" in desc:
                desc = desc.get("value")
            if desc:
                desc_str = str(desc).strip()
                if desc_str:
                    new_tags.append(f"description:{desc_str[:200]}")

        page_count = data.get("number_of_pages")
        if isinstance(page_count, int) and page_count > 0:
            new_tags.append(f"pages:{page_count}")

        subjects = data.get("subjects")
        if isinstance(subjects, list):
            for subject in subjects[:10]:
                if isinstance(subject, str):
                    subject_clean = subject.strip()
                    if subject_clean and subject_clean not in new_tags:
                        new_tags.append(subject_clean)

        identifiers = data.get("identifiers")
        if isinstance(identifiers, dict):

            def _first(value: Any) -> Any:
                if isinstance(value, list) and value:
                    return value[0]
                return value

            for key, ns in (
                ("isbn_10", "isbn_10"),
                ("isbn_13", "isbn_13"),
                ("lccn", "lccn"),
                ("oclc_numbers", "oclc"),
                ("goodreads", "goodreads"),
                ("internet_archive", "internet_archive"),
            ):
                val = _first(identifiers.get(key))
                if val:
                    new_tags.append(f"{ns}:{val}")

        # Some editions expose a direct Archive.org identifier as "ocaid".
        ocaid = data.get("ocaid")
        if isinstance(ocaid, str) and ocaid.strip():
            new_tags.append(f"internet_archive:{ocaid.strip()}")

        log(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup")
        return new_tags

    def search(
        self,
        query: str,
@@ -155,7 +697,70 @@ class OpenLibrary(SearchProvider):
        if not isinstance(docs, list):
            return []

-        for doc in docs[: int(limit)]:
        # Availability enrichment can be slow if done sequentially (it may require multiple
        # network calls per row). Do it concurrently to keep the pipeline responsive.
        docs = docs[: int(limit)]

        def _compute_availability(doc_dict: Dict[str, Any]) -> Tuple[str, str, str, str]:
            edition_id_local = _resolve_edition_id(doc_dict)
            if not edition_id_local:
                return "no-olid", "", "", ""

            ia_val_local = doc_dict.get("ia") or []
            if isinstance(ia_val_local, str):
                ia_val_local = [ia_val_local]
            if not isinstance(ia_val_local, list):
                ia_val_local = []
            ia_ids_local = [str(x) for x in ia_val_local if x]

            session_local = requests.Session()

            try:
                archive_id_local = _resolve_archive_id(session_local, edition_id_local, ia_ids_local)
            except Exception:
                archive_id_local = ""

            if not archive_id_local:
                return "no-archive", "", "", ""

            # Prefer the fastest signal first: OpenLibrary lendable status.
            lendable_local, reason_local = _check_lendable(session_local, edition_id_local)
            if lendable_local:
                return "borrow", reason_local, archive_id_local, ""

            # Not lendable: check whether it's directly downloadable (public domain uploads, etc.).
            try:
                can_direct, pdf_url = self._archive_check_direct_download(archive_id_local)
                if can_direct and pdf_url:
                    return "download", reason_local, archive_id_local, str(pdf_url)
            except Exception:
                pass

            return "unavailable", reason_local, archive_id_local, ""

        availability_rows: List[Tuple[str, str, str, str]] = [("unknown", "", "", "") for _ in range(len(docs))]
        if docs:
            log(f"[openlibrary] Enriching availability for {len(docs)} result(s)...")
            max_workers = min(8, max(1, len(docs)))
            done = 0
            with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                future_to_index = {
                    executor.submit(_compute_availability, doc_dict): i
                    for i, doc_dict in enumerate(docs)
                    if isinstance(doc_dict, dict)
                }
                for fut in futures.as_completed(list(future_to_index.keys())):
                    i = future_to_index[fut]
                    try:
                        availability_rows[i] = fut.result()
                    except Exception:
                        availability_rows[i] = ("unknown", "", "", "")
                    done += 1
                    if done in {1, len(future_to_index)} or (done % 10 == 0):
                        log(f"[openlibrary] Availability: {done}/{len(future_to_index)}")
            log("[openlibrary] Availability enrichment complete")

        for idx, doc in enumerate(docs):
            if not isinstance(doc, dict):
                continue
@@ -172,6 +777,7 @@ class OpenLibrary(SearchProvider):
            year = str(year_val) if year_val is not None else ""

            edition_id = _resolve_edition_id(doc)
            work_key = doc.get("key") if isinstance(doc.get("key"), str) else ""

            ia_val = doc.get("ia") or []
            if isinstance(ia_val, str):
@@ -193,9 +799,21 @@ class OpenLibrary(SearchProvider):
                ("Title", book_title),
                ("Author", ", ".join(authors_list)),
                ("Year", year),
                ("Avail", ""),
                ("OLID", edition_id),
            ]

            # Determine availability using the concurrently computed enrichment.
            availability, availability_reason, archive_id, direct_url = ("unknown", "", "", "")
            if 0 <= idx < len(availability_rows):
                availability, availability_reason, archive_id, direct_url = availability_rows[idx]

            # Patch the display column. Use a distinct index name so the outer
            # per-doc `idx` is not shadowed.
            for col_idx, (name, _val) in enumerate(columns):
                if name == "Avail":
                    columns[col_idx] = ("Avail", availability)
                    break

            annotations: List[str] = []
            if isbn_13:
                annotations.append(f"isbn_13:{isbn_13}")
@@ -203,12 +821,18 @@ class OpenLibrary(SearchProvider):
                annotations.append(f"isbn_10:{isbn_10}")
            if ia_ids:
                annotations.append("archive")
            if availability in {"download", "borrow"}:
                annotations.append(availability)

            results.append(
                SearchResult(
                    table="openlibrary",
                    title=book_title,
-                    path=(f"https://openlibrary.org/books/{edition_id}" if edition_id else "https://openlibrary.org"),
                    path=(
                        f"https://openlibrary.org/books/{edition_id}" if edition_id else (
                            f"https://openlibrary.org{work_key}" if isinstance(work_key, str) and work_key.startswith("/") else "https://openlibrary.org"
                        )
                    ),
                    detail=(
                        (f"By: {', '.join(authors_list)}" if authors_list else "")
                        + (f" ({year})" if year else "")
@@ -218,11 +842,16 @@ class OpenLibrary(SearchProvider):
                    columns=columns,
                    full_metadata={
                        "openlibrary_id": edition_id,
                        "openlibrary_key": work_key,
                        "authors": authors_list,
                        "year": year,
                        "isbn_10": isbn_10,
                        "isbn_13": isbn_13,
                        "ia": ia_ids,
                        "availability": availability,
                        "availability_reason": availability_reason,
                        "archive_id": archive_id,
                        "direct_url": direct_url,
                        "raw": doc,
                    },
                )
@@ -256,9 +885,7 @@ class OpenLibrary(SearchProvider):

        # 1) Direct download if available.
        try:
-            from API.archive_client import check_direct_download
-
-            can_direct, pdf_url = check_direct_download(archive_id)
            can_direct, pdf_url = self._archive_check_direct_download(archive_id)
        except Exception:
            can_direct, pdf_url = False, ""

@@ -272,10 +899,7 @@ class OpenLibrary(SearchProvider):

        # 2) Borrow flow (credentials required).
        try:
-            from API.archive_client import BookNotAvailableError, credential_openlibrary, download as archive_download
-            from API.archive_client import get_book_infos, loan, login
-
-            email, password = credential_openlibrary(self.config or {})
            email, password = self._credential_archive(self.config or {})
            if not email or not password:
                log("[openlibrary] Archive credentials missing; cannot borrow", file=sys.stderr)
                return None
@@ -285,13 +909,13 @@ class OpenLibrary(SearchProvider):
                log(f"[openlibrary] Not lendable: {reason}", file=sys.stderr)
                return None

-            session = login(email, password)
            session = self._archive_login(email, password)
            try:
-                session = loan(session, archive_id, verbose=False)
-            except BookNotAvailableError:
                session = self._archive_loan(session, archive_id, verbose=False)
            except self.BookNotAvailableError:
                log("[openlibrary] Book not available to borrow", file=sys.stderr)
                return None
-            except SystemExit:
            except Exception:
                log("[openlibrary] Borrow failed", file=sys.stderr)
                return None

@@ -301,7 +925,7 @@ class OpenLibrary(SearchProvider):
        last_exc: Optional[Exception] = None
        for u in urls:
            try:
-                title_raw, links, _metadata = get_book_infos(session, u)
                title_raw, links, _metadata = self._archive_get_book_infos(session, u)
                if title_raw:
                    title = sanitize_filename(title_raw)
                    break
@@ -315,7 +939,7 @@ class OpenLibrary(SearchProvider):

        temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=str(output_dir))
        try:
-            images = archive_download(session=session, n_threads=10, directory=temp_dir, links=links, scale=3, book_id=archive_id)
            images = self._archive_download(session=session, n_threads=10, directory=temp_dir, links=links, scale=3, book_id=archive_id)

            try:
                import img2pdf  # type: ignore
