dfdfsdd
This commit is contained in:
@@ -12,6 +12,7 @@ import sys
|
|||||||
import time
|
import time
|
||||||
|
|
||||||
from SYS.logger import log, debug
|
from SYS.logger import log, debug
|
||||||
|
from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS as GLOBAL_SUPPORTED_EXTENSIONS
|
||||||
import tempfile
|
import tempfile
|
||||||
import logging
|
import logging
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
@@ -1103,9 +1104,7 @@ SUPPORTED_FILETYPES = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
# Flatten to get all supported extensions
|
# Flatten to get all supported extensions
|
||||||
ALL_SUPPORTED_EXTENSIONS = set()
|
ALL_SUPPORTED_EXTENSIONS = set(GLOBAL_SUPPORTED_EXTENSIONS)
|
||||||
for category_extensions in SUPPORTED_FILETYPES.values():
|
|
||||||
ALL_SUPPORTED_EXTENSIONS.update(category_extensions.keys())
|
|
||||||
|
|
||||||
|
|
||||||
# Global Hydrus client cache to reuse session keys
|
# Global Hydrus client cache to reuse session keys
|
||||||
|
|||||||
@@ -1,584 +0,0 @@
|
|||||||
"""Archive.org API client for borrowing and downloading books.
|
|
||||||
|
|
||||||
This module provides low-level functions for interacting with Archive.org:
|
|
||||||
- Authentication (login, credential management)
|
|
||||||
- Borrowing (loan, return_loan)
|
|
||||||
- Book metadata extraction (get_book_infos, get_book_metadata)
|
|
||||||
- Image downloading and deobfuscation
|
|
||||||
- PDF creation with metadata
|
|
||||||
|
|
||||||
Used by Provider/openlibrary.py for the borrowing workflow.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import base64
|
|
||||||
import hashlib
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
import time
|
|
||||||
from concurrent import futures
|
|
||||||
from typing import Any, Dict, List, Optional, Sequence, Tuple
|
|
||||||
|
|
||||||
import requests
|
|
||||||
|
|
||||||
from SYS.logger import log, debug
|
|
||||||
|
|
||||||
try:
|
|
||||||
from Crypto.Cipher import AES # type: ignore
|
|
||||||
from Crypto.Util import Counter # type: ignore
|
|
||||||
except ImportError:
|
|
||||||
AES = None # type: ignore
|
|
||||||
Counter = None # type: ignore
|
|
||||||
|
|
||||||
try:
|
|
||||||
from tqdm import tqdm # type: ignore
|
|
||||||
except ImportError:
|
|
||||||
tqdm = None # type: ignore
|
|
||||||
|
|
||||||
|
|
||||||
def credential_openlibrary(config: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
|
|
||||||
"""Get OpenLibrary/Archive.org email and password from config.
|
|
||||||
|
|
||||||
Supports both formats:
|
|
||||||
- New: {"provider": {"openlibrary": {"email": "...", "password": "..."}}}
|
|
||||||
- Old: {"Archive": {"email": "...", "password": "..."}}
|
|
||||||
{"archive_org_email": "...", "archive_org_password": "..."}
|
|
||||||
|
|
||||||
Returns: (email, password) tuple, each can be None
|
|
||||||
"""
|
|
||||||
if not isinstance(config, dict):
|
|
||||||
return None, None
|
|
||||||
|
|
||||||
# Try new format first
|
|
||||||
provider_config = config.get("provider", {})
|
|
||||||
if isinstance(provider_config, dict):
|
|
||||||
openlibrary_config = provider_config.get("openlibrary", {})
|
|
||||||
if isinstance(openlibrary_config, dict):
|
|
||||||
email = openlibrary_config.get("email")
|
|
||||||
password = openlibrary_config.get("password")
|
|
||||||
if email or password:
|
|
||||||
return email, password
|
|
||||||
|
|
||||||
# Try old nested format
|
|
||||||
archive_config = config.get("Archive")
|
|
||||||
if isinstance(archive_config, dict):
|
|
||||||
email = archive_config.get("email")
|
|
||||||
password = archive_config.get("password")
|
|
||||||
if email or password:
|
|
||||||
return email, password
|
|
||||||
|
|
||||||
# Fall back to old flat format
|
|
||||||
email = config.get("archive_org_email")
|
|
||||||
password = config.get("archive_org_password")
|
|
||||||
return email, password
|
|
||||||
|
|
||||||
|
|
||||||
class BookNotAvailableError(Exception):
|
|
||||||
"""Raised when a book is not available for borrowing (waitlisted/in use)."""
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def display_error(response: requests.Response, message: str) -> None:
|
|
||||||
"""Display error and exit."""
|
|
||||||
log(message, file=sys.stderr)
|
|
||||||
log(response.text, file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
def login(email: str, password: str) -> requests.Session:
|
|
||||||
"""Login to archive.org.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
email: Archive.org email
|
|
||||||
password: Archive.org password
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Authenticated requests.Session
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
SystemExit on login failure
|
|
||||||
"""
|
|
||||||
session = requests.Session()
|
|
||||||
session.get("https://archive.org/account/login", timeout=30)
|
|
||||||
|
|
||||||
data = {"username": email, "password": password}
|
|
||||||
response = session.post("https://archive.org/account/login", data=data, timeout=30)
|
|
||||||
|
|
||||||
if "bad_login" in response.text:
|
|
||||||
log("Invalid credentials!", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
if "Successful login" in response.text:
|
|
||||||
debug("Successful login")
|
|
||||||
return session
|
|
||||||
display_error(response, "[-] Error while login:")
|
|
||||||
sys.exit(1) # Unreachable but satisfies type checker
|
|
||||||
|
|
||||||
|
|
||||||
def loan(session: requests.Session, book_id: str, verbose: bool = True) -> requests.Session:
|
|
||||||
"""Borrow a book from archive.org (14-day loan).
|
|
||||||
|
|
||||||
Args:
|
|
||||||
session: Authenticated requests.Session from login()
|
|
||||||
book_id: Archive.org book identifier (e.g., 'ia_book_id')
|
|
||||||
verbose: Whether to log messages
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Session with active loan
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
SystemExit on loan failure
|
|
||||||
"""
|
|
||||||
data = {"action": "grant_access", "identifier": book_id}
|
|
||||||
response = session.post("https://archive.org/services/loans/loan/searchInside.php", data=data, timeout=30)
|
|
||||||
data["action"] = "browse_book"
|
|
||||||
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
|
|
||||||
|
|
||||||
if response.status_code == 400:
|
|
||||||
try:
|
|
||||||
if response.json()["error"] == "This book is not available to borrow at this time. Please try again later.":
|
|
||||||
debug("Book is not available for borrowing (waitlisted or in use)")
|
|
||||||
raise BookNotAvailableError("Book is waitlisted or in use")
|
|
||||||
display_error(response, "Something went wrong when trying to borrow the book.")
|
|
||||||
except BookNotAvailableError:
|
|
||||||
raise
|
|
||||||
except:
|
|
||||||
display_error(response, "The book cannot be borrowed")
|
|
||||||
|
|
||||||
data["action"] = "create_token"
|
|
||||||
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
|
|
||||||
|
|
||||||
if "token" in response.text:
|
|
||||||
if verbose:
|
|
||||||
debug("Successful loan")
|
|
||||||
return session
|
|
||||||
display_error(response, "Something went wrong when trying to borrow the book.")
|
|
||||||
sys.exit(1) # Unreachable but satisfies type checker
|
|
||||||
|
|
||||||
|
|
||||||
def return_loan(session: requests.Session, book_id: str) -> None:
|
|
||||||
"""Return a borrowed book.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
session: Authenticated requests.Session with active loan
|
|
||||||
book_id: Archive.org book identifier
|
|
||||||
"""
|
|
||||||
data = {"action": "return_loan", "identifier": book_id}
|
|
||||||
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
|
|
||||||
if response.status_code == 200 and response.json()["success"]:
|
|
||||||
debug("Book returned")
|
|
||||||
else:
|
|
||||||
display_error(response, "Something went wrong when trying to return the book")
|
|
||||||
|
|
||||||
|
|
||||||
def get_book_infos(session: requests.Session, url: str) -> Tuple[str, List[str], Dict[str, Any]]:
|
|
||||||
"""Extract book information and page links from archive.org viewer.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
session: Authenticated requests.Session
|
|
||||||
url: Book URL (e.g., https://archive.org/borrow/book_id or /details/book_id)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Tuple of (title, page_links, metadata)
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
RuntimeError: If page data cannot be extracted
|
|
||||||
"""
|
|
||||||
r = session.get(url, timeout=30).text
|
|
||||||
|
|
||||||
# Try to extract the infos URL from the response
|
|
||||||
try:
|
|
||||||
# Look for the "url" field in the response using regex
|
|
||||||
# Matches "url":"//archive.org/..."
|
|
||||||
import re
|
|
||||||
match = re.search(r'"url"\s*:\s*"([^"]+)"', r)
|
|
||||||
if not match:
|
|
||||||
raise ValueError("No 'url' field found in response")
|
|
||||||
|
|
||||||
url_path = match.group(1)
|
|
||||||
if url_path.startswith("//"):
|
|
||||||
infos_url = "https:" + url_path
|
|
||||||
else:
|
|
||||||
infos_url = url_path
|
|
||||||
|
|
||||||
infos_url = infos_url.replace("\\u0026", "&")
|
|
||||||
except (IndexError, ValueError, AttributeError) as e:
|
|
||||||
# If URL extraction fails, raise with better error message
|
|
||||||
raise RuntimeError(f"Failed to extract book info URL from response: {e}")
|
|
||||||
|
|
||||||
response = session.get(infos_url, timeout=30)
|
|
||||||
data = response.json()["data"]
|
|
||||||
title = data["brOptions"]["bookTitle"].strip().replace(" ", "_")
|
|
||||||
title = "".join(c for c in title if c not in '<>:"/\\|?*') # Filter forbidden chars
|
|
||||||
title = title[:150] # Trim to avoid long file names
|
|
||||||
metadata = data["metadata"]
|
|
||||||
links = []
|
|
||||||
|
|
||||||
# Safely extract page links from brOptions data
|
|
||||||
try:
|
|
||||||
br_data = data.get("brOptions", {}).get("data", [])
|
|
||||||
for item in br_data:
|
|
||||||
if isinstance(item, list):
|
|
||||||
for page in item:
|
|
||||||
if isinstance(page, dict) and "uri" in page:
|
|
||||||
links.append(page["uri"])
|
|
||||||
elif isinstance(item, dict) and "uri" in item:
|
|
||||||
links.append(item["uri"])
|
|
||||||
except (KeyError, IndexError, TypeError) as e:
|
|
||||||
log(f"Warning: Error parsing page links: {e}", file=sys.stderr)
|
|
||||||
# Continue with whatever links we found
|
|
||||||
|
|
||||||
if len(links) > 1:
|
|
||||||
debug(f"Found {len(links)} pages")
|
|
||||||
return title, links, metadata
|
|
||||||
elif len(links) == 1:
|
|
||||||
debug(f"Found {len(links)} page")
|
|
||||||
return title, links, metadata
|
|
||||||
else:
|
|
||||||
log("Error while getting image links - no pages found", file=sys.stderr)
|
|
||||||
raise RuntimeError("No pages found in book data")
|
|
||||||
|
|
||||||
|
|
||||||
def image_name(pages: int, page: int, directory: str) -> str:
|
|
||||||
"""Generate image filename for page.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
pages: Total number of pages
|
|
||||||
page: Current page number (0-indexed)
|
|
||||||
directory: Directory to save to
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Full path to image file
|
|
||||||
"""
|
|
||||||
return f"{directory}/{(len(str(pages)) - len(str(page))) * '0'}{page}.jpg"
|
|
||||||
|
|
||||||
|
|
||||||
def deobfuscate_image(image_data: bytes, link: str, obf_header: str) -> bytes:
|
|
||||||
"""Decrypt obfuscated image data using AES-CTR.
|
|
||||||
|
|
||||||
This handles Archive.org's image obfuscation for borrowed books.
|
|
||||||
Based on: https://github.com/justimm
|
|
||||||
|
|
||||||
Args:
|
|
||||||
image_data: Encrypted image bytes
|
|
||||||
link: Image URL (used to derive AES key)
|
|
||||||
obf_header: X-Obfuscate header value (format: "1|BASE64_COUNTER")
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Decrypted image bytes
|
|
||||||
"""
|
|
||||||
if not AES or not Counter:
|
|
||||||
raise RuntimeError("Crypto library not available")
|
|
||||||
|
|
||||||
try:
|
|
||||||
version, counter_b64 = obf_header.split("|")
|
|
||||||
except Exception as e:
|
|
||||||
raise ValueError("Invalid X-Obfuscate header format") from e
|
|
||||||
|
|
||||||
if version != "1":
|
|
||||||
raise ValueError("Unsupported obfuscation version: " + version)
|
|
||||||
|
|
||||||
# Derive AES key from URL
|
|
||||||
aesKey = re.sub(r"^https?:\/\/.*?\/", "/", link)
|
|
||||||
sha1_digest = hashlib.sha1(aesKey.encode("utf-8")).digest()
|
|
||||||
key = sha1_digest[:16]
|
|
||||||
|
|
||||||
# Decode counter
|
|
||||||
counter_bytes = base64.b64decode(counter_b64)
|
|
||||||
if len(counter_bytes) != 16:
|
|
||||||
raise ValueError(f"Expected counter to be 16 bytes, got {len(counter_bytes)}")
|
|
||||||
|
|
||||||
prefix = counter_bytes[:8]
|
|
||||||
initial_value = int.from_bytes(counter_bytes[8:], byteorder="big")
|
|
||||||
|
|
||||||
# Create AES-CTR cipher
|
|
||||||
ctr = Counter.new(64, prefix=prefix, initial_value=initial_value, little_endian=False) # type: ignore
|
|
||||||
cipher = AES.new(key, AES.MODE_CTR, counter=ctr) # type: ignore
|
|
||||||
|
|
||||||
decrypted_part = cipher.decrypt(image_data[:1024])
|
|
||||||
new_data = decrypted_part + image_data[1024:]
|
|
||||||
return new_data
|
|
||||||
|
|
||||||
|
|
||||||
def download_one_image(
|
|
||||||
session: requests.Session,
|
|
||||||
link: str,
|
|
||||||
i: int,
|
|
||||||
directory: str,
|
|
||||||
book_id: str,
|
|
||||||
pages: int,
|
|
||||||
) -> None:
|
|
||||||
"""Download a single book page image.
|
|
||||||
|
|
||||||
Handles obfuscated images and re-borrowing on 403 errors.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
session: Authenticated requests.Session
|
|
||||||
link: Direct image URL
|
|
||||||
i: Page index (0-based)
|
|
||||||
directory: Directory to save to
|
|
||||||
book_id: Archive.org book ID (for re-borrowing on 403)
|
|
||||||
pages: Total number of pages
|
|
||||||
"""
|
|
||||||
headers = {
|
|
||||||
"Referer": "https://archive.org/",
|
|
||||||
"Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
|
|
||||||
"Sec-Fetch-Site": "same-site",
|
|
||||||
"Sec-Fetch-Mode": "no-cors",
|
|
||||||
"Sec-Fetch-Dest": "image",
|
|
||||||
}
|
|
||||||
retry = True
|
|
||||||
response = None
|
|
||||||
while retry:
|
|
||||||
try:
|
|
||||||
response = session.get(link, headers=headers, timeout=30)
|
|
||||||
if response.status_code == 403:
|
|
||||||
session = loan(session, book_id, verbose=False)
|
|
||||||
raise Exception("Borrow again")
|
|
||||||
if response.status_code == 200:
|
|
||||||
retry = False
|
|
||||||
except:
|
|
||||||
time.sleep(1)
|
|
||||||
|
|
||||||
image = image_name(pages, i, directory)
|
|
||||||
|
|
||||||
if response is None:
|
|
||||||
log(f"Failed to download page {i}", file=sys.stderr)
|
|
||||||
return
|
|
||||||
|
|
||||||
obf_header = response.headers.get("X-Obfuscate")
|
|
||||||
image_content = None
|
|
||||||
if obf_header:
|
|
||||||
try:
|
|
||||||
image_content = deobfuscate_image(response.content, link, obf_header)
|
|
||||||
except Exception as e:
|
|
||||||
log(f"Deobfuscation failed: {e}", file=sys.stderr)
|
|
||||||
return
|
|
||||||
else:
|
|
||||||
image_content = response.content
|
|
||||||
|
|
||||||
with open(image, "wb") as f:
|
|
||||||
f.write(image_content)
|
|
||||||
|
|
||||||
|
|
||||||
def download(
|
|
||||||
session: requests.Session,
|
|
||||||
n_threads: int,
|
|
||||||
directory: str,
|
|
||||||
links: List[str],
|
|
||||||
scale: int,
|
|
||||||
book_id: str,
|
|
||||||
) -> List[str]:
|
|
||||||
"""Download all book pages as images.
|
|
||||||
|
|
||||||
Uses thread pool for parallel downloads.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
session: Authenticated requests.Session
|
|
||||||
n_threads: Number of download threads
|
|
||||||
directory: Directory to save images to
|
|
||||||
links: List of image url
|
|
||||||
scale: Image resolution (0=highest, 10=lowest)
|
|
||||||
book_id: Archive.org book ID (for re-borrowing)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of downloaded image file paths
|
|
||||||
"""
|
|
||||||
debug("Downloading pages...")
|
|
||||||
links = [f"{link}&rotate=0&scale={scale}" for link in links]
|
|
||||||
pages = len(links)
|
|
||||||
|
|
||||||
tasks = []
|
|
||||||
with futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
|
|
||||||
for link in links:
|
|
||||||
i = links.index(link)
|
|
||||||
tasks.append(
|
|
||||||
executor.submit(
|
|
||||||
download_one_image,
|
|
||||||
session=session,
|
|
||||||
link=link,
|
|
||||||
i=i,
|
|
||||||
directory=directory,
|
|
||||||
book_id=book_id,
|
|
||||||
pages=pages,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
if tqdm:
|
|
||||||
for _ in tqdm(futures.as_completed(tasks), total=len(tasks)): # type: ignore
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
for _ in futures.as_completed(tasks):
|
|
||||||
pass
|
|
||||||
|
|
||||||
images = [image_name(pages, i, directory) for i in range(len(links))]
|
|
||||||
return images
|
|
||||||
|
|
||||||
|
|
||||||
def check_direct_download(book_id: str) -> Tuple[bool, str]:
|
|
||||||
"""Check if a book can be downloaded directly without borrowing.
|
|
||||||
|
|
||||||
Searches Archive.org metadata for downloadable PDF files.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
book_id: Archive.org book identifier
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Tuple of (can_download: bool, pdf_url: str)
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
# First, try to get the metadata to find the actual PDF filename
|
|
||||||
metadata_url = f"https://archive.org/metadata/{book_id}"
|
|
||||||
response = requests.get(metadata_url, timeout=10)
|
|
||||||
response.raise_for_status()
|
|
||||||
metadata = response.json()
|
|
||||||
|
|
||||||
# Find PDF file in files list
|
|
||||||
if "files" in metadata:
|
|
||||||
for file_info in metadata["files"]:
|
|
||||||
filename = file_info.get("name", "")
|
|
||||||
if filename.endswith(".pdf") and file_info.get("source") == "original":
|
|
||||||
# Found the original PDF
|
|
||||||
pdf_filename = filename
|
|
||||||
pdf_url = f"https://archive.org/download/{book_id}/{pdf_filename.replace(' ', '%20')}"
|
|
||||||
|
|
||||||
# Verify it's accessible
|
|
||||||
check_response = requests.head(pdf_url, timeout=5, allow_redirects=True)
|
|
||||||
if check_response.status_code == 200:
|
|
||||||
return True, pdf_url
|
|
||||||
|
|
||||||
return False, ""
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
log(f"Error checking direct download: {e}", file=sys.stderr)
|
|
||||||
return False, ""
|
|
||||||
|
|
||||||
|
|
||||||
def get_openlibrary_by_isbn(isbn: str) -> Dict[str, Any]:
|
|
||||||
"""Fetch book data from OpenLibrary using ISBN.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
isbn: ISBN-10 or ISBN-13 to search for
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Dictionary with book metadata from OpenLibrary
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
# Try ISBN API first
|
|
||||||
api_url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn}&jscmd=data&format=json"
|
|
||||||
response = requests.get(api_url, timeout=10)
|
|
||||||
response.raise_for_status()
|
|
||||||
data = response.json()
|
|
||||||
|
|
||||||
if data:
|
|
||||||
# Get first result
|
|
||||||
key = list(data.keys())[0]
|
|
||||||
return data[key]
|
|
||||||
return {}
|
|
||||||
except Exception as e:
|
|
||||||
log(f"Error fetching OpenLibrary data by ISBN: {e}", file=sys.stderr)
|
|
||||||
return {}
|
|
||||||
|
|
||||||
|
|
||||||
def extract_isbn_from_metadata(metadata: Dict[str, Any]) -> str:
|
|
||||||
"""Extract ISBN from archive.org metadata.
|
|
||||||
|
|
||||||
Looks for ISBN in various metadata fields.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
metadata: Archive.org metadata dictionary
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
ISBN string (clean, no hyphens) or empty string if not found
|
|
||||||
"""
|
|
||||||
# Try various common metadata fields
|
|
||||||
isbn_fields = [
|
|
||||||
"isbn", "ISBN", "isbn_13", "isbn_10", "isbns",
|
|
||||||
"isbn-10", "isbn-13", "identifer_isbn"
|
|
||||||
]
|
|
||||||
|
|
||||||
for field in isbn_fields:
|
|
||||||
if field in metadata:
|
|
||||||
isbn_val = metadata[field]
|
|
||||||
if isinstance(isbn_val, list):
|
|
||||||
isbn_val = isbn_val[0] if isbn_val else None
|
|
||||||
if isbn_val and isinstance(isbn_val, str):
|
|
||||||
# Clean ISBN (remove hyphens, spaces)
|
|
||||||
isbn_clean = isbn_val.replace("-", "").replace(" ", "")
|
|
||||||
if len(isbn_clean) in [10, 13]:
|
|
||||||
return isbn_clean
|
|
||||||
|
|
||||||
return ""
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_url(url: str) -> str:
|
|
||||||
"""Convert openlibrary.org URL to archive.org URL.
|
|
||||||
|
|
||||||
Looks up the actual Archive.org ID from OpenLibrary API.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
url: Book URL (archive.org or openlibrary.org format)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Normalized archive.org URL
|
|
||||||
"""
|
|
||||||
url = url.strip()
|
|
||||||
|
|
||||||
# Already archive.org format
|
|
||||||
if url.startswith("https://archive.org/details/"):
|
|
||||||
return url
|
|
||||||
|
|
||||||
# Convert openlibrary.org format by querying the OpenLibrary API
|
|
||||||
if "openlibrary.org/books/" in url:
|
|
||||||
try:
|
|
||||||
# Extract the book ID (e.g., OL6796852M)
|
|
||||||
parts = url.split("/books/")
|
|
||||||
if len(parts) > 1:
|
|
||||||
book_id = parts[1].split("/")[0]
|
|
||||||
|
|
||||||
# Query OpenLibrary API to get the book metadata
|
|
||||||
api_url = f"https://openlibrary.org/books/{book_id}.json"
|
|
||||||
response = requests.get(api_url, timeout=10)
|
|
||||||
response.raise_for_status()
|
|
||||||
data = response.json()
|
|
||||||
|
|
||||||
# Look for identifiers including internet_archive or ocaid
|
|
||||||
# First try ocaid (Open Content Alliance ID) - this is most common
|
|
||||||
if "ocaid" in data:
|
|
||||||
ocaid = data["ocaid"]
|
|
||||||
return f"https://archive.org/details/{ocaid}"
|
|
||||||
|
|
||||||
# Check for identifiers object
|
|
||||||
if "identifiers" in data:
|
|
||||||
identifiers = data["identifiers"]
|
|
||||||
|
|
||||||
# Look for internet_archive ID
|
|
||||||
if "internet_archive" in identifiers:
|
|
||||||
ia_ids = identifiers["internet_archive"]
|
|
||||||
if isinstance(ia_ids, list) and ia_ids:
|
|
||||||
ia_id = ia_ids[0]
|
|
||||||
else:
|
|
||||||
ia_id = ia_ids
|
|
||||||
return f"https://archive.org/details/{ia_id}"
|
|
||||||
|
|
||||||
# If no IA identifier found, use the book ID as fallback
|
|
||||||
log(f"No Internet Archive ID found for {book_id}. Attempting with OpenLibrary ID.", file=sys.stderr)
|
|
||||||
return f"https://archive.org/details/{book_id}"
|
|
||||||
|
|
||||||
except requests.RequestException as e:
|
|
||||||
log(f"Could not fetch OpenLibrary metadata: {e}", file=sys.stderr)
|
|
||||||
# Fallback to using the book ID directly
|
|
||||||
parts = url.split("/books/")
|
|
||||||
if len(parts) > 1:
|
|
||||||
book_id = parts[1].split("/")[0]
|
|
||||||
return f"https://archive.org/details/{book_id}"
|
|
||||||
except (KeyError, IndexError) as e:
|
|
||||||
log(f"Error parsing OpenLibrary response: {e}", file=sys.stderr)
|
|
||||||
# Fallback to using the book ID directly
|
|
||||||
parts = url.split("/books/")
|
|
||||||
if len(parts) > 1:
|
|
||||||
book_id = parts[1].split("/")[0]
|
|
||||||
return f"https://archive.org/details/{book_id}"
|
|
||||||
|
|
||||||
# Return original if can't parse
|
|
||||||
return url
|
|
||||||
128
API/folder.py
128
API/folder.py
@@ -407,38 +407,53 @@ class API_folder_store:
|
|||||||
logger.error(f"Error clearing worker log for {worker_id}: {exc}", exc_info=True)
|
logger.error(f"Error clearing worker log for {worker_id}: {exc}", exc_info=True)
|
||||||
|
|
||||||
def _migrate_metadata_schema(self, cursor) -> None:
|
def _migrate_metadata_schema(self, cursor) -> None:
|
||||||
"""Import legacy metadata from old schema if present. Existing hash-based schema is ready to use."""
|
"""Ensure metadata schema is up-to-date.
|
||||||
|
|
||||||
|
- If a legacy schema is detected, attempt to import/upgrade (best-effort).
|
||||||
|
- If the hash-based schema exists, add any missing columns expected by current code.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
# Check if this is a fresh new database (hash-based schema)
|
# Check if this is a fresh new database (hash-based schema)
|
||||||
cursor.execute('PRAGMA table_info(metadata)')
|
cursor.execute('PRAGMA table_info(metadata)')
|
||||||
existing_columns = {row[1] for row in cursor.fetchall()}
|
existing_columns = {row[1] for row in cursor.fetchall()}
|
||||||
|
|
||||||
# If hash column exists, we're already on the new schema
|
# Legacy migration: If old schema exists, try to import data.
|
||||||
if 'hash' in existing_columns:
|
|
||||||
logger.info("Database is already using hash-based schema - no migration needed")
|
|
||||||
return
|
|
||||||
|
|
||||||
# Legacy migration: If old schema exists, try to import data
|
|
||||||
# Old schema would have had: id (INTEGER PRIMARY KEY), file_hash (TEXT), etc.
|
# Old schema would have had: id (INTEGER PRIMARY KEY), file_hash (TEXT), etc.
|
||||||
|
if 'hash' not in existing_columns:
|
||||||
if 'id' in existing_columns and 'file_hash' in existing_columns:
|
if 'id' in existing_columns and 'file_hash' in existing_columns:
|
||||||
logger.info("Detected legacy metadata schema - importing to new hash-based schema")
|
logger.info("Detected legacy metadata schema - importing to new hash-based schema")
|
||||||
# This would be complex legacy migration - for now just note it
|
# This would be complex legacy migration - for now just note it.
|
||||||
logger.info("Legacy metadata table detected but import not yet implemented")
|
logger.info("Legacy metadata table detected but import not yet implemented")
|
||||||
return
|
return
|
||||||
|
|
||||||
# Add any missing columns to the new schema
|
# Unknown/unsupported schema; nothing we can safely do here.
|
||||||
for col_name, col_def in [('size', 'INTEGER'), ('ext', 'TEXT'),
|
return
|
||||||
('type', 'TEXT'),
|
|
||||||
('time_imported', 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP'),
|
# Hash-based schema exists: add any missing columns expected by current code.
|
||||||
('time_modified', 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP')]:
|
# These are safe ALTER TABLE additions for older DBs.
|
||||||
|
column_specs = {
|
||||||
|
'size': 'INTEGER',
|
||||||
|
'ext': 'TEXT',
|
||||||
|
'type': 'TEXT',
|
||||||
|
'url': 'TEXT',
|
||||||
|
'relationships': 'TEXT',
|
||||||
|
'duration': 'REAL',
|
||||||
|
'time_imported': 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP',
|
||||||
|
'time_modified': 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP',
|
||||||
|
'created_at': 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP',
|
||||||
|
'updated_at': 'TIMESTAMP DEFAULT CURRENT_TIMESTAMP',
|
||||||
|
}
|
||||||
|
|
||||||
|
for col_name, col_def in column_specs.items():
|
||||||
if col_name not in existing_columns:
|
if col_name not in existing_columns:
|
||||||
try:
|
try:
|
||||||
cursor.execute(f"ALTER TABLE metadata ADD COLUMN {col_name} {col_def}")
|
cursor.execute(f"ALTER TABLE metadata ADD COLUMN {col_name} {col_def}")
|
||||||
|
existing_columns.add(col_name)
|
||||||
logger.info(f"Added '{col_name}' column to metadata table")
|
logger.info(f"Added '{col_name}' column to metadata table")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(f"Column '{col_name}' may already exist: {e}")
|
logger.debug(f"Column '{col_name}' may already exist: {e}")
|
||||||
|
|
||||||
# Populate type column from ext if not already populated
|
# Populate type column from ext if not already populated.
|
||||||
if 'type' in existing_columns and 'ext' in existing_columns:
|
if 'type' in existing_columns and 'ext' in existing_columns:
|
||||||
try:
|
try:
|
||||||
from SYS.utils_constant import get_type_from_ext
|
from SYS.utils_constant import get_type_from_ext
|
||||||
@@ -929,6 +944,13 @@ class API_folder_store:
|
|||||||
if not fields:
|
if not fields:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Ensure a metadata row exists so updates don't silently no-op.
|
||||||
|
# This can happen for older DBs or entries created without explicit metadata.
|
||||||
|
cursor.execute(
|
||||||
|
"INSERT OR IGNORE INTO metadata (hash) VALUES (?)",
|
||||||
|
(file_hash,),
|
||||||
|
)
|
||||||
|
|
||||||
values.append(file_hash)
|
values.append(file_hash)
|
||||||
|
|
||||||
sql = f"UPDATE metadata SET {', '.join(fields)}, time_modified = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP WHERE hash = ?"
|
sql = f"UPDATE metadata SET {', '.join(fields)}, time_modified = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP WHERE hash = ?"
|
||||||
@@ -1681,6 +1703,84 @@ class DatabaseAPI:
|
|||||||
)
|
)
|
||||||
return {row[0] for row in cursor.fetchall()}
|
return {row[0] for row in cursor.fetchall()}
|
||||||
|
|
||||||
|
def get_file_hashes_with_any_url(self, limit: Optional[int] = None) -> Set[str]:
|
||||||
|
"""Get hashes of files that have any non-empty URL metadata."""
|
||||||
|
cursor = self.get_cursor()
|
||||||
|
cursor.execute(
|
||||||
|
"""
|
||||||
|
SELECT DISTINCT f.hash
|
||||||
|
FROM files f
|
||||||
|
JOIN metadata m ON f.hash = m.hash
|
||||||
|
WHERE m.url IS NOT NULL
|
||||||
|
AND TRIM(m.url) != ''
|
||||||
|
AND TRIM(m.url) != '[]'
|
||||||
|
LIMIT ?
|
||||||
|
""",
|
||||||
|
(limit or 10000,),
|
||||||
|
)
|
||||||
|
return {row[0] for row in cursor.fetchall()}
|
||||||
|
|
||||||
|
def get_file_hashes_by_url_like(self, like_pattern: str, limit: Optional[int] = None) -> Set[str]:
|
||||||
|
"""Get hashes of files whose URL metadata contains a substring (case-insensitive)."""
|
||||||
|
cursor = self.get_cursor()
|
||||||
|
cursor.execute(
|
||||||
|
"""
|
||||||
|
SELECT DISTINCT f.hash
|
||||||
|
FROM files f
|
||||||
|
JOIN metadata m ON f.hash = m.hash
|
||||||
|
WHERE m.url IS NOT NULL
|
||||||
|
AND LOWER(m.url) LIKE ?
|
||||||
|
LIMIT ?
|
||||||
|
""",
|
||||||
|
(like_pattern.lower(), limit or 10000),
|
||||||
|
)
|
||||||
|
return {row[0] for row in cursor.fetchall()}
|
||||||
|
|
||||||
|
def get_files_with_any_url(self, limit: Optional[int] = None) -> List[tuple]:
|
||||||
|
"""Get files that have any non-empty URL metadata.
|
||||||
|
|
||||||
|
Returns (hash, file_path, size, ext) tuples.
|
||||||
|
"""
|
||||||
|
cursor = self.get_cursor()
|
||||||
|
cursor.execute(
|
||||||
|
"""
|
||||||
|
SELECT f.hash, f.file_path,
|
||||||
|
COALESCE((SELECT size FROM metadata WHERE hash = f.hash), 0) as size,
|
||||||
|
COALESCE((SELECT ext FROM metadata WHERE hash = f.hash), '') as ext
|
||||||
|
FROM files f
|
||||||
|
JOIN metadata m ON f.hash = m.hash
|
||||||
|
WHERE m.url IS NOT NULL
|
||||||
|
AND TRIM(m.url) != ''
|
||||||
|
AND TRIM(m.url) != '[]'
|
||||||
|
ORDER BY f.file_path
|
||||||
|
LIMIT ?
|
||||||
|
""",
|
||||||
|
(limit or 10000,),
|
||||||
|
)
|
||||||
|
return cursor.fetchall()
|
||||||
|
|
||||||
|
def get_files_by_url_like(self, like_pattern: str, limit: Optional[int] = None) -> List[tuple]:
|
||||||
|
"""Get files whose URL metadata contains a substring (case-insensitive).
|
||||||
|
|
||||||
|
Returns (hash, file_path, size, ext) tuples.
|
||||||
|
"""
|
||||||
|
cursor = self.get_cursor()
|
||||||
|
cursor.execute(
|
||||||
|
"""
|
||||||
|
SELECT f.hash, f.file_path,
|
||||||
|
COALESCE((SELECT size FROM metadata WHERE hash = f.hash), 0) as size,
|
||||||
|
COALESCE((SELECT ext FROM metadata WHERE hash = f.hash), '') as ext
|
||||||
|
FROM files f
|
||||||
|
JOIN metadata m ON f.hash = m.hash
|
||||||
|
WHERE m.url IS NOT NULL
|
||||||
|
AND LOWER(m.url) LIKE ?
|
||||||
|
ORDER BY f.file_path
|
||||||
|
LIMIT ?
|
||||||
|
""",
|
||||||
|
(like_pattern.lower(), limit or 10000),
|
||||||
|
)
|
||||||
|
return cursor.fetchall()
|
||||||
|
|
||||||
def get_file_metadata(self, file_hashes: Set[str], limit: Optional[int] = None) -> List[tuple]:
|
def get_file_metadata(self, file_hashes: Set[str], limit: Optional[int] = None) -> List[tuple]:
|
||||||
"""Get metadata for files given their hashes. Returns (hash, file_path, size, extension) tuples."""
|
"""Get metadata for files given their hashes. Returns (hash, file_path, size, extension) tuples."""
|
||||||
if not file_hashes:
|
if not file_hashes:
|
||||||
|
|||||||
35
CLI.py
35
CLI.py
@@ -1498,6 +1498,9 @@ def _execute_pipeline(tokens: list):
|
|||||||
elif table_type == 'soulseek':
|
elif table_type == 'soulseek':
|
||||||
print(f"Auto-piping Soulseek selection to download-file")
|
print(f"Auto-piping Soulseek selection to download-file")
|
||||||
stages.append(['download-file'])
|
stages.append(['download-file'])
|
||||||
|
elif table_type == 'openlibrary':
|
||||||
|
print(f"Auto-piping OpenLibrary selection to download-file")
|
||||||
|
stages.append(['download-file'])
|
||||||
elif source_cmd == 'search-file' and source_args and 'youtube' in source_args:
|
elif source_cmd == 'search-file' and source_args and 'youtube' in source_args:
|
||||||
# Legacy check
|
# Legacy check
|
||||||
print(f"Auto-piping YouTube selection to .pipe")
|
print(f"Auto-piping YouTube selection to .pipe")
|
||||||
@@ -1667,6 +1670,35 @@ def _execute_pipeline(tokens: list):
|
|||||||
filtered_pipe_objs = [coerce_to_pipe_object(item) for item in filtered]
|
filtered_pipe_objs = [coerce_to_pipe_object(item) for item in filtered]
|
||||||
piped_result = filtered_pipe_objs if len(filtered_pipe_objs) > 1 else filtered_pipe_objs[0]
|
piped_result = filtered_pipe_objs if len(filtered_pipe_objs) > 1 else filtered_pipe_objs[0]
|
||||||
print(f"Selected {len(filtered)} item(s) using {cmd_name}")
|
print(f"Selected {len(filtered)} item(s) using {cmd_name}")
|
||||||
|
|
||||||
|
# If selection is the last stage and looks like a provider result,
|
||||||
|
# auto-initiate the borrow/download flow.
|
||||||
|
if stage_index + 1 >= len(stages):
|
||||||
|
try:
|
||||||
|
from ProviderCore.registry import get_search_provider as _get_search_provider
|
||||||
|
except Exception:
|
||||||
|
_get_search_provider = None
|
||||||
|
|
||||||
|
if _get_search_provider is not None:
|
||||||
|
selected_list = filtered_pipe_objs
|
||||||
|
provider_table: Optional[str] = None
|
||||||
|
try:
|
||||||
|
for obj in selected_list:
|
||||||
|
extra = getattr(obj, "extra", None)
|
||||||
|
if isinstance(extra, dict) and extra.get("table"):
|
||||||
|
provider_table = str(extra.get("table"))
|
||||||
|
break
|
||||||
|
except Exception:
|
||||||
|
provider_table = None
|
||||||
|
|
||||||
|
if provider_table:
|
||||||
|
try:
|
||||||
|
provider = _get_search_provider(provider_table, config)
|
||||||
|
except Exception:
|
||||||
|
provider = None
|
||||||
|
if provider is not None:
|
||||||
|
print("Auto-downloading selection via download-file")
|
||||||
|
stages.append(["download-file"])
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
print(f"No items matched selection {cmd_name}\n")
|
print(f"No items matched selection {cmd_name}\n")
|
||||||
@@ -1736,13 +1768,14 @@ def _execute_pipeline(tokens: list):
|
|||||||
}
|
}
|
||||||
# Display-only commands (just show data, don't modify or search)
|
# Display-only commands (just show data, don't modify or search)
|
||||||
display_only_commands = {
|
display_only_commands = {
|
||||||
'get-url', 'get_url', 'get-note', 'get_note',
|
'get-note', 'get_note',
|
||||||
'get-relationship', 'get_relationship', 'get-file', 'get_file',
|
'get-relationship', 'get_relationship', 'get-file', 'get_file',
|
||||||
'check-file-status', 'check_file_status'
|
'check-file-status', 'check_file_status'
|
||||||
}
|
}
|
||||||
# Commands that manage their own table/history state (e.g. get-tag)
|
# Commands that manage their own table/history state (e.g. get-tag)
|
||||||
self_managing_commands = {
|
self_managing_commands = {
|
||||||
'get-tag', 'get_tag', 'tags',
|
'get-tag', 'get_tag', 'tags',
|
||||||
|
'get-url', 'get_url',
|
||||||
'search-file', 'search_file'
|
'search-file', 'search_file'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,19 +1,38 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import base64
|
||||||
|
from concurrent import futures
|
||||||
|
import hashlib
|
||||||
|
import json as json_module
|
||||||
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import sys
|
import sys
|
||||||
import tempfile
|
import tempfile
|
||||||
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict, List, Optional, Tuple
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
from API.HTTP import HTTPClient
|
||||||
from ProviderCore.base import SearchProvider, SearchResult
|
from ProviderCore.base import SearchProvider, SearchResult
|
||||||
from ProviderCore.download import download_file, sanitize_filename
|
from ProviderCore.download import download_file, sanitize_filename
|
||||||
from cli_syntax import get_field, get_free_text, parse_query
|
from cli_syntax import get_field, get_free_text, parse_query
|
||||||
from SYS.logger import log
|
from SYS.logger import log
|
||||||
from SYS.utils import unique_path
|
from SYS.utils import unique_path
|
||||||
|
|
||||||
|
try:
|
||||||
|
from Crypto.Cipher import AES # type: ignore
|
||||||
|
from Crypto.Util import Counter # type: ignore
|
||||||
|
except ImportError:
|
||||||
|
AES = None # type: ignore
|
||||||
|
Counter = None # type: ignore
|
||||||
|
|
||||||
|
try:
|
||||||
|
from tqdm import tqdm # type: ignore
|
||||||
|
except ImportError:
|
||||||
|
tqdm = None # type: ignore
|
||||||
|
|
||||||
|
|
||||||
def _looks_like_isbn(text: str) -> bool:
|
def _looks_like_isbn(text: str) -> bool:
|
||||||
t = (text or "").replace("-", "").strip()
|
t = (text or "").replace("-", "").strip()
|
||||||
@@ -38,6 +57,13 @@ def _resolve_edition_id(doc: Dict[str, Any]) -> str:
|
|||||||
edition_key = doc.get("edition_key")
|
edition_key = doc.get("edition_key")
|
||||||
if isinstance(edition_key, list) and edition_key:
|
if isinstance(edition_key, list) and edition_key:
|
||||||
return str(edition_key[0]).strip()
|
return str(edition_key[0]).strip()
|
||||||
|
if isinstance(edition_key, str) and edition_key.strip():
|
||||||
|
return edition_key.strip()
|
||||||
|
|
||||||
|
# Often present even when edition_key is missing.
|
||||||
|
cover_edition_key = doc.get("cover_edition_key")
|
||||||
|
if isinstance(cover_edition_key, str) and cover_edition_key.strip():
|
||||||
|
return cover_edition_key.strip()
|
||||||
|
|
||||||
# Fallback: sometimes key can be /books/OL...M
|
# Fallback: sometimes key can be /books/OL...M
|
||||||
key = doc.get("key")
|
key = doc.get("key")
|
||||||
@@ -54,7 +80,7 @@ def _check_lendable(session: requests.Session, edition_id: str) -> Tuple[bool, s
|
|||||||
return False, "not-an-edition"
|
return False, "not-an-edition"
|
||||||
|
|
||||||
url = f"https://openlibrary.org/api/volumes/brief/json/OLID:{edition_id}"
|
url = f"https://openlibrary.org/api/volumes/brief/json/OLID:{edition_id}"
|
||||||
resp = session.get(url, timeout=10)
|
resp = session.get(url, timeout=6)
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
data = resp.json() or {}
|
data = resp.json() or {}
|
||||||
wrapped = data.get(f"OLID:{edition_id}")
|
wrapped = data.get(f"OLID:{edition_id}")
|
||||||
@@ -88,7 +114,7 @@ def _resolve_archive_id(session: requests.Session, edition_id: str, ia_candidate
|
|||||||
|
|
||||||
# Otherwise query the edition JSON.
|
# Otherwise query the edition JSON.
|
||||||
try:
|
try:
|
||||||
resp = session.get(f"https://openlibrary.org/books/{edition_id}.json", timeout=10)
|
resp = session.get(f"https://openlibrary.org/books/{edition_id}.json", timeout=6)
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
data = resp.json() or {}
|
data = resp.json() or {}
|
||||||
|
|
||||||
@@ -116,6 +142,522 @@ class OpenLibrary(SearchProvider):
|
|||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
self._session = requests.Session()
|
self._session = requests.Session()
|
||||||
|
|
||||||
|
class BookNotAvailableError(Exception):
|
||||||
|
"""Raised when a book is not available for borrowing (waitlisted/in use)."""
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _credential_archive(config: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
|
||||||
|
"""Get Archive.org email/password from config.
|
||||||
|
|
||||||
|
Supports:
|
||||||
|
- New: {"provider": {"openlibrary": {"email": "...", "password": "..."}}}
|
||||||
|
- Old: {"Archive": {"email": "...", "password": "..."}}
|
||||||
|
{"archive_org_email": "...", "archive_org_password": "..."}
|
||||||
|
"""
|
||||||
|
if not isinstance(config, dict):
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
provider_config = config.get("provider", {})
|
||||||
|
if isinstance(provider_config, dict):
|
||||||
|
openlibrary_config = provider_config.get("openlibrary", {})
|
||||||
|
if isinstance(openlibrary_config, dict):
|
||||||
|
email = openlibrary_config.get("email")
|
||||||
|
password = openlibrary_config.get("password")
|
||||||
|
if email or password:
|
||||||
|
return str(email) if email is not None else None, str(password) if password is not None else None
|
||||||
|
|
||||||
|
archive_config = config.get("Archive")
|
||||||
|
if isinstance(archive_config, dict):
|
||||||
|
email = archive_config.get("email")
|
||||||
|
password = archive_config.get("password")
|
||||||
|
if email or password:
|
||||||
|
return str(email) if email is not None else None, str(password) if password is not None else None
|
||||||
|
|
||||||
|
email = config.get("archive_org_email")
|
||||||
|
password = config.get("archive_org_password")
|
||||||
|
return str(email) if email is not None else None, str(password) if password is not None else None
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _archive_error_body(response: requests.Response) -> str:
|
||||||
|
try:
|
||||||
|
body = response.text or ""
|
||||||
|
except Exception:
|
||||||
|
return ""
|
||||||
|
if len(body) > 2000:
|
||||||
|
return body[:1200] + "\n... (truncated) ...\n" + body[-400:]
|
||||||
|
return body
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _archive_login(cls, email: str, password: str) -> requests.Session:
|
||||||
|
"""Login to archive.org using the token-based services endpoint (matches test-login.py)."""
|
||||||
|
session = requests.Session()
|
||||||
|
|
||||||
|
token_resp = session.get("https://archive.org/services/account/login/", timeout=30)
|
||||||
|
try:
|
||||||
|
token_json = token_resp.json()
|
||||||
|
except Exception as exc:
|
||||||
|
raise RuntimeError(f"Archive login token parse failed: {exc}\n{cls._archive_error_body(token_resp)}")
|
||||||
|
|
||||||
|
if not token_json.get("success"):
|
||||||
|
raise RuntimeError(f"Archive login token fetch failed\n{cls._archive_error_body(token_resp)}")
|
||||||
|
|
||||||
|
token = (token_json.get("value") or {}).get("token")
|
||||||
|
if not token:
|
||||||
|
raise RuntimeError("Archive login token missing")
|
||||||
|
|
||||||
|
headers = {"Content-Type": "application/x-www-form-urlencoded"}
|
||||||
|
payload = {"username": email, "password": password, "t": token}
|
||||||
|
|
||||||
|
login_resp = session.post(
|
||||||
|
"https://archive.org/services/account/login/",
|
||||||
|
headers=headers,
|
||||||
|
data=json_module.dumps(payload),
|
||||||
|
timeout=30,
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
login_json = login_resp.json()
|
||||||
|
except Exception as exc:
|
||||||
|
raise RuntimeError(f"Archive login parse failed: {exc}\n{cls._archive_error_body(login_resp)}")
|
||||||
|
|
||||||
|
if login_json.get("success") is False:
|
||||||
|
if login_json.get("value") == "bad_login":
|
||||||
|
raise RuntimeError("Invalid Archive.org credentials")
|
||||||
|
raise RuntimeError(f"Archive login failed: {login_json}")
|
||||||
|
|
||||||
|
return session
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _archive_loan(cls, session: requests.Session, book_id: str, *, verbose: bool = True) -> requests.Session:
|
||||||
|
data = {"action": "grant_access", "identifier": book_id}
|
||||||
|
session.post("https://archive.org/services/loans/loan/searchInside.php", data=data, timeout=30)
|
||||||
|
data["action"] = "browse_book"
|
||||||
|
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
|
||||||
|
|
||||||
|
if response.status_code == 400:
|
||||||
|
try:
|
||||||
|
err = (response.json() or {}).get("error")
|
||||||
|
if err == "This book is not available to borrow at this time. Please try again later.":
|
||||||
|
raise cls.BookNotAvailableError("Book is waitlisted or in use")
|
||||||
|
raise RuntimeError(f"Borrow failed: {err or response.text}")
|
||||||
|
except cls.BookNotAvailableError:
|
||||||
|
raise
|
||||||
|
except Exception:
|
||||||
|
raise RuntimeError("The book cannot be borrowed")
|
||||||
|
|
||||||
|
data["action"] = "create_token"
|
||||||
|
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
|
||||||
|
if "token" in (response.text or ""):
|
||||||
|
return session
|
||||||
|
raise RuntimeError("Something went wrong when trying to borrow the book")
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _archive_return_loan(session: requests.Session, book_id: str) -> None:
|
||||||
|
data = {"action": "return_loan", "identifier": book_id}
|
||||||
|
response = session.post("https://archive.org/services/loans/loan/", data=data, timeout=30)
|
||||||
|
if response.status_code == 200:
|
||||||
|
try:
|
||||||
|
if (response.json() or {}).get("success"):
|
||||||
|
return
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
raise RuntimeError("Something went wrong when trying to return the book")
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _archive_get_book_infos(session: requests.Session, url: str) -> Tuple[str, List[str], Dict[str, Any]]:
|
||||||
|
"""Extract page links from Archive.org book reader."""
|
||||||
|
r = session.get(url, timeout=30).text
|
||||||
|
|
||||||
|
# Matches: "url":"//archive.org/..." (allow whitespace)
|
||||||
|
match = re.search(r'"url"\s*:\s*"([^"]+)"', r)
|
||||||
|
if not match:
|
||||||
|
raise RuntimeError("Failed to extract book info URL from response")
|
||||||
|
|
||||||
|
url_path = match.group(1)
|
||||||
|
infos_url = ("https:" + url_path) if url_path.startswith("//") else url_path
|
||||||
|
infos_url = infos_url.replace("\\u0026", "&")
|
||||||
|
|
||||||
|
response = session.get(infos_url, timeout=30)
|
||||||
|
payload = response.json()
|
||||||
|
data = payload["data"]
|
||||||
|
|
||||||
|
title = str(data["brOptions"]["bookTitle"]).strip().replace(" ", "_")
|
||||||
|
title = "".join(c for c in title if c not in '<>:"/\\|?*')
|
||||||
|
title = title[:150]
|
||||||
|
|
||||||
|
metadata = data.get("metadata") or {}
|
||||||
|
links: List[str] = []
|
||||||
|
br_data = (data.get("brOptions") or {}).get("data", [])
|
||||||
|
if isinstance(br_data, list):
|
||||||
|
for item in br_data:
|
||||||
|
if isinstance(item, list):
|
||||||
|
for page in item:
|
||||||
|
if isinstance(page, dict) and "uri" in page:
|
||||||
|
links.append(page["uri"])
|
||||||
|
elif isinstance(item, dict) and "uri" in item:
|
||||||
|
links.append(item["uri"])
|
||||||
|
|
||||||
|
if not links:
|
||||||
|
raise RuntimeError("No pages found in book data")
|
||||||
|
return title, links, metadata if isinstance(metadata, dict) else {}
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _archive_image_name(pages: int, page: int, directory: str) -> str:
|
||||||
|
return f"{directory}/{(len(str(pages)) - len(str(page))) * '0'}{page}.jpg"
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _archive_deobfuscate_image(image_data: bytes, link: str, obf_header: str) -> bytes:
|
||||||
|
if not AES or not Counter:
|
||||||
|
raise RuntimeError("Crypto library not available")
|
||||||
|
|
||||||
|
try:
|
||||||
|
version, counter_b64 = obf_header.split("|")
|
||||||
|
except Exception as exc:
|
||||||
|
raise ValueError("Invalid X-Obfuscate header format") from exc
|
||||||
|
|
||||||
|
if version != "1":
|
||||||
|
raise ValueError("Unsupported obfuscation version: " + version)
|
||||||
|
|
||||||
|
aes_key = re.sub(r"^https?:\/\/.*?\/", "/", link)
|
||||||
|
sha1_digest = hashlib.sha1(aes_key.encode("utf-8")).digest()
|
||||||
|
key = sha1_digest[:16]
|
||||||
|
|
||||||
|
counter_bytes = base64.b64decode(counter_b64)
|
||||||
|
if len(counter_bytes) != 16:
|
||||||
|
raise ValueError(f"Expected counter to be 16 bytes, got {len(counter_bytes)}")
|
||||||
|
|
||||||
|
prefix = counter_bytes[:8]
|
||||||
|
initial_value = int.from_bytes(counter_bytes[8:], byteorder="big")
|
||||||
|
ctr = Counter.new(64, prefix=prefix, initial_value=initial_value, little_endian=False) # type: ignore
|
||||||
|
cipher = AES.new(key, AES.MODE_CTR, counter=ctr) # type: ignore
|
||||||
|
|
||||||
|
decrypted_part = cipher.decrypt(image_data[:1024])
|
||||||
|
return decrypted_part + image_data[1024:]
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _archive_download_one_image(
|
||||||
|
cls,
|
||||||
|
session: requests.Session,
|
||||||
|
link: str,
|
||||||
|
i: int,
|
||||||
|
directory: str,
|
||||||
|
book_id: str,
|
||||||
|
pages: int,
|
||||||
|
) -> None:
|
||||||
|
headers = {
|
||||||
|
"Referer": "https://archive.org/",
|
||||||
|
"Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
|
||||||
|
"Sec-Fetch-Site": "same-site",
|
||||||
|
"Sec-Fetch-Mode": "no-cors",
|
||||||
|
"Sec-Fetch-Dest": "image",
|
||||||
|
}
|
||||||
|
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
response = session.get(link, headers=headers, timeout=30)
|
||||||
|
if response.status_code == 403:
|
||||||
|
cls._archive_loan(session, book_id, verbose=False)
|
||||||
|
raise RuntimeError("Borrow again")
|
||||||
|
if response.status_code == 200:
|
||||||
|
break
|
||||||
|
except Exception:
|
||||||
|
time.sleep(1)
|
||||||
|
|
||||||
|
image = cls._archive_image_name(pages, i, directory)
|
||||||
|
obf_header = response.headers.get("X-Obfuscate")
|
||||||
|
if obf_header:
|
||||||
|
image_content = cls._archive_deobfuscate_image(response.content, link, obf_header)
|
||||||
|
else:
|
||||||
|
image_content = response.content
|
||||||
|
|
||||||
|
with open(image, "wb") as f:
|
||||||
|
f.write(image_content)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _archive_download(
|
||||||
|
cls,
|
||||||
|
session: requests.Session,
|
||||||
|
n_threads: int,
|
||||||
|
directory: str,
|
||||||
|
links: List[str],
|
||||||
|
scale: int,
|
||||||
|
book_id: str,
|
||||||
|
) -> List[str]:
|
||||||
|
links_scaled = [f"{link}&rotate=0&scale={scale}" for link in links]
|
||||||
|
pages = len(links_scaled)
|
||||||
|
|
||||||
|
tasks = []
|
||||||
|
with futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
|
||||||
|
for i, link in enumerate(links_scaled):
|
||||||
|
tasks.append(
|
||||||
|
executor.submit(
|
||||||
|
cls._archive_download_one_image,
|
||||||
|
session=session,
|
||||||
|
link=link,
|
||||||
|
i=i,
|
||||||
|
directory=directory,
|
||||||
|
book_id=book_id,
|
||||||
|
pages=pages,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
if tqdm:
|
||||||
|
for _ in tqdm(futures.as_completed(tasks), total=len(tasks)): # type: ignore
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
for _ in futures.as_completed(tasks):
|
||||||
|
pass
|
||||||
|
|
||||||
|
return [cls._archive_image_name(pages, i, directory) for i in range(pages)]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _archive_check_direct_download(book_id: str) -> Tuple[bool, str]:
|
||||||
|
"""Check for a directly downloadable original PDF in Archive.org metadata."""
|
||||||
|
try:
|
||||||
|
metadata_url = f"https://archive.org/metadata/{book_id}"
|
||||||
|
response = requests.get(metadata_url, timeout=6)
|
||||||
|
response.raise_for_status()
|
||||||
|
metadata = response.json()
|
||||||
|
files = metadata.get("files") if isinstance(metadata, dict) else None
|
||||||
|
if isinstance(files, list):
|
||||||
|
for file_info in files:
|
||||||
|
if not isinstance(file_info, dict):
|
||||||
|
continue
|
||||||
|
filename = str(file_info.get("name", ""))
|
||||||
|
if filename.endswith(".pdf") and file_info.get("source") == "original":
|
||||||
|
pdf_url = f"https://archive.org/download/{book_id}/{filename.replace(' ', '%20')}"
|
||||||
|
check_response = requests.head(pdf_url, timeout=4, allow_redirects=True)
|
||||||
|
if check_response.status_code == 200:
|
||||||
|
return True, pdf_url
|
||||||
|
return False, ""
|
||||||
|
except Exception:
|
||||||
|
return False, ""
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def scrape_isbn_metadata(isbn: str) -> List[str]:
|
||||||
|
"""Scrape tags for an ISBN using Open Library API.
|
||||||
|
|
||||||
|
Returns tags such as:
|
||||||
|
- title:<...>, author:<...>, publish_date:<...>, publisher:<...>, description:<...>, pages:<...>
|
||||||
|
- identifiers: openlibrary:<...>, lccn:<...>, oclc:<...>, goodreads:<...>, librarything:<...>, doi:<...>, internet_archive:<...>
|
||||||
|
"""
|
||||||
|
new_tags: List[str] = []
|
||||||
|
|
||||||
|
isbn_clean = str(isbn or "").replace("isbn:", "").replace("-", "").strip()
|
||||||
|
if not isbn_clean:
|
||||||
|
return []
|
||||||
|
|
||||||
|
url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json"
|
||||||
|
try:
|
||||||
|
with HTTPClient() as client:
|
||||||
|
response = client.get(url)
|
||||||
|
response.raise_for_status()
|
||||||
|
data = json_module.loads(response.content.decode("utf-8"))
|
||||||
|
except Exception as exc:
|
||||||
|
log(f"Failed to fetch ISBN metadata: {exc}", file=sys.stderr)
|
||||||
|
return []
|
||||||
|
|
||||||
|
if not data:
|
||||||
|
log(f"No ISBN metadata found for: {isbn}")
|
||||||
|
return []
|
||||||
|
|
||||||
|
book_data = next(iter(data.values()), None)
|
||||||
|
if not isinstance(book_data, dict):
|
||||||
|
return []
|
||||||
|
|
||||||
|
if "title" in book_data:
|
||||||
|
new_tags.append(f"title:{book_data['title']}")
|
||||||
|
|
||||||
|
authors = book_data.get("authors")
|
||||||
|
if isinstance(authors, list):
|
||||||
|
for author in authors[:3]:
|
||||||
|
if isinstance(author, dict) and author.get("name"):
|
||||||
|
new_tags.append(f"author:{author['name']}")
|
||||||
|
|
||||||
|
if book_data.get("publish_date"):
|
||||||
|
new_tags.append(f"publish_date:{book_data['publish_date']}")
|
||||||
|
|
||||||
|
publishers = book_data.get("publishers")
|
||||||
|
if isinstance(publishers, list) and publishers:
|
||||||
|
pub = publishers[0]
|
||||||
|
if isinstance(pub, dict) and pub.get("name"):
|
||||||
|
new_tags.append(f"publisher:{pub['name']}")
|
||||||
|
|
||||||
|
if "description" in book_data:
|
||||||
|
desc = book_data.get("description")
|
||||||
|
if isinstance(desc, dict) and "value" in desc:
|
||||||
|
desc = desc.get("value")
|
||||||
|
if desc:
|
||||||
|
desc_str = str(desc).strip()
|
||||||
|
if desc_str:
|
||||||
|
new_tags.append(f"description:{desc_str[:200]}")
|
||||||
|
|
||||||
|
page_count = book_data.get("number_of_pages")
|
||||||
|
if isinstance(page_count, int) and page_count > 0:
|
||||||
|
new_tags.append(f"pages:{page_count}")
|
||||||
|
|
||||||
|
identifiers = book_data.get("identifiers")
|
||||||
|
if isinstance(identifiers, dict):
|
||||||
|
|
||||||
|
def _first(value: Any) -> Any:
|
||||||
|
if isinstance(value, list) and value:
|
||||||
|
return value[0]
|
||||||
|
return value
|
||||||
|
|
||||||
|
for key, ns in (
|
||||||
|
("openlibrary", "openlibrary"),
|
||||||
|
("lccn", "lccn"),
|
||||||
|
("oclc", "oclc"),
|
||||||
|
("goodreads", "goodreads"),
|
||||||
|
("librarything", "librarything"),
|
||||||
|
("doi", "doi"),
|
||||||
|
("internet_archive", "internet_archive"),
|
||||||
|
):
|
||||||
|
val = _first(identifiers.get(key))
|
||||||
|
if val:
|
||||||
|
new_tags.append(f"{ns}:{val}")
|
||||||
|
|
||||||
|
log(f"Found {len(new_tags)} tag(s) from ISBN lookup")
|
||||||
|
return new_tags
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def scrape_openlibrary_metadata(olid: str) -> List[str]:
|
||||||
|
"""Scrape tags for an OpenLibrary ID using the .json API endpoint."""
|
||||||
|
new_tags: List[str] = []
|
||||||
|
|
||||||
|
olid_text = str(olid or "").strip()
|
||||||
|
if not olid_text:
|
||||||
|
return []
|
||||||
|
|
||||||
|
# Normalize OLID to the common "OL<digits>M" form when possible.
|
||||||
|
olid_norm = olid_text
|
||||||
|
try:
|
||||||
|
if not olid_norm.startswith("OL"):
|
||||||
|
olid_norm = f"OL{olid_norm}"
|
||||||
|
if not olid_norm.endswith("M"):
|
||||||
|
olid_norm = f"{olid_norm}M"
|
||||||
|
except Exception:
|
||||||
|
olid_norm = olid_text
|
||||||
|
|
||||||
|
# Ensure we always include a scrapeable identifier tag.
|
||||||
|
new_tags.append(f"openlibrary:{olid_norm}")
|
||||||
|
|
||||||
|
# Accept OL9674499M, 9674499M, or just digits.
|
||||||
|
olid_clean = olid_text.replace("OL", "").replace("M", "")
|
||||||
|
if not olid_clean.isdigit():
|
||||||
|
olid_clean = olid_text
|
||||||
|
|
||||||
|
if not olid_text.startswith("OL"):
|
||||||
|
url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
|
||||||
|
else:
|
||||||
|
url = f"https://openlibrary.org/books/{olid_text}.json"
|
||||||
|
|
||||||
|
try:
|
||||||
|
with HTTPClient() as client:
|
||||||
|
response = client.get(url)
|
||||||
|
response.raise_for_status()
|
||||||
|
data = json_module.loads(response.content.decode("utf-8"))
|
||||||
|
except Exception as exc:
|
||||||
|
log(f"Failed to fetch OpenLibrary metadata: {exc}", file=sys.stderr)
|
||||||
|
return []
|
||||||
|
|
||||||
|
if not isinstance(data, dict) or not data:
|
||||||
|
log(f"No OpenLibrary metadata found for: {olid_text}")
|
||||||
|
return []
|
||||||
|
|
||||||
|
if "title" in data:
|
||||||
|
new_tags.append(f"title:{data['title']}")
|
||||||
|
|
||||||
|
authors = data.get("authors")
|
||||||
|
if isinstance(authors, list):
|
||||||
|
for author in authors[:3]:
|
||||||
|
if isinstance(author, dict) and author.get("name"):
|
||||||
|
new_tags.append(f"author:{author['name']}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Common OL shape: {"key": "/authors/OL...A"} or {"author": {"key": ...}}
|
||||||
|
author_key = None
|
||||||
|
if isinstance(author, dict):
|
||||||
|
if isinstance(author.get("author"), dict):
|
||||||
|
author_key = author.get("author", {}).get("key")
|
||||||
|
if not author_key:
|
||||||
|
author_key = author.get("key")
|
||||||
|
|
||||||
|
if isinstance(author_key, str) and author_key.startswith("/"):
|
||||||
|
try:
|
||||||
|
author_url = f"https://openlibrary.org{author_key}.json"
|
||||||
|
with HTTPClient(timeout=10) as client:
|
||||||
|
author_resp = client.get(author_url)
|
||||||
|
author_resp.raise_for_status()
|
||||||
|
author_data = json_module.loads(author_resp.content.decode("utf-8"))
|
||||||
|
if isinstance(author_data, dict) and author_data.get("name"):
|
||||||
|
new_tags.append(f"author:{author_data['name']}")
|
||||||
|
continue
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
if isinstance(author, str) and author:
|
||||||
|
new_tags.append(f"author:{author}")
|
||||||
|
|
||||||
|
if data.get("publish_date"):
|
||||||
|
new_tags.append(f"publish_date:{data['publish_date']}")
|
||||||
|
|
||||||
|
publishers = data.get("publishers")
|
||||||
|
if isinstance(publishers, list) and publishers:
|
||||||
|
pub = publishers[0]
|
||||||
|
if isinstance(pub, dict) and pub.get("name"):
|
||||||
|
new_tags.append(f"publisher:{pub['name']}")
|
||||||
|
elif isinstance(pub, str) and pub:
|
||||||
|
new_tags.append(f"publisher:{pub}")
|
||||||
|
|
||||||
|
if "description" in data:
|
||||||
|
desc = data.get("description")
|
||||||
|
if isinstance(desc, dict) and "value" in desc:
|
||||||
|
desc = desc.get("value")
|
||||||
|
if desc:
|
||||||
|
desc_str = str(desc).strip()
|
||||||
|
if desc_str:
|
||||||
|
new_tags.append(f"description:{desc_str[:200]}")
|
||||||
|
|
||||||
|
page_count = data.get("number_of_pages")
|
||||||
|
if isinstance(page_count, int) and page_count > 0:
|
||||||
|
new_tags.append(f"pages:{page_count}")
|
||||||
|
|
||||||
|
subjects = data.get("subjects")
|
||||||
|
if isinstance(subjects, list):
|
||||||
|
for subject in subjects[:10]:
|
||||||
|
if isinstance(subject, str):
|
||||||
|
subject_clean = subject.strip()
|
||||||
|
if subject_clean and subject_clean not in new_tags:
|
||||||
|
new_tags.append(subject_clean)
|
||||||
|
|
||||||
|
identifiers = data.get("identifiers")
|
||||||
|
if isinstance(identifiers, dict):
|
||||||
|
|
||||||
|
def _first(value: Any) -> Any:
|
||||||
|
if isinstance(value, list) and value:
|
||||||
|
return value[0]
|
||||||
|
return value
|
||||||
|
|
||||||
|
for key, ns in (
|
||||||
|
("isbn_10", "isbn_10"),
|
||||||
|
("isbn_13", "isbn_13"),
|
||||||
|
("lccn", "lccn"),
|
||||||
|
("oclc_numbers", "oclc"),
|
||||||
|
("goodreads", "goodreads"),
|
||||||
|
("internet_archive", "internet_archive"),
|
||||||
|
):
|
||||||
|
val = _first(identifiers.get(key))
|
||||||
|
if val:
|
||||||
|
new_tags.append(f"{ns}:{val}")
|
||||||
|
|
||||||
|
# Some editions expose a direct Archive.org identifier as "ocaid".
|
||||||
|
ocaid = data.get("ocaid")
|
||||||
|
if isinstance(ocaid, str) and ocaid.strip():
|
||||||
|
new_tags.append(f"internet_archive:{ocaid.strip()}")
|
||||||
|
|
||||||
|
log(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup")
|
||||||
|
return new_tags
|
||||||
|
|
||||||
def search(
|
def search(
|
||||||
self,
|
self,
|
||||||
query: str,
|
query: str,
|
||||||
@@ -155,7 +697,70 @@ class OpenLibrary(SearchProvider):
|
|||||||
if not isinstance(docs, list):
|
if not isinstance(docs, list):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
for doc in docs[: int(limit)]:
|
# Availability enrichment can be slow if done sequentially (it may require multiple
|
||||||
|
# network calls per row). Do it concurrently to keep the pipeline responsive.
|
||||||
|
docs = docs[: int(limit)]
|
||||||
|
|
||||||
|
def _compute_availability(doc_dict: Dict[str, Any]) -> Tuple[str, str, str, str]:
|
||||||
|
edition_id_local = _resolve_edition_id(doc_dict)
|
||||||
|
if not edition_id_local:
|
||||||
|
return "no-olid", "", "", ""
|
||||||
|
|
||||||
|
ia_val_local = doc_dict.get("ia") or []
|
||||||
|
if isinstance(ia_val_local, str):
|
||||||
|
ia_val_local = [ia_val_local]
|
||||||
|
if not isinstance(ia_val_local, list):
|
||||||
|
ia_val_local = []
|
||||||
|
ia_ids_local = [str(x) for x in ia_val_local if x]
|
||||||
|
|
||||||
|
session_local = requests.Session()
|
||||||
|
|
||||||
|
try:
|
||||||
|
archive_id_local = _resolve_archive_id(session_local, edition_id_local, ia_ids_local)
|
||||||
|
except Exception:
|
||||||
|
archive_id_local = ""
|
||||||
|
|
||||||
|
if not archive_id_local:
|
||||||
|
return "no-archive", "", "", ""
|
||||||
|
|
||||||
|
# Prefer the fastest signal first: OpenLibrary lendable status.
|
||||||
|
lendable_local, reason_local = _check_lendable(session_local, edition_id_local)
|
||||||
|
if lendable_local:
|
||||||
|
return "borrow", reason_local, archive_id_local, ""
|
||||||
|
|
||||||
|
# Not lendable: check whether it's directly downloadable (public domain uploads, etc.).
|
||||||
|
try:
|
||||||
|
can_direct, pdf_url = self._archive_check_direct_download(archive_id_local)
|
||||||
|
if can_direct and pdf_url:
|
||||||
|
return "download", reason_local, archive_id_local, str(pdf_url)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return "unavailable", reason_local, archive_id_local, ""
|
||||||
|
|
||||||
|
availability_rows: List[Tuple[str, str, str, str]] = [("unknown", "", "", "") for _ in range(len(docs))]
|
||||||
|
if docs:
|
||||||
|
log(f"[openlibrary] Enriching availability for {len(docs)} result(s)...")
|
||||||
|
max_workers = min(8, max(1, len(docs)))
|
||||||
|
done = 0
|
||||||
|
with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||||
|
future_to_index = {
|
||||||
|
executor.submit(_compute_availability, doc_dict): i
|
||||||
|
for i, doc_dict in enumerate(docs)
|
||||||
|
if isinstance(doc_dict, dict)
|
||||||
|
}
|
||||||
|
for fut in futures.as_completed(list(future_to_index.keys())):
|
||||||
|
i = future_to_index[fut]
|
||||||
|
try:
|
||||||
|
availability_rows[i] = fut.result()
|
||||||
|
except Exception:
|
||||||
|
availability_rows[i] = ("unknown", "", "", "")
|
||||||
|
done += 1
|
||||||
|
if done in {1, len(future_to_index)} or (done % 10 == 0):
|
||||||
|
log(f"[openlibrary] Availability: {done}/{len(future_to_index)}")
|
||||||
|
log("[openlibrary] Availability enrichment complete")
|
||||||
|
|
||||||
|
for idx, doc in enumerate(docs):
|
||||||
if not isinstance(doc, dict):
|
if not isinstance(doc, dict):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -172,6 +777,7 @@ class OpenLibrary(SearchProvider):
|
|||||||
year = str(year_val) if year_val is not None else ""
|
year = str(year_val) if year_val is not None else ""
|
||||||
|
|
||||||
edition_id = _resolve_edition_id(doc)
|
edition_id = _resolve_edition_id(doc)
|
||||||
|
work_key = doc.get("key") if isinstance(doc.get("key"), str) else ""
|
||||||
|
|
||||||
ia_val = doc.get("ia") or []
|
ia_val = doc.get("ia") or []
|
||||||
if isinstance(ia_val, str):
|
if isinstance(ia_val, str):
|
||||||
@@ -193,9 +799,21 @@ class OpenLibrary(SearchProvider):
|
|||||||
("Title", book_title),
|
("Title", book_title),
|
||||||
("Author", ", ".join(authors_list)),
|
("Author", ", ".join(authors_list)),
|
||||||
("Year", year),
|
("Year", year),
|
||||||
|
("Avail", ""),
|
||||||
("OLID", edition_id),
|
("OLID", edition_id),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Determine availability using the concurrently computed enrichment.
|
||||||
|
availability, availability_reason, archive_id, direct_url = ("unknown", "", "", "")
|
||||||
|
if 0 <= idx < len(availability_rows):
|
||||||
|
availability, availability_reason, archive_id, direct_url = availability_rows[idx]
|
||||||
|
|
||||||
|
# Patch the display column.
|
||||||
|
for idx, (name, _val) in enumerate(columns):
|
||||||
|
if name == "Avail":
|
||||||
|
columns[idx] = ("Avail", availability)
|
||||||
|
break
|
||||||
|
|
||||||
annotations: List[str] = []
|
annotations: List[str] = []
|
||||||
if isbn_13:
|
if isbn_13:
|
||||||
annotations.append(f"isbn_13:{isbn_13}")
|
annotations.append(f"isbn_13:{isbn_13}")
|
||||||
@@ -203,12 +821,18 @@ class OpenLibrary(SearchProvider):
|
|||||||
annotations.append(f"isbn_10:{isbn_10}")
|
annotations.append(f"isbn_10:{isbn_10}")
|
||||||
if ia_ids:
|
if ia_ids:
|
||||||
annotations.append("archive")
|
annotations.append("archive")
|
||||||
|
if availability in {"download", "borrow"}:
|
||||||
|
annotations.append(availability)
|
||||||
|
|
||||||
results.append(
|
results.append(
|
||||||
SearchResult(
|
SearchResult(
|
||||||
table="openlibrary",
|
table="openlibrary",
|
||||||
title=book_title,
|
title=book_title,
|
||||||
path=(f"https://openlibrary.org/books/{edition_id}" if edition_id else "https://openlibrary.org"),
|
path=(
|
||||||
|
f"https://openlibrary.org/books/{edition_id}" if edition_id else (
|
||||||
|
f"https://openlibrary.org{work_key}" if isinstance(work_key, str) and work_key.startswith("/") else "https://openlibrary.org"
|
||||||
|
)
|
||||||
|
),
|
||||||
detail=(
|
detail=(
|
||||||
(f"By: {', '.join(authors_list)}" if authors_list else "")
|
(f"By: {', '.join(authors_list)}" if authors_list else "")
|
||||||
+ (f" ({year})" if year else "")
|
+ (f" ({year})" if year else "")
|
||||||
@@ -218,11 +842,16 @@ class OpenLibrary(SearchProvider):
|
|||||||
columns=columns,
|
columns=columns,
|
||||||
full_metadata={
|
full_metadata={
|
||||||
"openlibrary_id": edition_id,
|
"openlibrary_id": edition_id,
|
||||||
|
"openlibrary_key": work_key,
|
||||||
"authors": authors_list,
|
"authors": authors_list,
|
||||||
"year": year,
|
"year": year,
|
||||||
"isbn_10": isbn_10,
|
"isbn_10": isbn_10,
|
||||||
"isbn_13": isbn_13,
|
"isbn_13": isbn_13,
|
||||||
"ia": ia_ids,
|
"ia": ia_ids,
|
||||||
|
"availability": availability,
|
||||||
|
"availability_reason": availability_reason,
|
||||||
|
"archive_id": archive_id,
|
||||||
|
"direct_url": direct_url,
|
||||||
"raw": doc,
|
"raw": doc,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
@@ -256,9 +885,7 @@ class OpenLibrary(SearchProvider):
|
|||||||
|
|
||||||
# 1) Direct download if available.
|
# 1) Direct download if available.
|
||||||
try:
|
try:
|
||||||
from API.archive_client import check_direct_download
|
can_direct, pdf_url = self._archive_check_direct_download(archive_id)
|
||||||
|
|
||||||
can_direct, pdf_url = check_direct_download(archive_id)
|
|
||||||
except Exception:
|
except Exception:
|
||||||
can_direct, pdf_url = False, ""
|
can_direct, pdf_url = False, ""
|
||||||
|
|
||||||
@@ -272,10 +899,7 @@ class OpenLibrary(SearchProvider):
|
|||||||
|
|
||||||
# 2) Borrow flow (credentials required).
|
# 2) Borrow flow (credentials required).
|
||||||
try:
|
try:
|
||||||
from API.archive_client import BookNotAvailableError, credential_openlibrary, download as archive_download
|
email, password = self._credential_archive(self.config or {})
|
||||||
from API.archive_client import get_book_infos, loan, login
|
|
||||||
|
|
||||||
email, password = credential_openlibrary(self.config or {})
|
|
||||||
if not email or not password:
|
if not email or not password:
|
||||||
log("[openlibrary] Archive credentials missing; cannot borrow", file=sys.stderr)
|
log("[openlibrary] Archive credentials missing; cannot borrow", file=sys.stderr)
|
||||||
return None
|
return None
|
||||||
@@ -285,13 +909,13 @@ class OpenLibrary(SearchProvider):
|
|||||||
log(f"[openlibrary] Not lendable: {reason}", file=sys.stderr)
|
log(f"[openlibrary] Not lendable: {reason}", file=sys.stderr)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
session = login(email, password)
|
session = self._archive_login(email, password)
|
||||||
try:
|
try:
|
||||||
session = loan(session, archive_id, verbose=False)
|
session = self._archive_loan(session, archive_id, verbose=False)
|
||||||
except BookNotAvailableError:
|
except self.BookNotAvailableError:
|
||||||
log("[openlibrary] Book not available to borrow", file=sys.stderr)
|
log("[openlibrary] Book not available to borrow", file=sys.stderr)
|
||||||
return None
|
return None
|
||||||
except SystemExit:
|
except Exception:
|
||||||
log("[openlibrary] Borrow failed", file=sys.stderr)
|
log("[openlibrary] Borrow failed", file=sys.stderr)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -301,7 +925,7 @@ class OpenLibrary(SearchProvider):
|
|||||||
last_exc: Optional[Exception] = None
|
last_exc: Optional[Exception] = None
|
||||||
for u in urls:
|
for u in urls:
|
||||||
try:
|
try:
|
||||||
title_raw, links, _metadata = get_book_infos(session, u)
|
title_raw, links, _metadata = self._archive_get_book_infos(session, u)
|
||||||
if title_raw:
|
if title_raw:
|
||||||
title = sanitize_filename(title_raw)
|
title = sanitize_filename(title_raw)
|
||||||
break
|
break
|
||||||
@@ -315,7 +939,7 @@ class OpenLibrary(SearchProvider):
|
|||||||
|
|
||||||
temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=str(output_dir))
|
temp_dir = tempfile.mkdtemp(prefix=f"{title}_", dir=str(output_dir))
|
||||||
try:
|
try:
|
||||||
images = archive_download(session=session, n_threads=10, directory=temp_dir, links=links, scale=3, book_id=archive_id)
|
images = self._archive_download(session=session, n_threads=10, directory=temp_dir, links=links, scale=3, book_id=archive_id)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import img2pdf # type: ignore
|
import img2pdf # type: ignore
|
||||||
|
|||||||
@@ -642,7 +642,7 @@ def _download_direct_file(
|
|||||||
return DownloadMediaResult(
|
return DownloadMediaResult(
|
||||||
path=file_path,
|
path=file_path,
|
||||||
info=info,
|
info=info,
|
||||||
tags=tags,
|
tag=tags,
|
||||||
source_url=url,
|
source_url=url,
|
||||||
hash_value=hash_value,
|
hash_value=hash_value,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -36,6 +36,7 @@ mime_maps = {
|
|||||||
"mp3": { "ext": ".mp3", "mimes": ["audio/mpeg", "audio/mp3"] },
|
"mp3": { "ext": ".mp3", "mimes": ["audio/mpeg", "audio/mp3"] },
|
||||||
"m4a": { "ext": ".m4a", "mimes": ["audio/mp4", "audio/x-m4a"] },
|
"m4a": { "ext": ".m4a", "mimes": ["audio/mp4", "audio/x-m4a"] },
|
||||||
"ogg": { "ext": ".ogg", "mimes": ["audio/ogg"] },
|
"ogg": { "ext": ".ogg", "mimes": ["audio/ogg"] },
|
||||||
|
"opus": { "ext": ".opus", "mimes": ["audio/opus"] },
|
||||||
"flac": { "ext": ".flac", "mimes": ["audio/flac"] },
|
"flac": { "ext": ".flac", "mimes": ["audio/flac"] },
|
||||||
"wav": { "ext": ".wav", "mimes": ["audio/wav", "audio/x-wav", "audio/vnd.wave"] },
|
"wav": { "ext": ".wav", "mimes": ["audio/wav", "audio/x-wav", "audio/vnd.wave"] },
|
||||||
"wma": { "ext": ".wma", "mimes": ["audio/x-ms-wma"] },
|
"wma": { "ext": ".wma", "mimes": ["audio/x-ms-wma"] },
|
||||||
@@ -98,3 +99,13 @@ def get_type_from_ext(ext: str) -> str:
|
|||||||
return type_name
|
return type_name
|
||||||
|
|
||||||
return 'other'
|
return 'other'
|
||||||
|
|
||||||
|
|
||||||
|
# Canonical supported extension set for all stores/cmdlets.
|
||||||
|
# Derived from mime_maps so there is a single source of truth.
|
||||||
|
ALL_SUPPORTED_EXTENSIONS: set[str] = {
|
||||||
|
spec["ext"].lower()
|
||||||
|
for group in mime_maps.values()
|
||||||
|
for spec in group.values()
|
||||||
|
if isinstance(spec, dict) and isinstance(spec.get("ext"), str) and spec.get("ext")
|
||||||
|
}
|
||||||
|
|||||||
175
Store/Folder.py
175
Store/Folder.py
@@ -30,6 +30,8 @@ def _resolve_file_hash(db_hash: Optional[str], file_path: Path) -> Optional[str]
|
|||||||
return _normalize_hash(file_path.stem)
|
return _normalize_hash(file_path.stem)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class Folder(Store):
|
class Folder(Store):
|
||||||
""""""
|
""""""
|
||||||
# Track which locations have already been migrated to avoid repeated migrations
|
# Track which locations have already been migrated to avoid repeated migrations
|
||||||
@@ -360,6 +362,17 @@ class Folder(Store):
|
|||||||
shutil.copy2(str(file_path), str(save_file))
|
shutil.copy2(str(file_path), str(save_file))
|
||||||
debug(f"Local copy: {save_file}", file=sys.stderr)
|
debug(f"Local copy: {save_file}", file=sys.stderr)
|
||||||
|
|
||||||
|
# Best-effort: capture duration for media
|
||||||
|
duration_value: float | None = None
|
||||||
|
try:
|
||||||
|
from SYS.utils import ffprobe
|
||||||
|
probe = ffprobe(str(save_file))
|
||||||
|
duration = probe.get("duration")
|
||||||
|
if isinstance(duration, (int, float)) and duration > 0:
|
||||||
|
duration_value = float(duration)
|
||||||
|
except Exception:
|
||||||
|
duration_value = None
|
||||||
|
|
||||||
# Save to database
|
# Save to database
|
||||||
with API_folder_store(Path(self._location)) as db:
|
with API_folder_store(Path(self._location)) as db:
|
||||||
db.get_or_create_file_entry(save_file)
|
db.get_or_create_file_entry(save_file)
|
||||||
@@ -368,7 +381,8 @@ class Folder(Store):
|
|||||||
db.save_metadata(save_file, {
|
db.save_metadata(save_file, {
|
||||||
'hash': file_hash,
|
'hash': file_hash,
|
||||||
'ext': ext_clean,
|
'ext': ext_clean,
|
||||||
'size': file_path.stat().st_size
|
'size': file_path.stat().st_size,
|
||||||
|
'duration': duration_value,
|
||||||
})
|
})
|
||||||
|
|
||||||
# Add tags if provided
|
# Add tags if provided
|
||||||
@@ -405,6 +419,21 @@ class Folder(Store):
|
|||||||
results = []
|
results = []
|
||||||
search_dir = Path(self._location).expanduser()
|
search_dir = Path(self._location).expanduser()
|
||||||
|
|
||||||
|
def _url_like_pattern(value: str) -> str:
|
||||||
|
# Interpret user patterns as substring matches (with optional glob wildcards).
|
||||||
|
v = (value or "").strip().lower()
|
||||||
|
if not v or v == "*":
|
||||||
|
return "%"
|
||||||
|
v = v.replace("%", "\\%").replace("_", "\\_")
|
||||||
|
v = v.replace("*", "%").replace("?", "_")
|
||||||
|
if "%" not in v and "_" not in v:
|
||||||
|
return f"%{v}%"
|
||||||
|
if not v.startswith("%"):
|
||||||
|
v = "%" + v
|
||||||
|
if not v.endswith("%"):
|
||||||
|
v = v + "%"
|
||||||
|
return v
|
||||||
|
|
||||||
tokens = [t.strip() for t in query.split(',') if t.strip()]
|
tokens = [t.strip() for t in query.split(',') if t.strip()]
|
||||||
|
|
||||||
if not match_all and len(tokens) == 1 and _normalize_hash(query):
|
if not match_all and len(tokens) == 1 and _normalize_hash(query):
|
||||||
@@ -453,6 +482,8 @@ class Folder(Store):
|
|||||||
try:
|
try:
|
||||||
with DatabaseAPI(search_dir) as api:
|
with DatabaseAPI(search_dir) as api:
|
||||||
if tokens and len(tokens) > 1:
|
if tokens and len(tokens) > 1:
|
||||||
|
url_fetch_limit = (limit or 45) * 50
|
||||||
|
|
||||||
def _like_pattern(term: str) -> str:
|
def _like_pattern(term: str) -> str:
|
||||||
return term.replace('*', '%').replace('?', '_')
|
return term.replace('*', '%').replace('?', '_')
|
||||||
|
|
||||||
@@ -473,6 +504,11 @@ class Folder(Store):
|
|||||||
h = api.get_file_hash_by_hash(normalized_hash)
|
h = api.get_file_hash_by_hash(normalized_hash)
|
||||||
return {h} if h else set()
|
return {h} if h else set()
|
||||||
|
|
||||||
|
if namespace == 'url':
|
||||||
|
if not pattern or pattern == '*':
|
||||||
|
return api.get_file_hashes_with_any_url(limit=url_fetch_limit)
|
||||||
|
return api.get_file_hashes_by_url_like(_url_like_pattern(pattern), limit=url_fetch_limit)
|
||||||
|
|
||||||
if namespace == 'store':
|
if namespace == 'store':
|
||||||
if pattern not in {'local', 'file', 'filesystem'}:
|
if pattern not in {'local', 'file', 'filesystem'}:
|
||||||
return set()
|
return set()
|
||||||
@@ -563,6 +599,29 @@ class Folder(Store):
|
|||||||
return results
|
return results
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
if namespace == "url":
|
||||||
|
if not pattern or pattern == "*":
|
||||||
|
rows = api.get_files_with_any_url(limit)
|
||||||
|
else:
|
||||||
|
rows = api.get_files_by_url_like(_url_like_pattern(pattern), limit)
|
||||||
|
for file_hash, file_path_str, size_bytes, ext in rows:
|
||||||
|
if not file_path_str:
|
||||||
|
continue
|
||||||
|
file_path = Path(file_path_str)
|
||||||
|
if not file_path.exists():
|
||||||
|
continue
|
||||||
|
if size_bytes is None:
|
||||||
|
try:
|
||||||
|
size_bytes = file_path.stat().st_size
|
||||||
|
except OSError:
|
||||||
|
size_bytes = None
|
||||||
|
tags = api.get_tags_for_file(file_hash)
|
||||||
|
entry = _create_entry(file_path, tags, size_bytes, file_hash)
|
||||||
|
results.append(entry)
|
||||||
|
if limit is not None and len(results) >= limit:
|
||||||
|
return results
|
||||||
|
return results
|
||||||
|
|
||||||
query_pattern = f"{namespace}:%"
|
query_pattern = f"{namespace}:%"
|
||||||
rows = api.get_files_by_namespace_pattern(query_pattern, limit)
|
rows = api.get_files_by_namespace_pattern(query_pattern, limit)
|
||||||
debug(f"Found {len(rows)} potential matches in DB")
|
debug(f"Found {len(rows)} potential matches in DB")
|
||||||
@@ -592,84 +651,37 @@ class Folder(Store):
|
|||||||
if limit is not None and len(results) >= limit:
|
if limit is not None and len(results) >= limit:
|
||||||
return results
|
return results
|
||||||
elif not match_all:
|
elif not match_all:
|
||||||
|
# Strict tag-based search only (no filename/path searching).
|
||||||
terms = [t.strip() for t in query_lower.replace(',', ' ').split() if t.strip()]
|
terms = [t.strip() for t in query_lower.replace(',', ' ').split() if t.strip()]
|
||||||
if not terms:
|
if not terms:
|
||||||
terms = [query_lower]
|
terms = [query_lower]
|
||||||
|
|
||||||
debug(f"Performing filename/tag search for terms: {terms}")
|
|
||||||
|
|
||||||
fetch_limit = (limit or 45) * 50
|
fetch_limit = (limit or 45) * 50
|
||||||
|
|
||||||
conditions = ["LOWER(f.file_path) LIKE ?" for _ in terms]
|
# AND semantics across terms: each term must match at least one tag.
|
||||||
params = [f"%{t}%" for t in terms]
|
hits: dict[str, dict[str, Any]] = {}
|
||||||
|
|
||||||
rows = api.get_files_by_multiple_path_conditions(conditions, params, fetch_limit)
|
|
||||||
debug(f"Found {len(rows)} filename matches in DB (before whole-word filter)")
|
|
||||||
|
|
||||||
word_regex = None
|
|
||||||
if len(terms) == 1:
|
|
||||||
term = terms[0]
|
|
||||||
has_wildcard = '*' in term or '?' in term
|
|
||||||
|
|
||||||
if has_wildcard:
|
|
||||||
try:
|
|
||||||
from fnmatch import translate
|
|
||||||
word_regex = re.compile(translate(term), re.IGNORECASE)
|
|
||||||
except Exception:
|
|
||||||
word_regex = None
|
|
||||||
else:
|
|
||||||
try:
|
|
||||||
pattern = r'(?<![a-zA-Z0-9])' + re.escape(term) + r'(?![a-zA-Z0-9])'
|
|
||||||
word_regex = re.compile(pattern, re.IGNORECASE)
|
|
||||||
except Exception:
|
|
||||||
word_regex = None
|
|
||||||
|
|
||||||
seen_files = set()
|
|
||||||
for file_id, file_path_str, size_bytes, file_hash in rows:
|
|
||||||
if not file_path_str or file_path_str in seen_files:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if word_regex:
|
|
||||||
p = Path(file_path_str)
|
|
||||||
if not word_regex.search(p.name):
|
|
||||||
continue
|
|
||||||
seen_files.add(file_path_str)
|
|
||||||
|
|
||||||
file_path = Path(file_path_str)
|
|
||||||
if file_path.exists():
|
|
||||||
if size_bytes is None:
|
|
||||||
size_bytes = file_path.stat().st_size
|
|
||||||
|
|
||||||
tags = api.get_tags_for_file(file_hash)
|
|
||||||
entry = _create_entry(file_path, tags, size_bytes, file_hash)
|
|
||||||
results.append(entry)
|
|
||||||
if limit is not None and len(results) >= limit:
|
|
||||||
return results
|
|
||||||
|
|
||||||
if terms:
|
|
||||||
title_hits: dict[str, dict[str, Any]] = {}
|
|
||||||
for term in terms:
|
for term in terms:
|
||||||
title_pattern = f"title:%{term}%"
|
tag_pattern = f"%{term}%"
|
||||||
title_rows = api.get_files_by_title_tag_pattern(title_pattern, fetch_limit)
|
term_rows = api.get_files_by_namespace_pattern(tag_pattern, fetch_limit)
|
||||||
for file_hash, file_path_str, size_bytes, ext in title_rows:
|
for file_hash, file_path_str, size_bytes, ext in term_rows:
|
||||||
if not file_path_str:
|
if not file_path_str:
|
||||||
continue
|
continue
|
||||||
entry = title_hits.get(file_hash)
|
entry = hits.get(file_hash)
|
||||||
if entry:
|
if entry:
|
||||||
entry["count"] += 1
|
entry["count"] += 1
|
||||||
if size_bytes is not None:
|
if size_bytes is not None:
|
||||||
entry["size"] = size_bytes
|
entry["size"] = size_bytes
|
||||||
else:
|
else:
|
||||||
title_hits[file_hash] = {
|
hits[file_hash] = {
|
||||||
"path": file_path_str,
|
"path": file_path_str,
|
||||||
"size": size_bytes,
|
"size": size_bytes,
|
||||||
"hash": file_hash,
|
"hash": file_hash,
|
||||||
"count": 1,
|
"count": 1,
|
||||||
}
|
}
|
||||||
|
|
||||||
if title_hits:
|
|
||||||
required = len(terms)
|
required = len(terms)
|
||||||
for file_hash, info in title_hits.items():
|
seen_files: set[str] = set()
|
||||||
|
for file_hash, info in hits.items():
|
||||||
if info.get("count") != required:
|
if info.get("count") != required:
|
||||||
continue
|
continue
|
||||||
file_path_str = info.get("path")
|
file_path_str = info.get("path")
|
||||||
@@ -688,30 +700,10 @@ class Folder(Store):
|
|||||||
size_bytes = None
|
size_bytes = None
|
||||||
|
|
||||||
tags = api.get_tags_for_file(file_hash)
|
tags = api.get_tags_for_file(file_hash)
|
||||||
entry = _create_entry(file_path, tags, size_bytes, info.get("hash"))
|
entry_obj = _create_entry(file_path, tags, size_bytes, info.get("hash"))
|
||||||
results.append(entry)
|
results.append(entry_obj)
|
||||||
if limit is not None and len(results) >= limit:
|
if limit is not None and len(results) >= limit:
|
||||||
return results
|
break
|
||||||
|
|
||||||
query_pattern = f"%{query_lower}%"
|
|
||||||
tag_rows = api.get_files_by_simple_tag_pattern(query_pattern, limit)
|
|
||||||
|
|
||||||
for file_hash, file_path_str, size_bytes, ext in tag_rows:
|
|
||||||
if not file_path_str or file_path_str in seen_files:
|
|
||||||
continue
|
|
||||||
seen_files.add(file_path_str)
|
|
||||||
|
|
||||||
file_path = Path(file_path_str)
|
|
||||||
if file_path.exists():
|
|
||||||
if size_bytes is None:
|
|
||||||
size_bytes = file_path.stat().st_size
|
|
||||||
|
|
||||||
tags = api.get_tags_for_file(file_hash)
|
|
||||||
entry = _create_entry(file_path, tags, size_bytes, file_hash)
|
|
||||||
results.append(entry)
|
|
||||||
|
|
||||||
if limit is not None and len(results) >= limit:
|
|
||||||
return results
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
rows = api.get_all_files(limit)
|
rows = api.get_all_files(limit)
|
||||||
@@ -726,10 +718,8 @@ class Folder(Store):
|
|||||||
entry = _create_entry(file_path, tags, size_bytes, file_hash)
|
entry = _create_entry(file_path, tags, size_bytes, file_hash)
|
||||||
results.append(entry)
|
results.append(entry)
|
||||||
|
|
||||||
if results:
|
backend_label = str(getattr(self, "_name", "") or getattr(self, "NAME", "") or "folder")
|
||||||
debug(f"Returning {len(results)} results from DB")
|
debug(f"[folder:{backend_label}] {len(results)} result(s)")
|
||||||
else:
|
|
||||||
debug("No results found in DB")
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -938,9 +928,11 @@ class Folder(Store):
|
|||||||
file_hash = file_identifier
|
file_hash = file_identifier
|
||||||
if self._location:
|
if self._location:
|
||||||
try:
|
try:
|
||||||
|
from metadata import normalize_urls
|
||||||
with API_folder_store(Path(self._location)) as db:
|
with API_folder_store(Path(self._location)) as db:
|
||||||
meta = db.get_metadata(file_hash) or {}
|
meta = db.get_metadata(file_hash) or {}
|
||||||
return list(meta.get("url") or [])
|
urls = normalize_urls(meta.get("url"))
|
||||||
|
return urls
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
debug(f"Local DB get_metadata failed: {exc}")
|
debug(f"Local DB get_metadata failed: {exc}")
|
||||||
return []
|
return []
|
||||||
@@ -955,11 +947,13 @@ class Folder(Store):
|
|||||||
file_hash = file_identifier
|
file_hash = file_identifier
|
||||||
if self._location:
|
if self._location:
|
||||||
try:
|
try:
|
||||||
|
from metadata import normalize_urls
|
||||||
with API_folder_store(Path(self._location)) as db:
|
with API_folder_store(Path(self._location)) as db:
|
||||||
meta = db.get_metadata(file_hash) or {}
|
meta = db.get_metadata(file_hash) or {}
|
||||||
existing_urls = list(meta.get("url") or [])
|
existing_urls = normalize_urls(meta.get("url"))
|
||||||
|
incoming_urls = normalize_urls(url)
|
||||||
changed = False
|
changed = False
|
||||||
for u in list(url or []):
|
for u in list(incoming_urls or []):
|
||||||
if not u:
|
if not u:
|
||||||
continue
|
continue
|
||||||
if u not in existing_urls:
|
if u not in existing_urls:
|
||||||
@@ -982,10 +976,11 @@ class Folder(Store):
|
|||||||
file_hash = file_identifier
|
file_hash = file_identifier
|
||||||
if self._location:
|
if self._location:
|
||||||
try:
|
try:
|
||||||
|
from metadata import normalize_urls
|
||||||
with API_folder_store(Path(self._location)) as db:
|
with API_folder_store(Path(self._location)) as db:
|
||||||
meta = db.get_metadata(file_hash) or {}
|
meta = db.get_metadata(file_hash) or {}
|
||||||
existing_urls = list(meta.get("url") or [])
|
existing_urls = normalize_urls(meta.get("url"))
|
||||||
remove_set = {u for u in (url or []) if u}
|
remove_set = {u for u in normalize_urls(url) if u}
|
||||||
if not remove_set:
|
if not remove_set:
|
||||||
return False
|
return False
|
||||||
new_urls = [u for u in existing_urls if u not in remove_set]
|
new_urls = [u for u in existing_urls if u not in remove_set]
|
||||||
|
|||||||
@@ -264,6 +264,170 @@ class HydrusNetwork(Store):
|
|||||||
|
|
||||||
debug(f"Searching Hydrus for: {query}")
|
debug(f"Searching Hydrus for: {query}")
|
||||||
|
|
||||||
|
def _extract_urls(meta_obj: Any) -> list[str]:
|
||||||
|
if not isinstance(meta_obj, dict):
|
||||||
|
return []
|
||||||
|
raw = meta_obj.get("url")
|
||||||
|
if raw is None:
|
||||||
|
raw = meta_obj.get("urls")
|
||||||
|
if isinstance(raw, str):
|
||||||
|
val = raw.strip()
|
||||||
|
return [val] if val else []
|
||||||
|
if isinstance(raw, list):
|
||||||
|
out: list[str] = []
|
||||||
|
for item in raw:
|
||||||
|
if not isinstance(item, str):
|
||||||
|
continue
|
||||||
|
s = item.strip()
|
||||||
|
if s:
|
||||||
|
out.append(s)
|
||||||
|
return out
|
||||||
|
return []
|
||||||
|
|
||||||
|
def _iter_url_filtered_metadata(url_value: str | None, want_any: bool, fetch_limit: int) -> list[dict[str, Any]]:
|
||||||
|
"""Best-effort URL search by scanning Hydrus metadata with include_file_url=True."""
|
||||||
|
|
||||||
|
# First try a fast system predicate if Hydrus supports it.
|
||||||
|
candidate_file_ids: list[int] = []
|
||||||
|
try:
|
||||||
|
if want_any:
|
||||||
|
predicate = "system:has url"
|
||||||
|
url_search = client.search_files(
|
||||||
|
tags=[predicate],
|
||||||
|
return_hashes=False,
|
||||||
|
return_file_ids=True,
|
||||||
|
return_file_count=False,
|
||||||
|
)
|
||||||
|
ids = url_search.get("file_ids", []) if isinstance(url_search, dict) else []
|
||||||
|
if isinstance(ids, list):
|
||||||
|
candidate_file_ids = [int(x) for x in ids if isinstance(x, (int, float, str)) and str(x).strip().isdigit()]
|
||||||
|
except Exception:
|
||||||
|
candidate_file_ids = []
|
||||||
|
|
||||||
|
if not candidate_file_ids:
|
||||||
|
# Fallback: scan from system:everything and filter by URL substring.
|
||||||
|
everything = client.search_files(
|
||||||
|
tags=["system:everything"],
|
||||||
|
return_hashes=False,
|
||||||
|
return_file_ids=True,
|
||||||
|
return_file_count=False,
|
||||||
|
)
|
||||||
|
ids = everything.get("file_ids", []) if isinstance(everything, dict) else []
|
||||||
|
if isinstance(ids, list):
|
||||||
|
candidate_file_ids = [int(x) for x in ids if isinstance(x, (int, float))]
|
||||||
|
|
||||||
|
if not candidate_file_ids:
|
||||||
|
return []
|
||||||
|
|
||||||
|
needle = (url_value or "").strip().lower()
|
||||||
|
chunk_size = 200
|
||||||
|
out: list[dict[str, Any]] = []
|
||||||
|
|
||||||
|
for start in range(0, len(candidate_file_ids), chunk_size):
|
||||||
|
if len(out) >= fetch_limit:
|
||||||
|
break
|
||||||
|
chunk = candidate_file_ids[start : start + chunk_size]
|
||||||
|
try:
|
||||||
|
payload = client.fetch_file_metadata(
|
||||||
|
file_ids=chunk,
|
||||||
|
include_file_url=True,
|
||||||
|
include_service_keys_to_tags=True,
|
||||||
|
include_duration=True,
|
||||||
|
include_size=True,
|
||||||
|
include_mime=True,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
|
||||||
|
metas = payload.get("metadata", []) if isinstance(payload, dict) else []
|
||||||
|
if not isinstance(metas, list):
|
||||||
|
continue
|
||||||
|
|
||||||
|
for meta in metas:
|
||||||
|
if not isinstance(meta, dict):
|
||||||
|
continue
|
||||||
|
urls = _extract_urls(meta)
|
||||||
|
if not urls:
|
||||||
|
continue
|
||||||
|
if want_any:
|
||||||
|
out.append(meta)
|
||||||
|
if len(out) >= fetch_limit:
|
||||||
|
break
|
||||||
|
continue
|
||||||
|
|
||||||
|
if not needle:
|
||||||
|
continue
|
||||||
|
if any(needle in u.lower() for u in urls):
|
||||||
|
out.append(meta)
|
||||||
|
if len(out) >= fetch_limit:
|
||||||
|
break
|
||||||
|
|
||||||
|
return out
|
||||||
|
|
||||||
|
query_lower = query.lower().strip()
|
||||||
|
|
||||||
|
# Special case: url:* and url:<value>
|
||||||
|
metadata_list: list[dict[str, Any]] | None = None
|
||||||
|
if ":" in query_lower and not query_lower.startswith(":"):
|
||||||
|
namespace, pattern = query_lower.split(":", 1)
|
||||||
|
namespace = namespace.strip().lower()
|
||||||
|
pattern = pattern.strip()
|
||||||
|
if namespace == "url":
|
||||||
|
if not pattern or pattern == "*":
|
||||||
|
metadata_list = _iter_url_filtered_metadata(None, want_any=True, fetch_limit=int(limit) if limit else 100)
|
||||||
|
else:
|
||||||
|
# Fast-path: exact URL via /add_url/get_url_files when a full URL is provided.
|
||||||
|
try:
|
||||||
|
if pattern.startswith("http://") or pattern.startswith("https://"):
|
||||||
|
from API.HydrusNetwork import HydrusRequestSpec
|
||||||
|
|
||||||
|
spec = HydrusRequestSpec(method="GET", endpoint="/add_url/get_url_files", query={"url": pattern})
|
||||||
|
response = client._perform_request(spec) # type: ignore[attr-defined]
|
||||||
|
hashes: list[str] = []
|
||||||
|
file_ids: list[int] = []
|
||||||
|
if isinstance(response, dict):
|
||||||
|
raw_hashes = response.get("hashes") or response.get("file_hashes")
|
||||||
|
if isinstance(raw_hashes, list):
|
||||||
|
hashes = [str(h).strip() for h in raw_hashes if isinstance(h, str) and str(h).strip()]
|
||||||
|
raw_ids = response.get("file_ids")
|
||||||
|
if isinstance(raw_ids, list):
|
||||||
|
for item in raw_ids:
|
||||||
|
try:
|
||||||
|
file_ids.append(int(item))
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
continue
|
||||||
|
|
||||||
|
if file_ids:
|
||||||
|
payload = client.fetch_file_metadata(
|
||||||
|
file_ids=file_ids,
|
||||||
|
include_file_url=True,
|
||||||
|
include_service_keys_to_tags=True,
|
||||||
|
include_duration=True,
|
||||||
|
include_size=True,
|
||||||
|
include_mime=True,
|
||||||
|
)
|
||||||
|
metas = payload.get("metadata", []) if isinstance(payload, dict) else []
|
||||||
|
if isinstance(metas, list):
|
||||||
|
metadata_list = [m for m in metas if isinstance(m, dict)]
|
||||||
|
elif hashes:
|
||||||
|
payload = client.fetch_file_metadata(
|
||||||
|
hashes=hashes,
|
||||||
|
include_file_url=True,
|
||||||
|
include_service_keys_to_tags=True,
|
||||||
|
include_duration=True,
|
||||||
|
include_size=True,
|
||||||
|
include_mime=True,
|
||||||
|
)
|
||||||
|
metas = payload.get("metadata", []) if isinstance(payload, dict) else []
|
||||||
|
if isinstance(metas, list):
|
||||||
|
metadata_list = [m for m in metas if isinstance(m, dict)]
|
||||||
|
except Exception:
|
||||||
|
metadata_list = None
|
||||||
|
|
||||||
|
# Fallback: substring scan
|
||||||
|
if metadata_list is None:
|
||||||
|
metadata_list = _iter_url_filtered_metadata(pattern, want_any=False, fetch_limit=int(limit) if limit else 100)
|
||||||
|
|
||||||
# Parse the query into tags
|
# Parse the query into tags
|
||||||
# Handle both simple tags and complex queries
|
# Handle both simple tags and complex queries
|
||||||
# "*" means "match all" - use system:everything tag in Hydrus
|
# "*" means "match all" - use system:everything tag in Hydrus
|
||||||
@@ -271,7 +435,6 @@ class HydrusNetwork(Store):
|
|||||||
# Use system:everything to match all files in Hydrus
|
# Use system:everything to match all files in Hydrus
|
||||||
tags = ["system:everything"]
|
tags = ["system:everything"]
|
||||||
else:
|
else:
|
||||||
query_lower = query.lower().strip()
|
|
||||||
# If query doesn't have a namespace (no ':'), search all files and filter by title/tags
|
# If query doesn't have a namespace (no ':'), search all files and filter by title/tags
|
||||||
# If query has explicit namespace, use it as a tag search
|
# If query has explicit namespace, use it as a tag search
|
||||||
if ':' not in query_lower:
|
if ':' not in query_lower:
|
||||||
@@ -286,30 +449,36 @@ class HydrusNetwork(Store):
|
|||||||
debug(f"Found 0 result(s)")
|
debug(f"Found 0 result(s)")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# Search files with the tags
|
# Search files with the tags (unless url: search already produced metadata)
|
||||||
|
results = []
|
||||||
|
# Split by comma or space for AND logic
|
||||||
|
search_terms = set(query_lower.replace(',', ' ').split()) # For substring matching
|
||||||
|
|
||||||
|
if metadata_list is None:
|
||||||
search_result = client.search_files(
|
search_result = client.search_files(
|
||||||
tags=tags,
|
tags=tags,
|
||||||
return_hashes=True,
|
return_hashes=True,
|
||||||
return_file_ids=True
|
return_file_ids=True
|
||||||
)
|
)
|
||||||
|
|
||||||
# Extract file IDs from search result
|
file_ids = search_result.get("file_ids", []) if isinstance(search_result, dict) else []
|
||||||
file_ids = search_result.get("file_ids", [])
|
hashes = search_result.get("hashes", []) if isinstance(search_result, dict) else []
|
||||||
hashes = search_result.get("hashes", [])
|
|
||||||
|
|
||||||
if not file_ids and not hashes:
|
if not file_ids and not hashes:
|
||||||
debug(f"Found 0 result(s)")
|
debug(f"Found 0 result(s)")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# Fetch metadata for the found files
|
|
||||||
results = []
|
|
||||||
query_lower = query.lower().strip()
|
|
||||||
# Split by comma or space for AND logic
|
|
||||||
search_terms = set(query_lower.replace(',', ' ').split()) # For substring matching
|
|
||||||
|
|
||||||
if file_ids:
|
if file_ids:
|
||||||
metadata = client.fetch_file_metadata(file_ids=file_ids)
|
metadata = client.fetch_file_metadata(file_ids=file_ids)
|
||||||
metadata_list = metadata.get("metadata", [])
|
metadata_list = metadata.get("metadata", [])
|
||||||
|
elif hashes:
|
||||||
|
metadata = client.fetch_file_metadata(hashes=hashes)
|
||||||
|
metadata_list = metadata.get("metadata", [])
|
||||||
|
else:
|
||||||
|
metadata_list = []
|
||||||
|
|
||||||
|
if not isinstance(metadata_list, list):
|
||||||
|
metadata_list = []
|
||||||
|
|
||||||
for meta in metadata_list:
|
for meta in metadata_list:
|
||||||
if len(results) >= limit:
|
if len(results) >= limit:
|
||||||
|
|||||||
@@ -119,6 +119,37 @@ class Store:
|
|||||||
self._backend_errors: Dict[str, str] = {}
|
self._backend_errors: Dict[str, str] = {}
|
||||||
self._load_backends()
|
self._load_backends()
|
||||||
|
|
||||||
|
def _maybe_register_temp_alias(self, store_type: str, backend_name: str, kwargs: Dict[str, Any], backend: BaseStore) -> None:
|
||||||
|
"""If a folder backend points at config['temp'], also expose it as the 'temp' backend.
|
||||||
|
|
||||||
|
This keeps config compatibility (e.g. existing 'default') while presenting the temp
|
||||||
|
directory under a clearer name.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
if _normalize_store_type(store_type) != "folder":
|
||||||
|
return
|
||||||
|
temp_value = self._config.get("temp")
|
||||||
|
if not temp_value:
|
||||||
|
return
|
||||||
|
path_value = kwargs.get("PATH") or kwargs.get("path")
|
||||||
|
if not path_value:
|
||||||
|
return
|
||||||
|
|
||||||
|
temp_path = Path(str(temp_value)).expanduser().resolve()
|
||||||
|
backend_path = Path(str(path_value)).expanduser().resolve()
|
||||||
|
if backend_path != temp_path:
|
||||||
|
return
|
||||||
|
|
||||||
|
# If the user already has a dedicated temp backend, do nothing.
|
||||||
|
if "temp" in self._backends:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Keep original name working, but add an alias.
|
||||||
|
if backend_name != "temp":
|
||||||
|
self._backends["temp"] = backend
|
||||||
|
except Exception:
|
||||||
|
return
|
||||||
|
|
||||||
def _load_backends(self) -> None:
|
def _load_backends(self) -> None:
|
||||||
store_cfg = self._config.get("store")
|
store_cfg = self._config.get("store")
|
||||||
if not isinstance(store_cfg, dict):
|
if not isinstance(store_cfg, dict):
|
||||||
@@ -161,6 +192,9 @@ class Store:
|
|||||||
|
|
||||||
backend_name = str(kwargs.get("NAME") or instance_name)
|
backend_name = str(kwargs.get("NAME") or instance_name)
|
||||||
self._backends[backend_name] = backend
|
self._backends[backend_name] = backend
|
||||||
|
|
||||||
|
# If this is the configured temp directory, also alias it as 'temp'.
|
||||||
|
self._maybe_register_temp_alias(store_type, backend_name, kwargs, backend)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
err_text = str(exc)
|
err_text = str(exc)
|
||||||
self._backend_errors[str(instance_name)] = err_text
|
self._backend_errors[str(instance_name)] = err_text
|
||||||
@@ -177,11 +211,24 @@ class Store:
|
|||||||
return sorted(self._backends.keys())
|
return sorted(self._backends.keys())
|
||||||
|
|
||||||
def list_searchable_backends(self) -> list[str]:
|
def list_searchable_backends(self) -> list[str]:
|
||||||
searchable: list[str] = []
|
# De-duplicate backends by instance (aliases can point at the same object).
|
||||||
|
def _rank(name: str) -> int:
|
||||||
|
n = str(name or "").strip().lower()
|
||||||
|
if n == "temp":
|
||||||
|
return 0
|
||||||
|
if n == "default":
|
||||||
|
return 2
|
||||||
|
return 1
|
||||||
|
|
||||||
|
chosen: Dict[int, str] = {}
|
||||||
for name, backend in self._backends.items():
|
for name, backend in self._backends.items():
|
||||||
if type(backend).search is not BaseStore.search:
|
if type(backend).search is BaseStore.search:
|
||||||
searchable.append(name)
|
continue
|
||||||
return sorted(searchable)
|
key = id(backend)
|
||||||
|
prev = chosen.get(key)
|
||||||
|
if prev is None or _rank(name) < _rank(prev):
|
||||||
|
chosen[key] = name
|
||||||
|
return sorted(chosen.values())
|
||||||
|
|
||||||
def __getitem__(self, backend_name: str) -> BaseStore:
|
def __getitem__(self, backend_name: str) -> BaseStore:
|
||||||
if backend_name not in self._backends:
|
if backend_name not in self._backends:
|
||||||
|
|||||||
@@ -5,10 +5,9 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import json
|
import json
|
||||||
import sys
|
import sys
|
||||||
import inspect
|
|
||||||
from collections.abc import Iterable as IterableABC
|
from collections.abc import Iterable as IterableABC
|
||||||
|
|
||||||
from SYS.logger import log, debug
|
from SYS.logger import log
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set
|
from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
@@ -690,7 +689,9 @@ def get_field(obj: Any, field: str, default: Optional[Any] = None) -> Any:
|
|||||||
get_field(result, "table", "unknown") # With default
|
get_field(result, "table", "unknown") # With default
|
||||||
"""
|
"""
|
||||||
# Handle lists by accessing the first element
|
# Handle lists by accessing the first element
|
||||||
if isinstance(obj, list) and obj:
|
if isinstance(obj, list):
|
||||||
|
if not obj:
|
||||||
|
return default
|
||||||
obj = obj[0]
|
obj = obj[0]
|
||||||
|
|
||||||
if isinstance(obj, dict):
|
if isinstance(obj, dict):
|
||||||
@@ -702,8 +703,9 @@ def get_field(obj: Any, field: str, default: Optional[Any] = None) -> Any:
|
|||||||
return value
|
return value
|
||||||
|
|
||||||
# For PipeObjects, also check the extra field
|
# For PipeObjects, also check the extra field
|
||||||
if hasattr(obj, 'extra') and isinstance(obj.extra, dict):
|
extra_val = getattr(obj, 'extra', None)
|
||||||
return obj.extra.get(field, default)
|
if isinstance(extra_val, dict):
|
||||||
|
return extra_val.get(field, default)
|
||||||
|
|
||||||
return default
|
return default
|
||||||
|
|
||||||
@@ -1118,7 +1120,7 @@ def create_pipe_object_result(
|
|||||||
Returns:
|
Returns:
|
||||||
Dict with all PipeObject fields for emission
|
Dict with all PipeObject fields for emission
|
||||||
"""
|
"""
|
||||||
result = {
|
result: Dict[str, Any] = {
|
||||||
'source': source,
|
'source': source,
|
||||||
'id': identifier,
|
'id': identifier,
|
||||||
'path': file_path,
|
'path': file_path,
|
||||||
@@ -1546,14 +1548,11 @@ def coerce_to_pipe_object(value: Any, default_path: Optional[str] = None) -> mod
|
|||||||
extra = {k: v for k, v in value.items() if k not in known_keys}
|
extra = {k: v for k, v in value.items() if k not in known_keys}
|
||||||
|
|
||||||
# Extract URL: prefer direct url field, then url list
|
# Extract URL: prefer direct url field, then url list
|
||||||
url_val = value.get("url")
|
from metadata import normalize_urls
|
||||||
if not url_val:
|
url_list = normalize_urls(value.get("url"))
|
||||||
url = value.get("url") or value.get("url") or []
|
url_val = url_list[0] if url_list else None
|
||||||
if url and isinstance(url, list) and len(url) > 0:
|
if len(url_list) > 1:
|
||||||
url_val = url[0]
|
extra["url"] = url_list
|
||||||
# Preserve url in extra if multiple url exist
|
|
||||||
if url and len(url) > 1:
|
|
||||||
extra["url"] = url
|
|
||||||
|
|
||||||
# Extract relationships
|
# Extract relationships
|
||||||
rels = value.get("relationships") or {}
|
rels = value.get("relationships") or {}
|
||||||
|
|||||||
@@ -1,14 +1,16 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from typing import Any, Dict, Optional, Sequence, Tuple, List, Union
|
from typing import Any, Dict, Optional, Sequence, Tuple, List
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import sys
|
import sys
|
||||||
import shutil
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
|
||||||
import models
|
import models
|
||||||
import pipeline as ctx
|
import pipeline as ctx
|
||||||
from API import HydrusNetwork as hydrus_wrapper
|
from API import HydrusNetwork as hydrus_wrapper
|
||||||
from SYS.logger import log, debug
|
from SYS.logger import log, debug
|
||||||
|
from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS
|
||||||
from Store import Store
|
from Store import Store
|
||||||
from ._shared import (
|
from ._shared import (
|
||||||
Cmdlet, CmdletArg, parse_cmdlet_args, SharedArgs,
|
Cmdlet, CmdletArg, parse_cmdlet_args, SharedArgs,
|
||||||
@@ -20,8 +22,8 @@ from API.folder import read_sidecar, find_sidecar, write_sidecar, API_folder_sto
|
|||||||
from SYS.utils import sha256_file, unique_path
|
from SYS.utils import sha256_file, unique_path
|
||||||
from metadata import write_metadata
|
from metadata import write_metadata
|
||||||
|
|
||||||
# Use official Hydrus supported filetypes from hydrus_wrapper
|
# Canonical supported filetypes for all stores/cmdlets
|
||||||
SUPPORTED_MEDIA_EXTENSIONS = hydrus_wrapper.ALL_SUPPORTED_EXTENSIONS
|
SUPPORTED_MEDIA_EXTENSIONS = ALL_SUPPORTED_EXTENSIONS
|
||||||
|
|
||||||
class Add_File(Cmdlet):
|
class Add_File(Cmdlet):
|
||||||
"""Add file into the DB"""
|
"""Add file into the DB"""
|
||||||
@@ -53,92 +55,209 @@ class Add_File(Cmdlet):
|
|||||||
|
|
||||||
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||||
"""Main execution entry point."""
|
"""Main execution entry point."""
|
||||||
# Parse arguments
|
|
||||||
parsed = parse_cmdlet_args(args, self)
|
parsed = parse_cmdlet_args(args, self)
|
||||||
|
|
||||||
# Initialize state
|
|
||||||
path_arg = parsed.get("path")
|
path_arg = parsed.get("path")
|
||||||
location = parsed.get("store") # Fixed: was "storage", should be "store"
|
location = parsed.get("store")
|
||||||
provider_name = parsed.get("provider")
|
provider_name = parsed.get("provider")
|
||||||
delete_after = parsed.get("delete", False)
|
delete_after = parsed.get("delete", False)
|
||||||
|
|
||||||
# Coerce result to PipeObject; if result is a list, prefer the first element
|
stage_ctx = ctx.get_stage_context()
|
||||||
effective_result = result
|
is_last_stage = (stage_ctx is None) or bool(getattr(stage_ctx, "is_last_stage", False))
|
||||||
if isinstance(result, list) and result:
|
|
||||||
first_item = result[0]
|
# Decide which items to process.
|
||||||
# Prefer first item if it's a dict or PipeObject
|
# - If user provided -path, treat this invocation as single-item.
|
||||||
if isinstance(first_item, (dict, )):
|
# - Otherwise, if piped input is a list, ingest each item.
|
||||||
effective_result = first_item
|
if path_arg:
|
||||||
pipe_obj = coerce_to_pipe_object(effective_result, path_arg)
|
items_to_process: List[Any] = [result]
|
||||||
|
elif isinstance(result, list) and result:
|
||||||
|
items_to_process = list(result)
|
||||||
|
else:
|
||||||
|
items_to_process = [result]
|
||||||
|
|
||||||
# Debug: Log input result details
|
|
||||||
debug(f"[add-file] INPUT result type={type(result).__name__}")
|
debug(f"[add-file] INPUT result type={type(result).__name__}")
|
||||||
if isinstance(result, list):
|
if isinstance(result, list):
|
||||||
debug(f"[add-file] INPUT result is list with {len(result)} items")
|
debug(f"[add-file] INPUT result is list with {len(result)} items")
|
||||||
if result and isinstance(result[0], dict):
|
|
||||||
first = result[0]
|
|
||||||
hash_val = first.get('hash')
|
|
||||||
hash_str = hash_val[:12] + "..." if hash_val else "N/A"
|
|
||||||
debug(f"[add-file] First item details: title={first.get('title')}, hash={hash_str}, store={first.get('store', 'N/A')}")
|
|
||||||
elif isinstance(result, dict):
|
|
||||||
hash_val = result.get('hash')
|
|
||||||
hash_str = hash_val[:12] + "..." if hash_val else "N/A"
|
|
||||||
debug(f"[add-file] INPUT result is dict: title={result.get('title')}, hash={hash_str}, store={result.get('store', 'N/A')}")
|
|
||||||
|
|
||||||
# Debug: Log parsed arguments
|
|
||||||
debug(f"[add-file] PARSED args: location={location}, provider={provider_name}, delete={delete_after}")
|
debug(f"[add-file] PARSED args: location={location}, provider={provider_name}, delete={delete_after}")
|
||||||
|
|
||||||
# Resolve source - returns (media_path_or_url, file_hash)
|
collected_payloads: List[Dict[str, Any]] = []
|
||||||
media_path_or_url, file_hash = self._resolve_source(result, path_arg, pipe_obj, config)
|
successes = 0
|
||||||
|
failures = 0
|
||||||
|
|
||||||
|
# Only run the search-store refresh when add-file is the last stage.
|
||||||
|
# In the middle of a pipeline, downstream cmdlets should receive the emitted
|
||||||
|
# storage payload directly (no need to re-search and risk duplicate emits).
|
||||||
|
auto_search_store_after_add = bool(is_last_stage) and len(items_to_process) == 1
|
||||||
|
|
||||||
|
for item in items_to_process:
|
||||||
|
pipe_obj = coerce_to_pipe_object(item, path_arg)
|
||||||
|
|
||||||
|
temp_dir_to_cleanup: Optional[Path] = None
|
||||||
|
delete_after_item = delete_after
|
||||||
|
try:
|
||||||
|
media_path_or_url, file_hash = self._resolve_source(item, path_arg, pipe_obj, config)
|
||||||
debug(f"[add-file] RESOLVED source: path={media_path_or_url}, hash={file_hash[:12] if file_hash else 'N/A'}...")
|
debug(f"[add-file] RESOLVED source: path={media_path_or_url}, hash={file_hash[:12] if file_hash else 'N/A'}...")
|
||||||
if not media_path_or_url:
|
if not media_path_or_url:
|
||||||
debug(f"[add-file] ERROR: Could not resolve source file/URL")
|
failures += 1
|
||||||
return 1
|
continue
|
||||||
|
|
||||||
# Update pipe_obj with resolved path
|
# Update pipe_obj with resolved path
|
||||||
pipe_obj.path = str(media_path_or_url) if isinstance(media_path_or_url, (str, Path)) else str(media_path_or_url)
|
pipe_obj.path = str(media_path_or_url)
|
||||||
|
|
||||||
# Check if it's a URL before validating as file
|
# URL targets: prefer provider-aware download for OpenLibrary selections.
|
||||||
if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(("http://", "https://", "magnet:", "torrent:")):
|
if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(
|
||||||
debug(f"Detected URL target, delegating to download-data: {media_path_or_url}")
|
("http://", "https://", "magnet:", "torrent:")
|
||||||
return self._delegate_to_download_data(result, media_path_or_url, location, provider_name, args, config)
|
):
|
||||||
|
table = None
|
||||||
|
full_metadata = None
|
||||||
|
if isinstance(pipe_obj.extra, dict):
|
||||||
|
table = pipe_obj.extra.get("table")
|
||||||
|
full_metadata = pipe_obj.extra.get("full_metadata")
|
||||||
|
|
||||||
|
is_openlibrary = (str(table or "").lower() == "openlibrary") or ("openlibrary.org/books/" in media_path_or_url.lower())
|
||||||
|
if is_openlibrary:
|
||||||
|
# Enrich tags from OpenLibrary metadata so the stored file has book tags (author/pages/etc).
|
||||||
|
try:
|
||||||
|
from Provider.openlibrary import OpenLibrary as _OpenLibrary
|
||||||
|
|
||||||
|
olid = None
|
||||||
|
archive_id = None
|
||||||
|
if isinstance(full_metadata, dict):
|
||||||
|
olid = full_metadata.get("openlibrary_id") or full_metadata.get("openlibrary")
|
||||||
|
archive_id = full_metadata.get("archive_id")
|
||||||
|
|
||||||
|
if not olid:
|
||||||
|
import re
|
||||||
|
m = re.search(r"/books/(OL\d+M)", str(media_path_or_url), flags=re.IGNORECASE)
|
||||||
|
if m:
|
||||||
|
olid = m.group(1)
|
||||||
|
|
||||||
|
scraped_tags: List[str] = []
|
||||||
|
if olid:
|
||||||
|
scraped_tags.extend(_OpenLibrary.scrape_openlibrary_metadata(str(olid)) or [])
|
||||||
|
if archive_id:
|
||||||
|
scraped_tags.append(f"internet_archive:{archive_id}")
|
||||||
|
|
||||||
|
if scraped_tags:
|
||||||
|
existing = list(pipe_obj.tag or [])
|
||||||
|
pipe_obj.tag = merge_sequences(existing, scraped_tags, case_sensitive=False)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
from ProviderCore.registry import get_search_provider
|
||||||
|
from ProviderCore.base import SearchResult
|
||||||
|
|
||||||
|
provider = get_search_provider("openlibrary", config)
|
||||||
|
if provider is None:
|
||||||
|
log("[add-file] OpenLibrary provider not available", file=sys.stderr)
|
||||||
|
failures += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
temp_dir_to_cleanup = Path(tempfile.mkdtemp(prefix="medios_openlibrary_"))
|
||||||
|
sr = SearchResult(
|
||||||
|
table="openlibrary",
|
||||||
|
title=str(getattr(pipe_obj, "title", None) or "Unknown"),
|
||||||
|
path=str(media_path_or_url),
|
||||||
|
full_metadata=full_metadata if isinstance(full_metadata, dict) else {},
|
||||||
|
)
|
||||||
|
downloaded = provider.download(sr, temp_dir_to_cleanup)
|
||||||
|
if downloaded is None:
|
||||||
|
log("[add-file] OpenLibrary download failed", file=sys.stderr)
|
||||||
|
failures += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
downloaded_path = Path(downloaded)
|
||||||
|
if downloaded_path.exists() and downloaded_path.is_dir():
|
||||||
|
log(
|
||||||
|
"[add-file] OpenLibrary download produced a directory (missing img2pdf?). Cannot ingest.",
|
||||||
|
file=sys.stderr,
|
||||||
|
)
|
||||||
|
failures += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
media_path_or_url = str(downloaded_path)
|
||||||
|
pipe_obj.path = str(downloaded_path)
|
||||||
|
delete_after_item = True
|
||||||
|
|
||||||
|
# For non-provider URLs, or if still a URL after provider attempt, delegate to download-media.
|
||||||
|
if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(
|
||||||
|
("http://", "https://", "magnet:", "torrent:")
|
||||||
|
):
|
||||||
|
code = self._delegate_to_download_data(item, media_path_or_url, location, provider_name, args, config)
|
||||||
|
if code == 0:
|
||||||
|
successes += 1
|
||||||
|
else:
|
||||||
|
failures += 1
|
||||||
|
continue
|
||||||
|
|
||||||
# Convert to Path and validate
|
|
||||||
media_path = Path(media_path_or_url) if isinstance(media_path_or_url, str) else media_path_or_url
|
media_path = Path(media_path_or_url) if isinstance(media_path_or_url, str) else media_path_or_url
|
||||||
|
|
||||||
# Validate source
|
|
||||||
if not self._validate_source(media_path):
|
if not self._validate_source(media_path):
|
||||||
debug(f"[add-file] ERROR: Source validation failed for {media_path}")
|
failures += 1
|
||||||
return 1
|
continue
|
||||||
|
|
||||||
# Debug: Log execution path decision
|
|
||||||
debug(f"[add-file] DECISION POINT: provider={provider_name}, location={location}")
|
|
||||||
debug(f" media_path={media_path}, exists={media_path.exists()}")
|
|
||||||
|
|
||||||
# Execute transfer based on destination (using Store registry)
|
|
||||||
if provider_name:
|
if provider_name:
|
||||||
debug(f"[add-file] ROUTE: file provider upload")
|
code = self._handle_provider_upload(media_path, provider_name, pipe_obj, config, delete_after_item)
|
||||||
return self._handle_provider_upload(media_path, provider_name, pipe_obj, config, delete_after)
|
if code == 0:
|
||||||
elif location:
|
successes += 1
|
||||||
# Check if location is a registered backend name
|
else:
|
||||||
|
failures += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
if location:
|
||||||
try:
|
try:
|
||||||
store = Store(config)
|
store = Store(config)
|
||||||
backends = store.list_backends()
|
backends = store.list_backends()
|
||||||
|
|
||||||
if location in backends:
|
if location in backends:
|
||||||
debug(f"[add-file] ROUTE: storage backend '{location}'")
|
code = self._handle_storage_backend(
|
||||||
return self._handle_storage_backend(media_path, location, pipe_obj, config, delete_after)
|
item,
|
||||||
|
media_path,
|
||||||
|
location,
|
||||||
|
pipe_obj,
|
||||||
|
config,
|
||||||
|
delete_after_item,
|
||||||
|
collect_payloads=collected_payloads,
|
||||||
|
suppress_last_stage_overlay=is_last_stage and len(items_to_process) > 1,
|
||||||
|
auto_search_store=auto_search_store_after_add,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
# Treat as local export path
|
code = self._handle_local_export(media_path, location, pipe_obj, config, delete_after_item)
|
||||||
debug(f"[add-file] ROUTE: local export to path '{location}'")
|
|
||||||
return self._handle_local_export(media_path, location, pipe_obj, config, delete_after)
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
debug(f"[add-file] ERROR: Failed to resolve location: {exc}")
|
debug(f"[add-file] ERROR: Failed to resolve location: {exc}")
|
||||||
log(f"Invalid location: {location}", file=sys.stderr)
|
log(f"Invalid location: {location}", file=sys.stderr)
|
||||||
return 1
|
failures += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
if code == 0:
|
||||||
|
successes += 1
|
||||||
else:
|
else:
|
||||||
debug(f"[add-file] ERROR: No location or provider specified")
|
failures += 1
|
||||||
log(f"No storage location or provider specified", file=sys.stderr)
|
continue
|
||||||
|
|
||||||
|
log("No destination specified", file=sys.stderr)
|
||||||
|
failures += 1
|
||||||
|
finally:
|
||||||
|
if temp_dir_to_cleanup is not None:
|
||||||
|
try:
|
||||||
|
shutil.rmtree(temp_dir_to_cleanup, ignore_errors=True)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# If we processed multiple storage ingests, present a single consolidated overlay table.
|
||||||
|
if is_last_stage and len(items_to_process) > 1 and collected_payloads:
|
||||||
|
try:
|
||||||
|
from result_table import ResultTable
|
||||||
|
|
||||||
|
table = ResultTable("Result")
|
||||||
|
for payload in collected_payloads:
|
||||||
|
table.add_result(payload)
|
||||||
|
# Make this the active selectable table so @.. returns here (and playlist table is kept in history).
|
||||||
|
ctx.set_last_result_table(table, collected_payloads, subject=collected_payloads)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
if successes > 0:
|
||||||
|
return 0
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@@ -150,9 +269,6 @@ class Add_File(Cmdlet):
|
|||||||
) -> Tuple[Optional[Path | str], Optional[str]]:
|
) -> Tuple[Optional[Path | str], Optional[str]]:
|
||||||
"""Resolve the source file path from args or pipeline result.
|
"""Resolve the source file path from args or pipeline result.
|
||||||
|
|
||||||
PRIORITY: hash+store pattern is preferred over path-based resolution.
|
|
||||||
This ensures consistency when @N selections pass hash+store identifiers.
|
|
||||||
|
|
||||||
Returns (media_path_or_url, file_hash)
|
Returns (media_path_or_url, file_hash)
|
||||||
where media_path_or_url can be a Path object or a URL string.
|
where media_path_or_url can be a Path object or a URL string.
|
||||||
"""
|
"""
|
||||||
@@ -161,8 +277,9 @@ class Add_File(Cmdlet):
|
|||||||
result_hash = result.get("hash")
|
result_hash = result.get("hash")
|
||||||
result_store = result.get("store")
|
result_store = result.get("store")
|
||||||
if result_hash and result_store:
|
if result_hash and result_store:
|
||||||
debug(f"[add-file] Using hash+store from result: hash={result_hash[:12]}..., store={result_store}")
|
debug(
|
||||||
# Use get_file to retrieve from the specific store
|
f"[add-file] Using hash+store from result: hash={str(result_hash)[:12]}..., store={result_store}"
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
store = Store(config)
|
store = Store(config)
|
||||||
if result_store in store.list_backends():
|
if result_store in store.list_backends():
|
||||||
@@ -170,13 +287,12 @@ class Add_File(Cmdlet):
|
|||||||
media_path = backend.get_file(result_hash)
|
media_path = backend.get_file(result_hash)
|
||||||
if isinstance(media_path, Path) and media_path.exists():
|
if isinstance(media_path, Path) and media_path.exists():
|
||||||
pipe_obj.path = str(media_path)
|
pipe_obj.path = str(media_path)
|
||||||
debug(f"[add-file] Retrieved file from {result_store}: {media_path}")
|
return media_path, str(result_hash)
|
||||||
return media_path, result_hash
|
if isinstance(media_path, str) and media_path.lower().startswith(
|
||||||
|
("http://", "https://", "magnet:", "torrent:")
|
||||||
if isinstance(media_path, str) and media_path.lower().startswith(("http://", "https://")):
|
):
|
||||||
pipe_obj.path = media_path
|
pipe_obj.path = media_path
|
||||||
debug(f"[add-file] Retrieved URL from {result_store}: {media_path}")
|
return media_path, str(result_hash)
|
||||||
return media_path, result_hash
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
debug(f"[add-file] Failed to retrieve via hash+store: {exc}")
|
debug(f"[add-file] Failed to retrieve via hash+store: {exc}")
|
||||||
|
|
||||||
@@ -196,10 +312,9 @@ class Add_File(Cmdlet):
|
|||||||
file_hash = pipe_path_str.split(":", 1)[1]
|
file_hash = pipe_path_str.split(":", 1)[1]
|
||||||
media_path, success = Add_File._fetch_hydrus_path(file_hash, config)
|
media_path, success = Add_File._fetch_hydrus_path(file_hash, config)
|
||||||
return media_path, file_hash if success else None
|
return media_path, file_hash if success else None
|
||||||
# Check if pipe_path is a URL - skip to URL handling below
|
if pipe_path_str.lower().startswith(("http://", "https://", "magnet:", "torrent:")):
|
||||||
if not pipe_path_str.lower().startswith(("http://", "https://", "magnet:", "torrent:")):
|
return pipe_path_str, None
|
||||||
media_path = Path(pipe_path_str)
|
return Path(pipe_path_str), None
|
||||||
return media_path, None
|
|
||||||
|
|
||||||
# PRIORITY 4: Try from pipe_obj.url (for streaming url without downloaded file)
|
# PRIORITY 4: Try from pipe_obj.url (for streaming url without downloaded file)
|
||||||
pipe_url = getattr(pipe_obj, "url", None)
|
pipe_url = getattr(pipe_obj, "url", None)
|
||||||
@@ -248,8 +363,9 @@ class Add_File(Cmdlet):
|
|||||||
# Look for path or path-like keys
|
# Look for path or path-like keys
|
||||||
path_candidate = first_item.get("path") or first_item.get("filepath") or first_item.get("file")
|
path_candidate = first_item.get("path") or first_item.get("filepath") or first_item.get("file")
|
||||||
# If the dict includes a 'paths' list (multi-part/section download), prefer the first file
|
# If the dict includes a 'paths' list (multi-part/section download), prefer the first file
|
||||||
if not path_candidate and isinstance(first_item.get("paths"), (list, tuple)) and first_item.get("paths"):
|
paths_val = first_item.get("paths")
|
||||||
path_candidate = first_item.get("paths")[0]
|
if not path_candidate and isinstance(paths_val, (list, tuple)) and paths_val:
|
||||||
|
path_candidate = paths_val[0]
|
||||||
if path_candidate:
|
if path_candidate:
|
||||||
debug(f"Resolved path from result dict: {path_candidate}")
|
debug(f"Resolved path from result dict: {path_candidate}")
|
||||||
try:
|
try:
|
||||||
@@ -361,8 +477,10 @@ class Add_File(Cmdlet):
|
|||||||
selection_args = result["_selection_args"]
|
selection_args = result["_selection_args"]
|
||||||
if selection_args:
|
if selection_args:
|
||||||
dl_args.extend(selection_args)
|
dl_args.extend(selection_args)
|
||||||
elif hasattr(result, 'extra') and isinstance(result.extra, dict) and "_selection_args" in result.extra:
|
else:
|
||||||
selection_args = result.extra["_selection_args"]
|
extra_val = getattr(result, "extra", None)
|
||||||
|
if isinstance(extra_val, dict) and "_selection_args" in extra_val:
|
||||||
|
selection_args = extra_val["_selection_args"]
|
||||||
if selection_args:
|
if selection_args:
|
||||||
dl_args.extend(selection_args)
|
dl_args.extend(selection_args)
|
||||||
|
|
||||||
@@ -375,18 +493,32 @@ class Add_File(Cmdlet):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _get_url(result: Any, pipe_obj: models.PipeObject) -> List[str]:
|
def _get_url(result: Any, pipe_obj: models.PipeObject) -> List[str]:
|
||||||
url: List[str] = []
|
from metadata import normalize_urls
|
||||||
|
|
||||||
|
# Prefer explicit PipeObject.url if present
|
||||||
|
urls: List[str] = []
|
||||||
|
try:
|
||||||
|
urls = normalize_urls(getattr(pipe_obj, "url", None))
|
||||||
|
except Exception:
|
||||||
|
urls = []
|
||||||
|
|
||||||
|
# Then check extra.url
|
||||||
|
if not urls:
|
||||||
try:
|
try:
|
||||||
if isinstance(pipe_obj.extra, dict):
|
if isinstance(pipe_obj.extra, dict):
|
||||||
url = list(pipe_obj.extra.get("url") or pipe_obj.extra.get("url") or [])
|
urls = normalize_urls(pipe_obj.extra.get("url"))
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if not url and isinstance(result, dict):
|
# Then check result dict
|
||||||
url = list(result.get("url") or result.get("url") or [])
|
if not urls and isinstance(result, dict):
|
||||||
if not url:
|
urls = normalize_urls(result.get("url"))
|
||||||
url = list(extract_url_from_result(result) or [])
|
|
||||||
return url
|
# Finally, try extractor helper
|
||||||
|
if not urls:
|
||||||
|
urls = normalize_urls(extract_url_from_result(result))
|
||||||
|
|
||||||
|
return urls
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _get_relationships(result: Any, pipe_obj: models.PipeObject) -> Optional[Dict[str, Any]]:
|
def _get_relationships(result: Any, pipe_obj: models.PipeObject) -> Optional[Dict[str, Any]]:
|
||||||
@@ -405,10 +537,36 @@ class Add_File(Cmdlet):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _get_duration(result: Any, pipe_obj: models.PipeObject) -> Optional[float]:
|
def _get_duration(result: Any, pipe_obj: models.PipeObject) -> Optional[float]:
|
||||||
if getattr(pipe_obj, "duration", None) is not None:
|
def _parse_duration(value: Any) -> Optional[float]:
|
||||||
return pipe_obj.duration
|
if value is None:
|
||||||
|
return None
|
||||||
|
if isinstance(value, (int, float)):
|
||||||
|
return float(value) if value > 0 else None
|
||||||
|
if isinstance(value, str):
|
||||||
|
s = value.strip()
|
||||||
|
if not s:
|
||||||
|
return None
|
||||||
try:
|
try:
|
||||||
return extract_duration(result)
|
candidate = float(s)
|
||||||
|
return candidate if candidate > 0 else None
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
if ":" in s:
|
||||||
|
parts = [p.strip() for p in s.split(":") if p.strip()]
|
||||||
|
if len(parts) in {2, 3} and all(p.isdigit() for p in parts):
|
||||||
|
nums = [int(p) for p in parts]
|
||||||
|
if len(nums) == 2:
|
||||||
|
minutes, seconds = nums
|
||||||
|
return float(minutes * 60 + seconds)
|
||||||
|
hours, minutes, seconds = nums
|
||||||
|
return float(hours * 3600 + minutes * 60 + seconds)
|
||||||
|
return None
|
||||||
|
|
||||||
|
parsed = _parse_duration(getattr(pipe_obj, "duration", None))
|
||||||
|
if parsed is not None:
|
||||||
|
return parsed
|
||||||
|
try:
|
||||||
|
return _parse_duration(extract_duration(result))
|
||||||
except Exception:
|
except Exception:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -442,19 +600,20 @@ class Add_File(Cmdlet):
|
|||||||
ctx.set_current_stage_table(None)
|
ctx.set_current_stage_table(None)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _emit_storage_result(payload: Dict[str, Any]) -> None:
|
def _emit_storage_result(payload: Dict[str, Any], *, overlay: bool = True, emit: bool = True) -> None:
|
||||||
"""Emit a storage-style result payload.
|
"""Emit a storage-style result payload.
|
||||||
|
|
||||||
- Always emits the dict downstream (when in a pipeline).
|
- Always emits the dict downstream (when in a pipeline).
|
||||||
- If this is the last stage (or not in a pipeline), prints a search-store-like table
|
- If this is the last stage (or not in a pipeline), prints a search-store-like table
|
||||||
and sets an overlay table/items for @N selection.
|
and sets an overlay table/items for @N selection.
|
||||||
"""
|
"""
|
||||||
# Always emit for downstream commands (no-op if not in a pipeline)
|
# Emit for downstream commands (no-op if not in a pipeline)
|
||||||
|
if emit:
|
||||||
ctx.emit(payload)
|
ctx.emit(payload)
|
||||||
|
|
||||||
stage_ctx = ctx.get_stage_context()
|
stage_ctx = ctx.get_stage_context()
|
||||||
is_last = (stage_ctx is None) or bool(getattr(stage_ctx, "is_last_stage", False))
|
is_last = (stage_ctx is None) or bool(getattr(stage_ctx, "is_last_stage", False))
|
||||||
if not is_last:
|
if not is_last or not overlay:
|
||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -470,6 +629,53 @@ class Add_File(Cmdlet):
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _try_emit_search_store_by_hash(*, store: str, hash_value: str, config: Dict[str, Any]) -> bool:
|
||||||
|
"""Run search-store for a single hash so the final table/payload is consistent.
|
||||||
|
|
||||||
|
Important: `add-file` is treated as an action command by the CLI, so the CLI only
|
||||||
|
prints tables for it when a display overlay exists. After running search-store,
|
||||||
|
this copies the resulting table into the display overlay (when this is the last
|
||||||
|
stage) so the canonical store table is what the user sees and can select from.
|
||||||
|
|
||||||
|
Returns True if search-store ran successfully, else False.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
from cmdlet.search_store import CMDLET as search_store_cmdlet
|
||||||
|
|
||||||
|
args = ["-store", str(store), f"hash:{str(hash_value)}"]
|
||||||
|
log(f"[add-file] Refresh: search-store -store {store} \"hash:{hash_value}\"", file=sys.stderr)
|
||||||
|
|
||||||
|
# Run search-store under a temporary stage context so its ctx.emit() calls
|
||||||
|
# don't interfere with the outer add-file pipeline stage.
|
||||||
|
prev_ctx = ctx.get_stage_context()
|
||||||
|
temp_ctx = ctx.PipelineStageContext(stage_index=0, total_stages=1, worker_id=getattr(prev_ctx, "worker_id", None))
|
||||||
|
ctx.set_stage_context(temp_ctx)
|
||||||
|
try:
|
||||||
|
code = search_store_cmdlet.run(None, args, config)
|
||||||
|
finally:
|
||||||
|
ctx.set_stage_context(prev_ctx)
|
||||||
|
if code != 0:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Promote the search-store result to a display overlay so the CLI prints it
|
||||||
|
# for action commands like add-file.
|
||||||
|
stage_ctx = ctx.get_stage_context()
|
||||||
|
is_last = (stage_ctx is None) or bool(getattr(stage_ctx, "is_last_stage", False))
|
||||||
|
if is_last:
|
||||||
|
try:
|
||||||
|
table = ctx.get_last_result_table()
|
||||||
|
items = ctx.get_last_result_items()
|
||||||
|
if table is not None and items:
|
||||||
|
ctx.set_last_result_table_overlay(table, items, subject={"store": store, "hash": hash_value})
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return True
|
||||||
|
except Exception as exc:
|
||||||
|
debug(f"[add-file] Failed to run search-store after add-file: {type(exc).__name__}: {exc}")
|
||||||
|
return False
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _prepare_metadata(
|
def _prepare_metadata(
|
||||||
result: Any,
|
result: Any,
|
||||||
@@ -664,8 +870,9 @@ class Add_File(Cmdlet):
|
|||||||
|
|
||||||
if not username or not filename:
|
if not username or not filename:
|
||||||
debug(f"[add-file] ERROR: Could not extract soulseek metadata from result (type={type(result).__name__})")
|
debug(f"[add-file] ERROR: Could not extract soulseek metadata from result (type={type(result).__name__})")
|
||||||
if hasattr(result, "extra"):
|
extra_val = getattr(result, "extra", None)
|
||||||
debug(f"[add-file] Result extra keys: {list(result.extra.keys())}")
|
if isinstance(extra_val, dict):
|
||||||
|
debug(f"[add-file] Result extra keys: {list(extra_val.keys())}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if not username or not filename:
|
if not username or not filename:
|
||||||
@@ -769,28 +976,55 @@ class Add_File(Cmdlet):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _handle_storage_backend(
|
def _handle_storage_backend(
|
||||||
|
result: Any,
|
||||||
media_path: Path,
|
media_path: Path,
|
||||||
backend_name: str,
|
backend_name: str,
|
||||||
pipe_obj: models.PipeObject,
|
pipe_obj: models.PipeObject,
|
||||||
config: Dict[str, Any],
|
config: Dict[str, Any],
|
||||||
delete_after: bool,
|
delete_after: bool,
|
||||||
|
*,
|
||||||
|
collect_payloads: Optional[List[Dict[str, Any]]] = None,
|
||||||
|
suppress_last_stage_overlay: bool = False,
|
||||||
|
auto_search_store: bool = True,
|
||||||
) -> int:
|
) -> int:
|
||||||
"""Handle uploading to a registered storage backend (e.g., 'test' folder store, 'hydrus', etc.)."""
|
"""Handle uploading to a registered storage backend (e.g., 'test' folder store, 'hydrus', etc.)."""
|
||||||
log(f"Adding file to storage backend '{backend_name}': {media_path.name}", file=sys.stderr)
|
log(f"Adding file to storage backend '{backend_name}': {media_path.name}", file=sys.stderr)
|
||||||
|
|
||||||
|
delete_after_effective = bool(delete_after)
|
||||||
|
if not delete_after_effective:
|
||||||
|
# When download-media is piped into add-file, the downloaded artifact is a temp file.
|
||||||
|
# After it is persisted to a storage backend, delete the temp copy to avoid duplicates.
|
||||||
|
try:
|
||||||
|
if (
|
||||||
|
str(backend_name or "").strip().lower() != "temp"
|
||||||
|
and getattr(pipe_obj, "is_temp", False)
|
||||||
|
and getattr(pipe_obj, "action", None) == "cmdlet:download-media"
|
||||||
|
):
|
||||||
|
from config import resolve_output_dir
|
||||||
|
temp_dir = resolve_output_dir(config)
|
||||||
|
try:
|
||||||
|
if media_path.resolve().is_relative_to(temp_dir.expanduser().resolve()):
|
||||||
|
delete_after_effective = True
|
||||||
|
debug(f"[add-file] Auto-delete temp source after ingest: {media_path}")
|
||||||
|
except Exception:
|
||||||
|
# If path resolution fails, fall back to non-destructive behavior
|
||||||
|
pass
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
store = Store(config)
|
store = Store(config)
|
||||||
backend = store[backend_name]
|
backend = store[backend_name]
|
||||||
|
|
||||||
# Prepare metadata from pipe_obj and sidecars
|
# Prepare metadata from pipe_obj and sidecars
|
||||||
tags, url, title, f_hash = Add_File._prepare_metadata(None, media_path, pipe_obj, config)
|
tags, url, title, f_hash = Add_File._prepare_metadata(result, media_path, pipe_obj, config)
|
||||||
|
|
||||||
# Call backend's add_file with full metadata
|
# Call backend's add_file with full metadata
|
||||||
# Backend returns hash as identifier
|
# Backend returns hash as identifier
|
||||||
file_identifier = backend.add_file(
|
file_identifier = backend.add_file(
|
||||||
media_path,
|
media_path,
|
||||||
title=title,
|
title=title,
|
||||||
tags=tags,
|
tag=tags,
|
||||||
url=url
|
url=url
|
||||||
)
|
)
|
||||||
log(f"✓ File added to '{backend_name}': {file_identifier}", file=sys.stderr)
|
log(f"✓ File added to '{backend_name}': {file_identifier}", file=sys.stderr)
|
||||||
@@ -822,6 +1056,14 @@ class Add_File(Cmdlet):
|
|||||||
# Keep hash/store for downstream commands (get-tag, get-file, etc.).
|
# Keep hash/store for downstream commands (get-tag, get-file, etc.).
|
||||||
resolved_hash = file_identifier if len(file_identifier) == 64 else (f_hash or file_identifier or "unknown")
|
resolved_hash = file_identifier if len(file_identifier) == 64 else (f_hash or file_identifier or "unknown")
|
||||||
|
|
||||||
|
# If we have url(s), ensure they get associated with the destination file.
|
||||||
|
# This mirrors `add-url` behavior but avoids emitting extra pipeline noise.
|
||||||
|
if url:
|
||||||
|
try:
|
||||||
|
backend.add_url(resolved_hash, list(url))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
meta: Dict[str, Any] = {}
|
meta: Dict[str, Any] = {}
|
||||||
try:
|
try:
|
||||||
meta = backend.get_metadata(resolved_hash) or {}
|
meta = backend.get_metadata(resolved_hash) or {}
|
||||||
@@ -865,9 +1107,30 @@ class Add_File(Cmdlet):
|
|||||||
"tag": list(tags or []),
|
"tag": list(tags or []),
|
||||||
"url": list(url or []),
|
"url": list(url or []),
|
||||||
}
|
}
|
||||||
Add_File._emit_storage_result(payload)
|
if collect_payloads is not None:
|
||||||
|
try:
|
||||||
|
collect_payloads.append(payload)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
Add_File._cleanup_after_success(media_path, delete_source=delete_after)
|
# Keep the add-file 1-row summary overlay (when last stage), then emit the
|
||||||
|
# canonical search-store payload/table for piping/selection consistency.
|
||||||
|
if auto_search_store and resolved_hash and resolved_hash != "unknown":
|
||||||
|
# Show the add-file summary (overlay only) but let search-store provide the downstream payload.
|
||||||
|
Add_File._emit_storage_result(payload, overlay=not suppress_last_stage_overlay, emit=False)
|
||||||
|
|
||||||
|
ok = Add_File._try_emit_search_store_by_hash(
|
||||||
|
store=backend_name,
|
||||||
|
hash_value=resolved_hash,
|
||||||
|
config=config,
|
||||||
|
)
|
||||||
|
if not ok:
|
||||||
|
# Fall back to emitting the add-file payload so downstream stages still receive an item.
|
||||||
|
ctx.emit(payload)
|
||||||
|
else:
|
||||||
|
Add_File._emit_storage_result(payload, overlay=not suppress_last_stage_overlay, emit=True)
|
||||||
|
|
||||||
|
Add_File._cleanup_after_success(media_path, delete_source=delete_after_effective)
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ from __future__ import annotations
|
|||||||
from typing import Any, Dict, Sequence
|
from typing import Any, Dict, Sequence
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from . import register
|
|
||||||
import pipeline as ctx
|
import pipeline as ctx
|
||||||
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash
|
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash
|
||||||
from SYS.logger import log
|
from SYS.logger import log
|
||||||
@@ -13,18 +12,23 @@ from Store import Store
|
|||||||
class Add_Url(Cmdlet):
|
class Add_Url(Cmdlet):
|
||||||
"""Add URL associations to files via hash+store."""
|
"""Add URL associations to files via hash+store."""
|
||||||
|
|
||||||
NAME = "add-url"
|
def __init__(self) -> None:
|
||||||
SUMMARY = "Associate a URL with a file"
|
super().__init__(
|
||||||
USAGE = "@1 | add-url <url>"
|
name="add-url",
|
||||||
ARGS = [
|
summary="Associate a URL with a file",
|
||||||
|
usage="@1 | add-url <url>",
|
||||||
|
arg=[
|
||||||
SharedArgs.HASH,
|
SharedArgs.HASH,
|
||||||
SharedArgs.STORE,
|
SharedArgs.STORE,
|
||||||
CmdletArg("url", required=True, description="URL to associate"),
|
CmdletArg("url", required=True, description="URL to associate"),
|
||||||
]
|
],
|
||||||
DETAIL = [
|
detail=[
|
||||||
"- Associates URL with file identified by hash+store",
|
"- Associates URL with file identified by hash+store",
|
||||||
"- Multiple url can be comma-separated",
|
"- Multiple url can be comma-separated",
|
||||||
]
|
],
|
||||||
|
exec=self.run,
|
||||||
|
)
|
||||||
|
self.register()
|
||||||
|
|
||||||
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||||
"""Add URL to file via hash+store backend."""
|
"""Add URL to file via hash+store backend."""
|
||||||
@@ -78,8 +82,7 @@ class Add_Url(Cmdlet):
|
|||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
|
||||||
# Register cmdlet
|
CMDLET = Add_Url()
|
||||||
register(["add-url", "add_url"])(Add_Url)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ from __future__ import annotations
|
|||||||
from typing import Any, Dict, Sequence
|
from typing import Any, Dict, Sequence
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from . import register
|
|
||||||
import pipeline as ctx
|
import pipeline as ctx
|
||||||
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash
|
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash
|
||||||
from SYS.logger import log
|
from SYS.logger import log
|
||||||
@@ -13,18 +12,23 @@ from Store import Store
|
|||||||
class Delete_Url(Cmdlet):
|
class Delete_Url(Cmdlet):
|
||||||
"""Delete URL associations from files via hash+store."""
|
"""Delete URL associations from files via hash+store."""
|
||||||
|
|
||||||
NAME = "delete-url"
|
def __init__(self) -> None:
|
||||||
SUMMARY = "Remove a URL association from a file"
|
super().__init__(
|
||||||
USAGE = "@1 | delete-url <url>"
|
name="delete-url",
|
||||||
ARGS = [
|
summary="Remove a URL association from a file",
|
||||||
|
usage="@1 | delete-url <url>",
|
||||||
|
arg=[
|
||||||
SharedArgs.HASH,
|
SharedArgs.HASH,
|
||||||
SharedArgs.STORE,
|
SharedArgs.STORE,
|
||||||
CmdletArg("url", required=True, description="URL to remove"),
|
CmdletArg("url", required=True, description="URL to remove"),
|
||||||
]
|
],
|
||||||
DETAIL = [
|
detail=[
|
||||||
"- Removes URL association from file identified by hash+store",
|
"- Removes URL association from file identified by hash+store",
|
||||||
"- Multiple url can be comma-separated",
|
"- Multiple url can be comma-separated",
|
||||||
]
|
],
|
||||||
|
exec=self.run,
|
||||||
|
)
|
||||||
|
self.register()
|
||||||
|
|
||||||
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||||
"""Delete URL from file via hash+store backend."""
|
"""Delete URL from file via hash+store backend."""
|
||||||
@@ -78,5 +82,4 @@ class Delete_Url(Cmdlet):
|
|||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
|
||||||
# Register cmdlet
|
CMDLET = Delete_Url()
|
||||||
register(["delete-url", "del-url", "delete_url"])(Delete_Url)
|
|
||||||
|
|||||||
@@ -190,9 +190,11 @@ class Download_File(Cmdlet):
|
|||||||
|
|
||||||
# If this looks like a provider item and providers are available, prefer provider.download()
|
# If this looks like a provider item and providers are available, prefer provider.download()
|
||||||
downloaded_path: Optional[Path] = None
|
downloaded_path: Optional[Path] = None
|
||||||
|
attempted_provider_download = False
|
||||||
if table and get_search_provider and SearchResult:
|
if table and get_search_provider and SearchResult:
|
||||||
provider = get_search_provider(str(table), config)
|
provider = get_search_provider(str(table), config)
|
||||||
if provider is not None:
|
if provider is not None:
|
||||||
|
attempted_provider_download = True
|
||||||
sr = SearchResult(
|
sr = SearchResult(
|
||||||
table=str(table),
|
table=str(table),
|
||||||
title=str(title or "Unknown"),
|
title=str(title or "Unknown"),
|
||||||
@@ -202,6 +204,19 @@ class Download_File(Cmdlet):
|
|||||||
debug(f"[download-file] Downloading provider item via {table}: {sr.title}")
|
debug(f"[download-file] Downloading provider item via {table}: {sr.title}")
|
||||||
downloaded_path = provider.download(sr, final_output_dir)
|
downloaded_path = provider.download(sr, final_output_dir)
|
||||||
|
|
||||||
|
# OpenLibrary: if provider download failed, do NOT try to download the OpenLibrary page HTML.
|
||||||
|
if downloaded_path is None and attempted_provider_download and str(table or "").lower() == "openlibrary":
|
||||||
|
availability = None
|
||||||
|
reason = None
|
||||||
|
if isinstance(full_metadata, dict):
|
||||||
|
availability = full_metadata.get("availability")
|
||||||
|
reason = full_metadata.get("availability_reason")
|
||||||
|
msg = "[download-file] OpenLibrary item not downloadable"
|
||||||
|
if availability or reason:
|
||||||
|
msg += f" (availability={availability or ''} reason={reason or ''})"
|
||||||
|
log(msg, file=sys.stderr)
|
||||||
|
continue
|
||||||
|
|
||||||
# Fallback: if we have a direct HTTP URL, download it directly
|
# Fallback: if we have a direct HTTP URL, download it directly
|
||||||
if downloaded_path is None and isinstance(target, str) and target.startswith("http"):
|
if downloaded_path is None and isinstance(target, str) and target.startswith("http"):
|
||||||
debug(f"[download-file] Provider item looks like direct URL, downloading: {target}")
|
debug(f"[download-file] Provider item looks like direct URL, downloading: {target}")
|
||||||
|
|||||||
@@ -693,6 +693,7 @@ def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) ->
|
|||||||
return
|
return
|
||||||
|
|
||||||
# Extract relevant fields
|
# Extract relevant fields
|
||||||
|
webpage_url = info.get("webpage_url") or info.get("original_url") or info.get("url")
|
||||||
result_container[0] = {
|
result_container[0] = {
|
||||||
"extractor": info.get("extractor", ""),
|
"extractor": info.get("extractor", ""),
|
||||||
"title": info.get("title", ""),
|
"title": info.get("title", ""),
|
||||||
@@ -700,7 +701,9 @@ def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) ->
|
|||||||
"duration": info.get("duration"),
|
"duration": info.get("duration"),
|
||||||
"uploader": info.get("uploader"),
|
"uploader": info.get("uploader"),
|
||||||
"description": info.get("description"),
|
"description": info.get("description"),
|
||||||
"url": url,
|
# Keep both the requested and canonical URL forms; callers should prefer webpage_url.
|
||||||
|
"requested_url": url,
|
||||||
|
"webpage_url": webpage_url,
|
||||||
}
|
}
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
log(f"Probe error for {url}: {exc}")
|
log(f"Probe error for {url}: {exc}")
|
||||||
@@ -1220,9 +1223,359 @@ class Download_Media(Cmdlet):
|
|||||||
log(f"Invalid clip format: {clip_spec}", file=sys.stderr)
|
log(f"Invalid clip format: {clip_spec}", file=sys.stderr)
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
quiet_mode = bool(config.get("_quiet_background_output")) if isinstance(config, dict) else False
|
||||||
|
|
||||||
|
storage = None
|
||||||
|
hydrus_available = True
|
||||||
|
try:
|
||||||
|
from Store import Store
|
||||||
|
storage = Store(config=config or {}, suppress_debug=True)
|
||||||
|
from API.HydrusNetwork import is_hydrus_available
|
||||||
|
hydrus_available = bool(is_hydrus_available(config or {}))
|
||||||
|
except Exception:
|
||||||
|
storage = None
|
||||||
|
|
||||||
|
def _preflight_url_duplicate(candidate_url: str, extra_urls: Optional[Sequence[str]] = None) -> bool:
|
||||||
|
# NOTE: download-media sets _quiet_background_output=True when running in a pipeline to
|
||||||
|
# reduce background noise. URL de-dup is interactive and must still run in pipelines.
|
||||||
|
if storage is None:
|
||||||
|
debug("Preflight URL check skipped: storage unavailable")
|
||||||
|
return True
|
||||||
|
|
||||||
|
debug(f"Preflight URL check: candidate={candidate_url}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
from metadata import normalize_urls
|
||||||
|
except Exception:
|
||||||
|
normalize_urls = None # type: ignore[assignment]
|
||||||
|
|
||||||
|
needles: List[str] = []
|
||||||
|
if normalize_urls is not None:
|
||||||
|
for raw in [candidate_url, *(list(extra_urls) if extra_urls else [])]:
|
||||||
|
try:
|
||||||
|
needles.extend(normalize_urls(raw))
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
# Fallback: always have at least one needle
|
||||||
|
if not needles:
|
||||||
|
needles = [str(candidate_url)]
|
||||||
|
|
||||||
|
# Deduplicate needles (preserve order)
|
||||||
|
seen_needles: List[str] = []
|
||||||
|
for needle in needles:
|
||||||
|
if needle and needle not in seen_needles:
|
||||||
|
seen_needles.append(needle)
|
||||||
|
needles = seen_needles
|
||||||
|
|
||||||
|
try:
|
||||||
|
debug(f"Preflight URL needles: {needles}")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
url_matches: List[Dict[str, Any]] = []
|
||||||
|
try:
|
||||||
|
from Store.HydrusNetwork import HydrusNetwork
|
||||||
|
|
||||||
|
# Avoid searching the temp/download directory backend during dedup.
|
||||||
|
# We only want to warn about duplicates in real stores.
|
||||||
|
backend_names_all = storage.list_searchable_backends()
|
||||||
|
backend_names: List[str] = []
|
||||||
|
skipped: List[str] = []
|
||||||
|
for backend_name in backend_names_all:
|
||||||
|
try:
|
||||||
|
backend = storage[backend_name]
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
if str(backend_name).strip().lower() == "temp":
|
||||||
|
skipped.append(backend_name)
|
||||||
|
continue
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Heuristic: if a Folder backend points at the configured temp output dir, skip it.
|
||||||
|
try:
|
||||||
|
backend_location = getattr(backend, "_location", None)
|
||||||
|
if backend_location and final_output_dir:
|
||||||
|
backend_path = Path(str(backend_location)).expanduser().resolve()
|
||||||
|
temp_path = Path(str(final_output_dir)).expanduser().resolve()
|
||||||
|
if backend_path == temp_path:
|
||||||
|
skipped.append(backend_name)
|
||||||
|
continue
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
backend_names.append(backend_name)
|
||||||
|
|
||||||
|
try:
|
||||||
|
if skipped:
|
||||||
|
debug(f"Preflight backends: {backend_names} (skipped temp: {skipped})")
|
||||||
|
else:
|
||||||
|
debug(f"Preflight backends: {backend_names}")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
for backend_name in backend_names:
|
||||||
|
backend = storage[backend_name]
|
||||||
|
if isinstance(backend, HydrusNetwork) and not hydrus_available:
|
||||||
|
continue
|
||||||
|
|
||||||
|
backend_hits: List[Dict[str, Any]] = []
|
||||||
|
for needle in needles:
|
||||||
|
try:
|
||||||
|
backend_hits = backend.search(f"url:{needle}", limit=25) or []
|
||||||
|
if backend_hits:
|
||||||
|
break
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
if backend_hits:
|
||||||
|
url_matches.extend([dict(x) if isinstance(x, dict) else {"title": str(x)} for x in backend_hits])
|
||||||
|
|
||||||
|
if len(url_matches) >= 25:
|
||||||
|
url_matches = url_matches[:25]
|
||||||
|
break
|
||||||
|
except Exception:
|
||||||
|
url_matches = []
|
||||||
|
|
||||||
|
if not url_matches:
|
||||||
|
debug("Preflight URL check: no matches")
|
||||||
|
return True
|
||||||
|
|
||||||
|
table = ResultTable(f"URL already exists ({len(url_matches)} match(es))")
|
||||||
|
results_list: List[Dict[str, Any]] = []
|
||||||
|
for item in url_matches:
|
||||||
|
if "title" not in item:
|
||||||
|
item["title"] = item.get("name") or item.get("target") or item.get("path") or "Result"
|
||||||
|
table.add_result(item)
|
||||||
|
results_list.append(item)
|
||||||
|
|
||||||
|
pipeline_context.set_current_stage_table(table)
|
||||||
|
pipeline_context.set_last_result_table(table, results_list)
|
||||||
|
|
||||||
|
print(f"\n{table}")
|
||||||
|
response = input("Continue anyway? (y/n): ").strip().lower()
|
||||||
|
if response not in {"y", "yes"}:
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
def _canonicalize_url_for_storage(requested_url: str) -> str:
|
||||||
|
# Prefer yt-dlp's canonical webpage URL (e.g. strips timestamps/redirects).
|
||||||
|
# Fall back to the requested URL if probing fails.
|
||||||
|
# Important: when playlist item selection is used, avoid probing (can hang on large playlists).
|
||||||
|
if playlist_items:
|
||||||
|
return str(requested_url)
|
||||||
|
try:
|
||||||
|
pr = probe_url(requested_url, no_playlist=False, timeout_seconds=15)
|
||||||
|
if isinstance(pr, dict):
|
||||||
|
for key in ("webpage_url", "original_url", "url", "requested_url"):
|
||||||
|
value = pr.get(key)
|
||||||
|
if isinstance(value, str) and value.strip():
|
||||||
|
return value.strip()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return str(requested_url)
|
||||||
|
|
||||||
# Check if we need to show format selection
|
# Check if we need to show format selection
|
||||||
playlist_items = str(parsed.get("item")) if parsed.get("item") else None
|
playlist_items = str(parsed.get("item")) if parsed.get("item") else None
|
||||||
ytdl_format = parsed.get("format")
|
ytdl_format = parsed.get("format")
|
||||||
|
playlist_selection_handled = False
|
||||||
|
|
||||||
|
def _parse_at_selection(choice: str, *, max_index: int) -> Optional[List[int]]:
|
||||||
|
"""Parse @ selection syntax (@2, @2-5, @{1,3,5}, @2,5,7) into 1-based indices."""
|
||||||
|
raw = str(choice or "").strip()
|
||||||
|
if not raw:
|
||||||
|
return None
|
||||||
|
|
||||||
|
if raw.lower() in {"q", "quit", "cancel"}:
|
||||||
|
return None
|
||||||
|
|
||||||
|
if raw == "@*" or raw == "*":
|
||||||
|
return list(range(1, max_index + 1))
|
||||||
|
|
||||||
|
if raw.startswith("@"):
|
||||||
|
raw = raw[1:].strip()
|
||||||
|
|
||||||
|
if raw.startswith("{") and raw.endswith("}"):
|
||||||
|
raw = raw[1:-1].strip()
|
||||||
|
|
||||||
|
if not raw:
|
||||||
|
return None
|
||||||
|
|
||||||
|
indices: set[int] = set()
|
||||||
|
for part in raw.split(","):
|
||||||
|
part = part.strip()
|
||||||
|
if not part:
|
||||||
|
continue
|
||||||
|
if "-" in part:
|
||||||
|
left, right = [p.strip() for p in part.split("-", 1)]
|
||||||
|
if not left or not right:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
start = int(left)
|
||||||
|
end = int(right)
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
if start < 1 or end < 1:
|
||||||
|
return None
|
||||||
|
if end < start:
|
||||||
|
start, end = end, start
|
||||||
|
for i in range(start, end + 1):
|
||||||
|
if 1 <= i <= max_index:
|
||||||
|
indices.add(i)
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
i = int(part)
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
if 1 <= i <= max_index:
|
||||||
|
indices.add(i)
|
||||||
|
if not indices:
|
||||||
|
return None
|
||||||
|
return sorted(indices)
|
||||||
|
|
||||||
|
def _maybe_prompt_playlist_items(url: str) -> Optional[Dict[str, Any]]:
|
||||||
|
"""If URL appears to be a playlist/channel/collection, prompt user for @ selection.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
- None if URL is not a playlist-like multi-entry page (or probe fails)
|
||||||
|
- Dict with keys:
|
||||||
|
- cancel: bool
|
||||||
|
- playlist_items: Optional[str] (None means download all)
|
||||||
|
- selected_urls: Optional[List[str]] (expanded per-entry urls when available)
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
pr = probe_url(url, no_playlist=False, timeout_seconds=15)
|
||||||
|
except Exception:
|
||||||
|
pr = None
|
||||||
|
if not isinstance(pr, dict):
|
||||||
|
return None
|
||||||
|
entries = pr.get("entries")
|
||||||
|
if not isinstance(entries, list) or len(entries) <= 1:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Display table (limit rows to keep output reasonable)
|
||||||
|
max_rows = 200
|
||||||
|
display_entries = entries[:max_rows]
|
||||||
|
total = len(entries)
|
||||||
|
|
||||||
|
def _entry_to_url(entry: Any) -> Optional[str]:
|
||||||
|
if not isinstance(entry, dict):
|
||||||
|
return None
|
||||||
|
# Prefer explicit absolute URLs when present
|
||||||
|
for key in ("webpage_url", "original_url", "url"):
|
||||||
|
v = entry.get(key)
|
||||||
|
if isinstance(v, str) and v.strip():
|
||||||
|
s = v.strip()
|
||||||
|
try:
|
||||||
|
if urlparse(s).scheme in {"http", "https"}:
|
||||||
|
return s
|
||||||
|
except Exception:
|
||||||
|
return s
|
||||||
|
|
||||||
|
# Best-effort YouTube fallback from id
|
||||||
|
entry_id = entry.get("id")
|
||||||
|
if isinstance(entry_id, str) and entry_id.strip():
|
||||||
|
extractor_name = str(pr.get("extractor") or pr.get("extractor_key") or "").lower()
|
||||||
|
if "youtube" in extractor_name:
|
||||||
|
return f"https://www.youtube.com/watch?v={entry_id.strip()}"
|
||||||
|
return None
|
||||||
|
|
||||||
|
table = ResultTable()
|
||||||
|
table.title = f"Playlist items ({total}{' shown ' + str(len(display_entries)) if total > max_rows else ''})"
|
||||||
|
table.set_source_command("download-media", [url])
|
||||||
|
try:
|
||||||
|
table.set_preserve_order(True)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
results_list: List[Dict[str, Any]] = []
|
||||||
|
for idx, entry in enumerate(display_entries, 1):
|
||||||
|
title = None
|
||||||
|
uploader = None
|
||||||
|
duration = None
|
||||||
|
try:
|
||||||
|
if isinstance(entry, dict):
|
||||||
|
title = entry.get("title")
|
||||||
|
uploader = entry.get("uploader") or pr.get("uploader")
|
||||||
|
duration = entry.get("duration")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
row: Dict[str, Any] = {
|
||||||
|
"table": "download-media",
|
||||||
|
"title": str(title or f"Item {idx}"),
|
||||||
|
"detail": str(uploader or ""),
|
||||||
|
"media_kind": "playlist-item",
|
||||||
|
"playlist_index": idx,
|
||||||
|
"columns": [
|
||||||
|
("#", str(idx)),
|
||||||
|
("Title", str(title or "")),
|
||||||
|
("Duration", str(duration or "")),
|
||||||
|
("Uploader", str(uploader or "")),
|
||||||
|
],
|
||||||
|
}
|
||||||
|
results_list.append(row)
|
||||||
|
table.add_result(row)
|
||||||
|
|
||||||
|
pipeline_context.set_current_stage_table(table)
|
||||||
|
pipeline_context.set_last_result_table(table, results_list)
|
||||||
|
|
||||||
|
print(f"\n{table}")
|
||||||
|
choice = input("Select items to download (@N, @2-5, @{1,3}, @*, or 'q' to cancel): ").strip()
|
||||||
|
if not choice or choice.lower() in {"q", "quit", "cancel"}:
|
||||||
|
return {"cancel": True, "playlist_items": None, "selected_urls": []}
|
||||||
|
if choice.strip() == "@*" or choice.strip() == "*":
|
||||||
|
# @* means all entries, not just displayed rows.
|
||||||
|
selected_urls: List[str] = []
|
||||||
|
for entry in entries:
|
||||||
|
u = _entry_to_url(entry)
|
||||||
|
if u and u not in selected_urls:
|
||||||
|
selected_urls.append(u)
|
||||||
|
# Only expand when we can derive URLs for all entries; otherwise fall back to yt-dlp playlist handling.
|
||||||
|
if len(selected_urls) == len(entries):
|
||||||
|
return {"cancel": False, "playlist_items": None, "selected_urls": selected_urls}
|
||||||
|
return {"cancel": False, "playlist_items": None, "selected_urls": []}
|
||||||
|
|
||||||
|
parsed_indices = _parse_at_selection(choice, max_index=len(display_entries))
|
||||||
|
if not parsed_indices:
|
||||||
|
log("Invalid selection. Use @N, @2-5, @{1,3}, or @*", file=sys.stderr)
|
||||||
|
return {"cancel": True, "playlist_items": None, "selected_urls": []}
|
||||||
|
|
||||||
|
selected_urls: List[str] = []
|
||||||
|
for i in parsed_indices:
|
||||||
|
try:
|
||||||
|
entry = display_entries[i - 1]
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
u = _entry_to_url(entry)
|
||||||
|
if u and u not in selected_urls:
|
||||||
|
selected_urls.append(u)
|
||||||
|
|
||||||
|
# If we can expand per-entry URLs, return them.
|
||||||
|
if selected_urls and len(selected_urls) == len(parsed_indices):
|
||||||
|
return {"cancel": False, "playlist_items": None, "selected_urls": selected_urls}
|
||||||
|
|
||||||
|
# yt-dlp accepts comma-separated 1-based indices for playlist_items
|
||||||
|
return {"cancel": False, "playlist_items": ",".join(str(i) for i in parsed_indices), "selected_urls": []}
|
||||||
|
|
||||||
|
# Playlist/multi-entry detection: if the URL has multiple items and the user didn't
|
||||||
|
# specify -item, prompt for @ selection (supports @* for all).
|
||||||
|
if len(supported_url) == 1 and not playlist_items and not ytdl_format:
|
||||||
|
candidate_url = supported_url[0]
|
||||||
|
selection_info = _maybe_prompt_playlist_items(candidate_url)
|
||||||
|
if selection_info is not None:
|
||||||
|
playlist_selection_handled = True
|
||||||
|
if bool(selection_info.get("cancel")):
|
||||||
|
return 0
|
||||||
|
selected_urls = selection_info.get("selected_urls")
|
||||||
|
if isinstance(selected_urls, list) and selected_urls:
|
||||||
|
# Expand playlist/channel URL into per-entry URLs so that de-dup preflight
|
||||||
|
# and downloads operate per file.
|
||||||
|
supported_url = selected_urls
|
||||||
|
playlist_items = None
|
||||||
|
else:
|
||||||
|
playlist_items = selection_info.get("playlist_items")
|
||||||
|
|
||||||
# If no -item, no explicit -format specified, and single URL, show the format table.
|
# If no -item, no explicit -format specified, and single URL, show the format table.
|
||||||
# Do NOT stop to show formats when -audio is used (auto-pick) or when -clip is used.
|
# Do NOT stop to show formats when -audio is used (auto-pick) or when -clip is used.
|
||||||
@@ -1232,8 +1585,15 @@ class Download_Media(Cmdlet):
|
|||||||
and not playlist_items
|
and not playlist_items
|
||||||
and not ytdl_format
|
and not ytdl_format
|
||||||
and len(supported_url) == 1
|
and len(supported_url) == 1
|
||||||
|
and not playlist_selection_handled
|
||||||
):
|
):
|
||||||
url = supported_url[0]
|
url = supported_url[0]
|
||||||
|
|
||||||
|
canonical_url = _canonicalize_url_for_storage(url)
|
||||||
|
if not _preflight_url_duplicate(canonical_url, extra_urls=[url]):
|
||||||
|
log(f"Skipping download: {url}", file=sys.stderr)
|
||||||
|
return 0
|
||||||
|
|
||||||
formats = list_formats(url, no_playlist=False)
|
formats = list_formats(url, no_playlist=False)
|
||||||
|
|
||||||
if formats and len(formats) > 1:
|
if formats and len(formats) > 1:
|
||||||
@@ -1379,12 +1739,18 @@ class Download_Media(Cmdlet):
|
|||||||
# Download each URL
|
# Download each URL
|
||||||
downloaded_count = 0
|
downloaded_count = 0
|
||||||
clip_sections_spec = self._build_clip_sections_spec(clip_range)
|
clip_sections_spec = self._build_clip_sections_spec(clip_range)
|
||||||
quiet_mode = bool(config.get("_quiet_background_output")) if isinstance(config, dict) else False
|
|
||||||
|
|
||||||
for url in supported_url:
|
for url in supported_url:
|
||||||
try:
|
try:
|
||||||
debug(f"Processing: {url}")
|
debug(f"Processing: {url}")
|
||||||
|
|
||||||
|
canonical_url = _canonicalize_url_for_storage(url)
|
||||||
|
|
||||||
|
# Preflight: warn if URL already exists in storage backends.
|
||||||
|
if not _preflight_url_duplicate(canonical_url, extra_urls=[url]):
|
||||||
|
log(f"Skipping download: {url}", file=sys.stderr)
|
||||||
|
continue
|
||||||
|
|
||||||
# If playlist_items is specified but looks like a format ID (e.g. from table selection),
|
# If playlist_items is specified but looks like a format ID (e.g. from table selection),
|
||||||
# treat it as a format selector instead of playlist items.
|
# treat it as a format selector instead of playlist items.
|
||||||
# This handles the case where @N selection passes -item <format_id>
|
# This handles the case where @N selection passes -item <format_id>
|
||||||
@@ -1532,24 +1898,17 @@ class Download_Media(Cmdlet):
|
|||||||
if title and f"title:{title}" not in tag:
|
if title and f"title:{title}" not in tag:
|
||||||
tag.insert(0, f"title:{title}")
|
tag.insert(0, f"title:{title}")
|
||||||
|
|
||||||
# Build a single canonical URL field; prefer yt-dlp provided webpage_url or info.url,
|
# Store the canonical URL for de-dup/search purposes.
|
||||||
# but fall back to the original requested URL. If multiple unique urls are available,
|
# Prefer yt-dlp's webpage_url, and do not mix in the raw requested URL (which may contain timestamps).
|
||||||
# join them into a comma-separated string.
|
final_url = None
|
||||||
urls_to_consider: List[str] = []
|
|
||||||
try:
|
try:
|
||||||
page_url = info.get("webpage_url") or info.get("url")
|
page_url = info.get("webpage_url") or info.get("original_url") or info.get("url")
|
||||||
if page_url:
|
if page_url:
|
||||||
urls_to_consider.append(str(page_url))
|
final_url = str(page_url)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
final_url = None
|
||||||
if url:
|
if not final_url and url:
|
||||||
urls_to_consider.append(str(url))
|
final_url = str(url)
|
||||||
|
|
||||||
seen_urls: List[str] = []
|
|
||||||
for u in urls_to_consider:
|
|
||||||
if u and u not in seen_urls:
|
|
||||||
seen_urls.append(u)
|
|
||||||
final_url = ",".join(seen_urls) if seen_urls else None
|
|
||||||
|
|
||||||
# Construct canonical PipeObject dict: hash, store, path, url, title, tags
|
# Construct canonical PipeObject dict: hash, store, path, url, title, tags
|
||||||
# Prefer explicit backend names (storage_name/storage_location). If none, default to PATH
|
# Prefer explicit backend names (storage_name/storage_location). If none, default to PATH
|
||||||
@@ -1561,6 +1920,7 @@ class Download_Media(Cmdlet):
|
|||||||
"url": final_url,
|
"url": final_url,
|
||||||
"tag": tag,
|
"tag": tag,
|
||||||
"action": "cmdlet:download-media",
|
"action": "cmdlet:download-media",
|
||||||
|
"is_temp": True,
|
||||||
# download_mode removed (deprecated), keep media_kind
|
# download_mode removed (deprecated), keep media_kind
|
||||||
"store": getattr(opts, "storage_name", None) or getattr(opts, "storage_location", None) or "PATH",
|
"store": getattr(opts, "storage_name", None) or getattr(opts, "storage_location", None) or "PATH",
|
||||||
"media_kind": "video" if opts.mode == "video" else "audio",
|
"media_kind": "video" if opts.mode == "video" else "audio",
|
||||||
|
|||||||
@@ -184,6 +184,32 @@ class Get_Metadata(Cmdlet):
|
|||||||
mime_type = metadata.get("mime") or metadata.get("ext", "")
|
mime_type = metadata.get("mime") or metadata.get("ext", "")
|
||||||
file_size = metadata.get("size")
|
file_size = metadata.get("size")
|
||||||
duration_seconds = metadata.get("duration")
|
duration_seconds = metadata.get("duration")
|
||||||
|
if duration_seconds is None:
|
||||||
|
duration_seconds = metadata.get("duration_seconds")
|
||||||
|
if duration_seconds is None:
|
||||||
|
duration_seconds = metadata.get("length")
|
||||||
|
if duration_seconds is None and isinstance(metadata.get("duration_ms"), (int, float)):
|
||||||
|
try:
|
||||||
|
duration_seconds = float(metadata["duration_ms"]) / 1000.0
|
||||||
|
except Exception:
|
||||||
|
duration_seconds = None
|
||||||
|
|
||||||
|
if isinstance(duration_seconds, str):
|
||||||
|
s = duration_seconds.strip()
|
||||||
|
if s:
|
||||||
|
try:
|
||||||
|
duration_seconds = float(s)
|
||||||
|
except ValueError:
|
||||||
|
if ":" in s:
|
||||||
|
parts = [p.strip() for p in s.split(":") if p.strip()]
|
||||||
|
if len(parts) in {2, 3} and all(p.isdigit() for p in parts):
|
||||||
|
nums = [int(p) for p in parts]
|
||||||
|
if len(nums) == 2:
|
||||||
|
duration_seconds = float(nums[0] * 60 + nums[1])
|
||||||
|
else:
|
||||||
|
duration_seconds = float(nums[0] * 3600 + nums[1] * 60 + nums[2])
|
||||||
|
else:
|
||||||
|
duration_seconds = None
|
||||||
pages = metadata.get("pages")
|
pages = metadata.get("pages")
|
||||||
url = metadata.get("url") or []
|
url = metadata.get("url") or []
|
||||||
imported_ts = self._extract_imported_ts(metadata)
|
imported_ts = self._extract_imported_ts(metadata)
|
||||||
|
|||||||
@@ -12,7 +12,13 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from SYS.logger import log, debug
|
try:
|
||||||
|
from Provider.openlibrary import OpenLibrary
|
||||||
|
_ol_scrape_isbn_metadata = OpenLibrary.scrape_isbn_metadata
|
||||||
|
_ol_scrape_openlibrary_metadata = OpenLibrary.scrape_openlibrary_metadata
|
||||||
|
except Exception:
|
||||||
|
_ol_scrape_isbn_metadata = None # type: ignore[assignment]
|
||||||
|
_ol_scrape_openlibrary_metadata = None # type: ignore[assignment]
|
||||||
from Provider.metadata_provider import get_metadata_provider, list_metadata_providers
|
from Provider.metadata_provider import get_metadata_provider, list_metadata_providers
|
||||||
import subprocess
|
import subprocess
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -31,6 +37,10 @@ except ImportError:
|
|||||||
extract_title = None
|
extract_title = None
|
||||||
|
|
||||||
|
|
||||||
|
_scrape_isbn_metadata = _ol_scrape_isbn_metadata # type: ignore[assignment]
|
||||||
|
_scrape_openlibrary_metadata = _ol_scrape_openlibrary_metadata # type: ignore[assignment]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -691,249 +701,22 @@ def _extract_url_formats(formats: list) -> List[Tuple[str, str]]:
|
|||||||
|
|
||||||
|
|
||||||
def _scrape_isbn_metadata(isbn: str) -> List[str]:
|
def _scrape_isbn_metadata(isbn: str) -> List[str]:
|
||||||
"""Scrape metadata for an ISBN using Open Library API."""
|
if _ol_scrape_isbn_metadata is None:
|
||||||
new_tags = []
|
log("OpenLibrary scraper unavailable", file=sys.stderr)
|
||||||
|
return []
|
||||||
try:
|
try:
|
||||||
from ..API.HTTP import HTTPClient
|
return list(_ol_scrape_isbn_metadata(isbn))
|
||||||
import json as json_module
|
|
||||||
|
|
||||||
isbn_clean = isbn.replace('-', '').strip()
|
|
||||||
url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json"
|
|
||||||
|
|
||||||
try:
|
|
||||||
with HTTPClient() as client:
|
|
||||||
response = client.get(url)
|
|
||||||
response.raise_for_status()
|
|
||||||
data = json_module.loads(response.content.decode('utf-8'))
|
|
||||||
except Exception as e:
|
|
||||||
log(f"Failed to fetch ISBN metadata: {e}", file=sys.stderr)
|
|
||||||
return []
|
|
||||||
|
|
||||||
if not data:
|
|
||||||
log(f"No ISBN metadata found for: {isbn}")
|
|
||||||
return []
|
|
||||||
|
|
||||||
book_data = next(iter(data.values()), None)
|
|
||||||
if not book_data:
|
|
||||||
return []
|
|
||||||
|
|
||||||
if 'title' in book_data:
|
|
||||||
new_tags.append(f"title:{book_data['title']}")
|
|
||||||
|
|
||||||
if 'authors' in book_data and isinstance(book_data['authors'], list):
|
|
||||||
for author in book_data['authors'][:3]:
|
|
||||||
if 'name' in author:
|
|
||||||
new_tags.append(f"author:{author['name']}")
|
|
||||||
|
|
||||||
if 'publish_date' in book_data:
|
|
||||||
new_tags.append(f"publish_date:{book_data['publish_date']}")
|
|
||||||
|
|
||||||
if 'publishers' in book_data and isinstance(book_data['publishers'], list):
|
|
||||||
for pub in book_data['publishers'][:1]:
|
|
||||||
if 'name' in pub:
|
|
||||||
new_tags.append(f"publisher:{pub['name']}")
|
|
||||||
|
|
||||||
if 'description' in book_data:
|
|
||||||
desc = book_data['description']
|
|
||||||
if isinstance(desc, dict) and 'value' in desc:
|
|
||||||
desc = desc['value']
|
|
||||||
if desc:
|
|
||||||
desc_str = str(desc).strip()
|
|
||||||
# Include description if available (limit to 200 chars to keep it manageable)
|
|
||||||
if len(desc_str) > 0:
|
|
||||||
new_tags.append(f"description:{desc_str[:200]}")
|
|
||||||
|
|
||||||
if 'number_of_pages' in book_data:
|
|
||||||
page_count = book_data['number_of_pages']
|
|
||||||
if page_count and isinstance(page_count, int) and page_count > 0:
|
|
||||||
new_tags.append(f"pages:{page_count}")
|
|
||||||
|
|
||||||
if 'identifiers' in book_data and isinstance(book_data['identifiers'], dict):
|
|
||||||
identifiers = book_data['identifiers']
|
|
||||||
|
|
||||||
if 'openlibrary' in identifiers:
|
|
||||||
ol_ids = identifiers['openlibrary']
|
|
||||||
if isinstance(ol_ids, list) and ol_ids:
|
|
||||||
new_tags.append(f"openlibrary:{ol_ids[0]}")
|
|
||||||
elif isinstance(ol_ids, str):
|
|
||||||
new_tags.append(f"openlibrary:{ol_ids}")
|
|
||||||
|
|
||||||
if 'lccn' in identifiers:
|
|
||||||
lccn_list = identifiers['lccn']
|
|
||||||
if isinstance(lccn_list, list) and lccn_list:
|
|
||||||
new_tags.append(f"lccn:{lccn_list[0]}")
|
|
||||||
elif isinstance(lccn_list, str):
|
|
||||||
new_tags.append(f"lccn:{lccn_list}")
|
|
||||||
|
|
||||||
if 'oclc' in identifiers:
|
|
||||||
oclc_list = identifiers['oclc']
|
|
||||||
if isinstance(oclc_list, list) and oclc_list:
|
|
||||||
new_tags.append(f"oclc:{oclc_list[0]}")
|
|
||||||
elif isinstance(oclc_list, str):
|
|
||||||
new_tags.append(f"oclc:{oclc_list}")
|
|
||||||
|
|
||||||
if 'goodreads' in identifiers:
|
|
||||||
goodreads_list = identifiers['goodreads']
|
|
||||||
if isinstance(goodreads_list, list) and goodreads_list:
|
|
||||||
new_tags.append(f"goodreads:{goodreads_list[0]}")
|
|
||||||
elif isinstance(goodreads_list, str):
|
|
||||||
new_tags.append(f"goodreads:{goodreads_list}")
|
|
||||||
|
|
||||||
if 'librarything' in identifiers:
|
|
||||||
lt_list = identifiers['librarything']
|
|
||||||
if isinstance(lt_list, list) and lt_list:
|
|
||||||
new_tags.append(f"librarything:{lt_list[0]}")
|
|
||||||
elif isinstance(lt_list, str):
|
|
||||||
new_tags.append(f"librarything:{lt_list}")
|
|
||||||
|
|
||||||
if 'doi' in identifiers:
|
|
||||||
doi_list = identifiers['doi']
|
|
||||||
if isinstance(doi_list, list) and doi_list:
|
|
||||||
new_tags.append(f"doi:{doi_list[0]}")
|
|
||||||
elif isinstance(doi_list, str):
|
|
||||||
new_tags.append(f"doi:{doi_list}")
|
|
||||||
|
|
||||||
if 'internet_archive' in identifiers:
|
|
||||||
ia_list = identifiers['internet_archive']
|
|
||||||
if isinstance(ia_list, list) and ia_list:
|
|
||||||
new_tags.append(f"internet_archive:{ia_list[0]}")
|
|
||||||
elif isinstance(ia_list, str):
|
|
||||||
new_tags.append(f"internet_archive:{ia_list}")
|
|
||||||
|
|
||||||
log(f"Found {len(new_tags)} tag(s) from ISBN lookup")
|
|
||||||
return new_tags
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log(f"ISBN scraping error: {e}", file=sys.stderr)
|
log(f"ISBN scraping error: {e}", file=sys.stderr)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
||||||
def _scrape_openlibrary_metadata(olid: str) -> List[str]:
|
def _scrape_openlibrary_metadata(olid: str) -> List[str]:
|
||||||
"""Scrape metadata for an OpenLibrary ID using the .json API endpoint.
|
if _ol_scrape_openlibrary_metadata is None:
|
||||||
|
log("OpenLibrary scraper unavailable", file=sys.stderr)
|
||||||
Fetches from https://openlibrary.org/books/{OLID}.json and extracts:
|
|
||||||
- Title, authors, publish date, publishers
|
|
||||||
- Description
|
|
||||||
- Subjects as freeform tags (without namespace prefix)
|
|
||||||
- Identifiers (ISBN, LCCN, OCLC, etc.)
|
|
||||||
"""
|
|
||||||
new_tags = []
|
|
||||||
try:
|
|
||||||
from ..API.HTTP import HTTPClient
|
|
||||||
import json as json_module
|
|
||||||
|
|
||||||
# Format: OL9674499M or just 9674499M
|
|
||||||
olid_clean = olid.replace('OL', '').replace('M', '')
|
|
||||||
if not olid_clean.isdigit():
|
|
||||||
olid_clean = olid
|
|
||||||
|
|
||||||
# Ensure we have the full OLID format for the URL
|
|
||||||
if not olid.startswith('OL'):
|
|
||||||
url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
|
|
||||||
else:
|
|
||||||
url = f"https://openlibrary.org/books/{olid}.json"
|
|
||||||
|
|
||||||
try:
|
|
||||||
with HTTPClient() as client:
|
|
||||||
response = client.get(url)
|
|
||||||
response.raise_for_status()
|
|
||||||
data = json_module.loads(response.content.decode('utf-8'))
|
|
||||||
except Exception as e:
|
|
||||||
log(f"Failed to fetch OpenLibrary metadata: {e}", file=sys.stderr)
|
|
||||||
return []
|
return []
|
||||||
|
try:
|
||||||
if not data:
|
return list(_ol_scrape_openlibrary_metadata(olid))
|
||||||
log(f"No OpenLibrary metadata found for: {olid}")
|
|
||||||
return []
|
|
||||||
|
|
||||||
# Add title
|
|
||||||
if 'title' in data:
|
|
||||||
new_tags.append(f"title:{data['title']}")
|
|
||||||
|
|
||||||
# Add authors
|
|
||||||
if 'authors' in data and isinstance(data['authors'], list):
|
|
||||||
for author in data['authors'][:3]:
|
|
||||||
if isinstance(author, dict) and 'name' in author:
|
|
||||||
new_tags.append(f"author:{author['name']}")
|
|
||||||
elif isinstance(author, str):
|
|
||||||
new_tags.append(f"author:{author}")
|
|
||||||
|
|
||||||
# Add publish date
|
|
||||||
if 'publish_date' in data:
|
|
||||||
new_tags.append(f"publish_date:{data['publish_date']}")
|
|
||||||
|
|
||||||
# Add publishers
|
|
||||||
if 'publishers' in data and isinstance(data['publishers'], list):
|
|
||||||
for pub in data['publishers'][:1]:
|
|
||||||
if isinstance(pub, dict) and 'name' in pub:
|
|
||||||
new_tags.append(f"publisher:{pub['name']}")
|
|
||||||
elif isinstance(pub, str):
|
|
||||||
new_tags.append(f"publisher:{pub}")
|
|
||||||
|
|
||||||
# Add description
|
|
||||||
if 'description' in data:
|
|
||||||
desc = data['description']
|
|
||||||
if isinstance(desc, dict) and 'value' in desc:
|
|
||||||
desc = desc['value']
|
|
||||||
if desc:
|
|
||||||
desc_str = str(desc).strip()
|
|
||||||
if len(desc_str) > 0:
|
|
||||||
new_tags.append(f"description:{desc_str[:200]}")
|
|
||||||
|
|
||||||
# Add number of pages
|
|
||||||
if 'number_of_pages' in data:
|
|
||||||
page_count = data['number_of_pages']
|
|
||||||
if page_count and isinstance(page_count, int) and page_count > 0:
|
|
||||||
new_tags.append(f"pages:{page_count}")
|
|
||||||
|
|
||||||
# Add subjects as FREEFORM tags (no namespace prefix)
|
|
||||||
if 'subjects' in data and isinstance(data['subjects'], list):
|
|
||||||
for subject in data['subjects'][:10]:
|
|
||||||
if subject and isinstance(subject, str):
|
|
||||||
subject_clean = str(subject).strip()
|
|
||||||
if subject_clean and subject_clean not in new_tags:
|
|
||||||
new_tags.append(subject_clean)
|
|
||||||
|
|
||||||
# Add identifiers
|
|
||||||
if 'identifiers' in data and isinstance(data['identifiers'], dict):
|
|
||||||
identifiers = data['identifiers']
|
|
||||||
|
|
||||||
if 'isbn_10' in identifiers:
|
|
||||||
isbn_10_list = identifiers['isbn_10']
|
|
||||||
if isinstance(isbn_10_list, list) and isbn_10_list:
|
|
||||||
new_tags.append(f"isbn_10:{isbn_10_list[0]}")
|
|
||||||
elif isinstance(isbn_10_list, str):
|
|
||||||
new_tags.append(f"isbn_10:{isbn_10_list}")
|
|
||||||
|
|
||||||
if 'isbn_13' in identifiers:
|
|
||||||
isbn_13_list = identifiers['isbn_13']
|
|
||||||
if isinstance(isbn_13_list, list) and isbn_13_list:
|
|
||||||
new_tags.append(f"isbn_13:{isbn_13_list[0]}")
|
|
||||||
elif isinstance(isbn_13_list, str):
|
|
||||||
new_tags.append(f"isbn_13:{isbn_13_list}")
|
|
||||||
|
|
||||||
if 'lccn' in identifiers:
|
|
||||||
lccn_list = identifiers['lccn']
|
|
||||||
if isinstance(lccn_list, list) and lccn_list:
|
|
||||||
new_tags.append(f"lccn:{lccn_list[0]}")
|
|
||||||
elif isinstance(lccn_list, str):
|
|
||||||
new_tags.append(f"lccn:{lccn_list}")
|
|
||||||
|
|
||||||
if 'oclc_numbers' in identifiers:
|
|
||||||
oclc_list = identifiers['oclc_numbers']
|
|
||||||
if isinstance(oclc_list, list) and oclc_list:
|
|
||||||
new_tags.append(f"oclc:{oclc_list[0]}")
|
|
||||||
elif isinstance(oclc_list, str):
|
|
||||||
new_tags.append(f"oclc:{oclc_list}")
|
|
||||||
|
|
||||||
if 'goodreads' in identifiers:
|
|
||||||
goodreads_list = identifiers['goodreads']
|
|
||||||
if isinstance(goodreads_list, list) and goodreads_list:
|
|
||||||
new_tags.append(f"goodreads:{goodreads_list[0]}")
|
|
||||||
elif isinstance(goodreads_list, str):
|
|
||||||
new_tags.append(f"goodreads:{goodreads_list}")
|
|
||||||
|
|
||||||
log(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup")
|
|
||||||
return new_tags
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log(f"OpenLibrary scraping error: {e}", file=sys.stderr)
|
log(f"OpenLibrary scraping error: {e}", file=sys.stderr)
|
||||||
return []
|
return []
|
||||||
|
|||||||
@@ -1,28 +1,40 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from typing import Any, Dict, Sequence
|
from dataclasses import dataclass
|
||||||
|
from typing import Any, Dict, List, Sequence
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from . import register
|
|
||||||
import pipeline as ctx
|
import pipeline as ctx
|
||||||
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash
|
from ._shared import Cmdlet, SharedArgs, parse_cmdlet_args, get_field, normalize_hash
|
||||||
from SYS.logger import log
|
from SYS.logger import log
|
||||||
from Store import Store
|
from Store import Store
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class UrlItem:
|
||||||
|
url: str
|
||||||
|
hash: str
|
||||||
|
store: str
|
||||||
|
|
||||||
|
|
||||||
class Get_Url(Cmdlet):
|
class Get_Url(Cmdlet):
|
||||||
"""Get url associated with files via hash+store."""
|
"""Get url associated with files via hash+store."""
|
||||||
|
|
||||||
NAME = "get-url"
|
def __init__(self) -> None:
|
||||||
SUMMARY = "List url associated with a file"
|
super().__init__(
|
||||||
USAGE = "@1 | get-url"
|
name="get-url",
|
||||||
ARGS = [
|
summary="List url associated with a file",
|
||||||
|
usage="@1 | get-url",
|
||||||
|
arg=[
|
||||||
SharedArgs.HASH,
|
SharedArgs.HASH,
|
||||||
SharedArgs.STORE,
|
SharedArgs.STORE,
|
||||||
]
|
],
|
||||||
DETAIL = [
|
detail=[
|
||||||
"- Lists all url associated with file identified by hash+store",
|
"- Lists all url associated with file identified by hash+store",
|
||||||
]
|
],
|
||||||
|
exec=self.run,
|
||||||
|
)
|
||||||
|
self.register()
|
||||||
|
|
||||||
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||||
"""Get url for file via hash+store backend."""
|
"""Get url for file via hash+store backend."""
|
||||||
@@ -53,17 +65,33 @@ class Get_Url(Cmdlet):
|
|||||||
|
|
||||||
urls = backend.get_url(file_hash)
|
urls = backend.get_url(file_hash)
|
||||||
|
|
||||||
if urls:
|
from result_table import ResultTable
|
||||||
for u in urls:
|
|
||||||
# Emit rich object for pipeline compatibility
|
title = str(get_field(result, "title") or "").strip()
|
||||||
ctx.emit({
|
table_title = "Title"
|
||||||
"url": u,
|
if title:
|
||||||
"hash": file_hash,
|
table_title = f"Title: {title}"
|
||||||
"store": store_name,
|
|
||||||
})
|
table = ResultTable(table_title, max_columns=1).set_preserve_order(True)
|
||||||
return 0
|
table.set_source_command("get-url", [])
|
||||||
else:
|
|
||||||
ctx.emit("No url found")
|
items: List[UrlItem] = []
|
||||||
|
for u in list(urls or []):
|
||||||
|
u = str(u or "").strip()
|
||||||
|
if not u:
|
||||||
|
continue
|
||||||
|
row = table.add_row()
|
||||||
|
row.add_column("Url", u)
|
||||||
|
item = UrlItem(url=u, hash=file_hash, store=str(store_name))
|
||||||
|
items.append(item)
|
||||||
|
ctx.emit(item)
|
||||||
|
|
||||||
|
# Make this a real result table so @.. / @,, can navigate it
|
||||||
|
ctx.set_last_result_table(table if items else None, items, subject=result)
|
||||||
|
|
||||||
|
if not items:
|
||||||
|
log("No url found", file=sys.stderr)
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
except KeyError:
|
except KeyError:
|
||||||
@@ -74,7 +102,6 @@ class Get_Url(Cmdlet):
|
|||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
|
||||||
# Register cmdlet
|
CMDLET = Get_Url()
|
||||||
register(["get-url", "get_url"])(Get_Url)
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ from __future__ import annotations
|
|||||||
|
|
||||||
from typing import Any, Dict, Sequence, List, Optional, Tuple
|
from typing import Any, Dict, Sequence, List, Optional, Tuple
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from dataclasses import dataclass, field
|
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
@@ -11,57 +10,9 @@ import sys
|
|||||||
|
|
||||||
from SYS.logger import log, debug
|
from SYS.logger import log, debug
|
||||||
|
|
||||||
from ._shared import Cmdlet, CmdletArg, get_field, should_show_help
|
from ._shared import Cmdlet, CmdletArg, get_field, should_show_help, normalize_hash, first_title_tag
|
||||||
import pipeline as ctx
|
import pipeline as ctx
|
||||||
|
|
||||||
# Optional dependencies
|
|
||||||
try:
|
|
||||||
import mutagen # type: ignore
|
|
||||||
except ImportError: # pragma: no cover
|
|
||||||
mutagen = None # type: ignore
|
|
||||||
|
|
||||||
try:
|
|
||||||
from config import get_hydrus_url, resolve_output_dir
|
|
||||||
except Exception: # pragma: no cover
|
|
||||||
get_hydrus_url = None # type: ignore
|
|
||||||
resolve_output_dir = None # type: ignore
|
|
||||||
|
|
||||||
try:
|
|
||||||
from API.HydrusNetwork import HydrusNetwork, HydrusRequestError
|
|
||||||
except ImportError: # pragma: no cover
|
|
||||||
HydrusNetwork = None # type: ignore
|
|
||||||
HydrusRequestError = RuntimeError # type: ignore
|
|
||||||
|
|
||||||
try:
|
|
||||||
from SYS.utils import sha256_file
|
|
||||||
except ImportError: # pragma: no cover
|
|
||||||
sha256_file = None # type: ignore
|
|
||||||
|
|
||||||
try:
|
|
||||||
from SYS.utils_constant import mime_maps
|
|
||||||
except ImportError: # pragma: no cover
|
|
||||||
mime_maps = {} # type: ignore
|
|
||||||
|
|
||||||
@dataclass(slots=True)
|
|
||||||
class SearchRecord:
|
|
||||||
path: str
|
|
||||||
size_bytes: int | None = None
|
|
||||||
duration_seconds: str | None = None
|
|
||||||
tag: str | None = None
|
|
||||||
hash: str | None = None
|
|
||||||
|
|
||||||
def as_dict(self) -> dict[str, str]:
|
|
||||||
payload: dict[str, str] = {"path": self.path}
|
|
||||||
if self.size_bytes is not None:
|
|
||||||
payload["size"] = str(self.size_bytes)
|
|
||||||
if self.duration_seconds:
|
|
||||||
payload["duration"] = self.duration_seconds
|
|
||||||
if self.tag:
|
|
||||||
payload["tag"] = self.tag
|
|
||||||
if self.hash:
|
|
||||||
payload["hash"] = self.hash
|
|
||||||
return payload
|
|
||||||
|
|
||||||
|
|
||||||
STORAGE_ORIGINS = {"local", "hydrus", "folder"}
|
STORAGE_ORIGINS = {"local", "hydrus", "folder"}
|
||||||
|
|
||||||
@@ -86,12 +37,15 @@ class Search_Store(Cmdlet):
|
|||||||
detail=[
|
detail=[
|
||||||
"Search across storage backends: Folder stores and Hydrus instances",
|
"Search across storage backends: Folder stores and Hydrus instances",
|
||||||
"Use -store to search a specific backend by name",
|
"Use -store to search a specific backend by name",
|
||||||
|
"URL search: url:* (any URL) or url:<value> (URL substring)",
|
||||||
"Filter results by: tag, size, type, duration",
|
"Filter results by: tag, size, type, duration",
|
||||||
"Results include hash for downstream commands (get-file, add-tag, etc.)",
|
"Results include hash for downstream commands (get-file, add-tag, etc.)",
|
||||||
"Examples:",
|
"Examples:",
|
||||||
"search-store foo # Search all storage backends",
|
"search-store foo # Search all storage backends",
|
||||||
"search-store -store home '*' # Search 'home' Hydrus instance",
|
"search-store -store home '*' # Search 'home' Hydrus instance",
|
||||||
"search-store -store test 'video' # Search 'test' folder store",
|
"search-store -store test 'video' # Search 'test' folder store",
|
||||||
|
"search-store 'url:*' # Files that have any URL",
|
||||||
|
"search-store 'url:youtube.com' # Files whose URL contains substring",
|
||||||
"search-store song -type audio # Search for audio files",
|
"search-store song -type audio # Search for audio files",
|
||||||
"search-store movie -tag action # Search with tag filter",
|
"search-store movie -tag action # Search with tag filter",
|
||||||
],
|
],
|
||||||
@@ -100,6 +54,40 @@ class Search_Store(Cmdlet):
|
|||||||
self.register()
|
self.register()
|
||||||
|
|
||||||
# --- Helper methods -------------------------------------------------
|
# --- Helper methods -------------------------------------------------
|
||||||
|
@staticmethod
|
||||||
|
def _parse_hash_query(query: str) -> List[str]:
|
||||||
|
"""Parse a `hash:` query into a list of normalized 64-hex SHA256 hashes.
|
||||||
|
|
||||||
|
Supported examples:
|
||||||
|
- hash:<h1>,<h2>,<h3>
|
||||||
|
- Hash: <h1> <h2> <h3>
|
||||||
|
- hash:{<h1>, <h2>}
|
||||||
|
"""
|
||||||
|
q = str(query or "").strip()
|
||||||
|
if not q:
|
||||||
|
return []
|
||||||
|
|
||||||
|
m = re.match(r"^hash(?:es)?\s*:\s*(.+)$", q, flags=re.IGNORECASE)
|
||||||
|
if not m:
|
||||||
|
return []
|
||||||
|
|
||||||
|
rest = (m.group(1) or "").strip()
|
||||||
|
if rest.startswith("{") and rest.endswith("}"):
|
||||||
|
rest = rest[1:-1].strip()
|
||||||
|
if rest.startswith("[") and rest.endswith("]"):
|
||||||
|
rest = rest[1:-1].strip()
|
||||||
|
|
||||||
|
# Split on commas and whitespace.
|
||||||
|
raw_parts = [p.strip() for p in re.split(r"[\s,]+", rest) if p.strip()]
|
||||||
|
out: List[str] = []
|
||||||
|
for part in raw_parts:
|
||||||
|
h = normalize_hash(part)
|
||||||
|
if not h:
|
||||||
|
continue
|
||||||
|
if h not in out:
|
||||||
|
out.append(h)
|
||||||
|
return out
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _normalize_extension(ext_value: Any) -> str:
|
def _normalize_extension(ext_value: Any) -> str:
|
||||||
"""Sanitize extension strings to alphanumerics and cap at 5 chars."""
|
"""Sanitize extension strings to alphanumerics and cap at 5 chars."""
|
||||||
@@ -150,10 +138,10 @@ class Search_Store(Cmdlet):
|
|||||||
|
|
||||||
# Parse arguments
|
# Parse arguments
|
||||||
query = ""
|
query = ""
|
||||||
tag_filters: List[str] = []
|
_tag_filters: List[str] = []
|
||||||
size_filter: Optional[Tuple[str, int]] = None
|
_size_filter: Optional[Tuple[str, int]] = None
|
||||||
duration_filter: Optional[Tuple[str, float]] = None
|
_duration_filter: Optional[Tuple[str, float]] = None
|
||||||
type_filter: Optional[str] = None
|
_type_filter: Optional[str] = None
|
||||||
storage_backend: Optional[str] = None
|
storage_backend: Optional[str] = None
|
||||||
limit = 100
|
limit = 100
|
||||||
searched_backends: List[str] = []
|
searched_backends: List[str] = []
|
||||||
@@ -166,7 +154,7 @@ class Search_Store(Cmdlet):
|
|||||||
storage_backend = args_list[i + 1]
|
storage_backend = args_list[i + 1]
|
||||||
i += 2
|
i += 2
|
||||||
elif low in {"-tag", "--tag"} and i + 1 < len(args_list):
|
elif low in {"-tag", "--tag"} and i + 1 < len(args_list):
|
||||||
tag_filters.append(args_list[i + 1])
|
_tag_filters.append(args_list[i + 1])
|
||||||
i += 2
|
i += 2
|
||||||
elif low in {"-limit", "--limit"} and i + 1 < len(args_list):
|
elif low in {"-limit", "--limit"} and i + 1 < len(args_list):
|
||||||
try:
|
try:
|
||||||
@@ -175,7 +163,7 @@ class Search_Store(Cmdlet):
|
|||||||
limit = 100
|
limit = 100
|
||||||
i += 2
|
i += 2
|
||||||
elif low in {"-type", "--type"} and i + 1 < len(args_list):
|
elif low in {"-type", "--type"} and i + 1 < len(args_list):
|
||||||
type_filter = args_list[i + 1].lower()
|
_type_filter = args_list[i + 1].lower()
|
||||||
i += 2
|
i += 2
|
||||||
elif not arg.startswith("-"):
|
elif not arg.startswith("-"):
|
||||||
query = f"{query} {arg}".strip() if query else arg
|
query = f"{query} {arg}".strip() if query else arg
|
||||||
@@ -195,6 +183,8 @@ class Search_Store(Cmdlet):
|
|||||||
if store_filter and not storage_backend:
|
if store_filter and not storage_backend:
|
||||||
storage_backend = store_filter
|
storage_backend = store_filter
|
||||||
|
|
||||||
|
hash_query = self._parse_hash_query(query)
|
||||||
|
|
||||||
if not query:
|
if not query:
|
||||||
log("Provide a search query", file=sys.stderr)
|
log("Provide a search query", file=sys.stderr)
|
||||||
return 1
|
return 1
|
||||||
@@ -230,12 +220,136 @@ class Search_Store(Cmdlet):
|
|||||||
table_title += f" [{storage_backend}]"
|
table_title += f" [{storage_backend}]"
|
||||||
|
|
||||||
table = ResultTable(table_title)
|
table = ResultTable(table_title)
|
||||||
|
if hash_query:
|
||||||
|
try:
|
||||||
|
table.set_preserve_order(True)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
from Store import Store
|
from Store import Store
|
||||||
storage = Store(config=config or {})
|
storage = Store(config=config or {})
|
||||||
from Store._base import Store as BaseStore
|
from Store._base import Store as BaseStore
|
||||||
|
|
||||||
backend_to_search = storage_backend or None
|
backend_to_search = storage_backend or None
|
||||||
|
if hash_query:
|
||||||
|
# Explicit hash list search: build rows from backend metadata.
|
||||||
|
backends_to_try: List[str] = []
|
||||||
|
if backend_to_search:
|
||||||
|
backends_to_try = [backend_to_search]
|
||||||
|
else:
|
||||||
|
backends_to_try = list(storage.list_backends())
|
||||||
|
|
||||||
|
found_any = False
|
||||||
|
for h in hash_query:
|
||||||
|
resolved_backend_name: Optional[str] = None
|
||||||
|
resolved_backend = None
|
||||||
|
|
||||||
|
for backend_name in backends_to_try:
|
||||||
|
try:
|
||||||
|
backend = storage[backend_name]
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
# If get_metadata works, consider it a hit; get_file can be optional (e.g. remote URL).
|
||||||
|
meta = backend.get_metadata(h)
|
||||||
|
if meta is None:
|
||||||
|
continue
|
||||||
|
resolved_backend_name = backend_name
|
||||||
|
resolved_backend = backend
|
||||||
|
break
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if resolved_backend_name is None or resolved_backend is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
found_any = True
|
||||||
|
searched_backends.append(resolved_backend_name)
|
||||||
|
|
||||||
|
# Resolve a path/URL string if possible
|
||||||
|
path_str: Optional[str] = None
|
||||||
|
try:
|
||||||
|
maybe_path = resolved_backend.get_file(h)
|
||||||
|
if isinstance(maybe_path, Path):
|
||||||
|
path_str = str(maybe_path)
|
||||||
|
elif isinstance(maybe_path, str) and maybe_path:
|
||||||
|
path_str = maybe_path
|
||||||
|
except Exception:
|
||||||
|
path_str = None
|
||||||
|
|
||||||
|
meta_obj: Dict[str, Any] = {}
|
||||||
|
try:
|
||||||
|
meta_obj = resolved_backend.get_metadata(h) or {}
|
||||||
|
except Exception:
|
||||||
|
meta_obj = {}
|
||||||
|
|
||||||
|
tags_list: List[str] = []
|
||||||
|
try:
|
||||||
|
tag_result = resolved_backend.get_tag(h)
|
||||||
|
if isinstance(tag_result, tuple) and tag_result:
|
||||||
|
maybe_tags = tag_result[0]
|
||||||
|
else:
|
||||||
|
maybe_tags = tag_result
|
||||||
|
if isinstance(maybe_tags, list):
|
||||||
|
tags_list = [str(t).strip() for t in maybe_tags if isinstance(t, str) and str(t).strip()]
|
||||||
|
except Exception:
|
||||||
|
tags_list = []
|
||||||
|
|
||||||
|
title_from_tag: Optional[str] = None
|
||||||
|
try:
|
||||||
|
title_tag = first_title_tag(tags_list)
|
||||||
|
if title_tag and ":" in title_tag:
|
||||||
|
title_from_tag = title_tag.split(":", 1)[1].strip()
|
||||||
|
except Exception:
|
||||||
|
title_from_tag = None
|
||||||
|
|
||||||
|
title = title_from_tag or meta_obj.get("title") or meta_obj.get("name")
|
||||||
|
if not title and path_str:
|
||||||
|
try:
|
||||||
|
title = Path(path_str).stem
|
||||||
|
except Exception:
|
||||||
|
title = path_str
|
||||||
|
|
||||||
|
ext_val = meta_obj.get("ext") or meta_obj.get("extension")
|
||||||
|
if not ext_val and path_str:
|
||||||
|
try:
|
||||||
|
ext_val = Path(path_str).suffix
|
||||||
|
except Exception:
|
||||||
|
ext_val = None
|
||||||
|
|
||||||
|
size_bytes = meta_obj.get("size")
|
||||||
|
if size_bytes is None:
|
||||||
|
size_bytes = meta_obj.get("size_bytes")
|
||||||
|
try:
|
||||||
|
size_bytes_int: Optional[int] = int(size_bytes) if size_bytes is not None else None
|
||||||
|
except Exception:
|
||||||
|
size_bytes_int = None
|
||||||
|
|
||||||
|
payload: Dict[str, Any] = {
|
||||||
|
"title": str(title or h),
|
||||||
|
"hash": h,
|
||||||
|
"store": resolved_backend_name,
|
||||||
|
"path": path_str,
|
||||||
|
"ext": self._normalize_extension(ext_val),
|
||||||
|
"size_bytes": size_bytes_int,
|
||||||
|
"tag": tags_list,
|
||||||
|
}
|
||||||
|
|
||||||
|
table.add_result(payload)
|
||||||
|
results_list.append(payload)
|
||||||
|
ctx.emit(payload)
|
||||||
|
|
||||||
|
if found_any:
|
||||||
|
ctx.set_last_result_table(table, results_list)
|
||||||
|
db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2))
|
||||||
|
db.update_worker_status(worker_id, 'completed')
|
||||||
|
return 0
|
||||||
|
|
||||||
|
log("No results found", file=sys.stderr)
|
||||||
|
db.append_worker_stdout(worker_id, json.dumps([], indent=2))
|
||||||
|
db.update_worker_status(worker_id, 'completed')
|
||||||
|
return 0
|
||||||
|
|
||||||
if backend_to_search:
|
if backend_to_search:
|
||||||
searched_backends.append(backend_to_search)
|
searched_backends.append(backend_to_search)
|
||||||
target_backend = storage[backend_to_search]
|
target_backend = storage[backend_to_search]
|
||||||
@@ -243,7 +357,9 @@ class Search_Store(Cmdlet):
|
|||||||
log(f"Backend '{backend_to_search}' does not support searching", file=sys.stderr)
|
log(f"Backend '{backend_to_search}' does not support searching", file=sys.stderr)
|
||||||
db.update_worker_status(worker_id, 'error')
|
db.update_worker_status(worker_id, 'error')
|
||||||
return 1
|
return 1
|
||||||
|
debug(f"[search-store] Searching '{backend_to_search}'")
|
||||||
results = target_backend.search(query, limit=limit)
|
results = target_backend.search(query, limit=limit)
|
||||||
|
debug(f"[search-store] '{backend_to_search}' -> {len(results or [])} result(s)")
|
||||||
else:
|
else:
|
||||||
from API.HydrusNetwork import is_hydrus_available
|
from API.HydrusNetwork import is_hydrus_available
|
||||||
hydrus_available = is_hydrus_available(config or {})
|
hydrus_available = is_hydrus_available(config or {})
|
||||||
@@ -257,7 +373,9 @@ class Search_Store(Cmdlet):
|
|||||||
continue
|
continue
|
||||||
searched_backends.append(backend_name)
|
searched_backends.append(backend_name)
|
||||||
|
|
||||||
|
debug(f"[search-store] Searching '{backend_name}'")
|
||||||
backend_results = backend.search(query, limit=limit - len(all_results))
|
backend_results = backend.search(query, limit=limit - len(all_results))
|
||||||
|
debug(f"[search-store] '{backend_name}' -> {len(backend_results or [])} result(s)")
|
||||||
if backend_results:
|
if backend_results:
|
||||||
all_results.extend(backend_results)
|
all_results.extend(backend_results)
|
||||||
if len(all_results) >= limit:
|
if len(all_results) >= limit:
|
||||||
@@ -317,11 +435,6 @@ class Search_Store(Cmdlet):
|
|||||||
results_list.append(normalized)
|
results_list.append(normalized)
|
||||||
ctx.emit(normalized)
|
ctx.emit(normalized)
|
||||||
|
|
||||||
# Debug: Verify table rows match items list
|
|
||||||
debug(f"[search-store] Added {len(table.rows)} rows to table, {len(results_list)} items to results_list")
|
|
||||||
if len(table.rows) != len(results_list):
|
|
||||||
debug(f"[search-store] WARNING: Table/items mismatch! rows={len(table.rows)} items={len(results_list)}", file=sys.stderr)
|
|
||||||
|
|
||||||
ctx.set_last_result_table(table, results_list)
|
ctx.set_last_result_table(table, results_list)
|
||||||
db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2))
|
db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2))
|
||||||
else:
|
else:
|
||||||
|
|||||||
726
metadata.py
726
metadata.py
@@ -3,14 +3,12 @@ import re
|
|||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
import shutil
|
import shutil
|
||||||
import sqlite3
|
|
||||||
import requests
|
|
||||||
from SYS.logger import log, debug
|
from SYS.logger import log, debug
|
||||||
from urllib.parse import urlsplit, urlunsplit, unquote
|
from urllib.parse import urlsplit, urlunsplit, unquote
|
||||||
from collections import deque
|
from collections import deque
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple
|
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple
|
||||||
from models import PipeObject, FileRelationshipTracker, _get_file_hash
|
from models import FileRelationshipTracker
|
||||||
try:
|
try:
|
||||||
import musicbrainzngs # type: ignore
|
import musicbrainzngs # type: ignore
|
||||||
except ImportError: # pragma: no cover
|
except ImportError: # pragma: no cover
|
||||||
@@ -332,6 +330,112 @@ def _generate_hydrus_url_variants(url: str) -> List[str]:
|
|||||||
return variants
|
return variants
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_urls(value: Any) -> List[str]:
|
||||||
|
"""Normalize a URL field into a stable, deduplicated list.
|
||||||
|
|
||||||
|
Accepts:
|
||||||
|
- None
|
||||||
|
- a single URL string (optionally containing multiple URLs)
|
||||||
|
- a list/tuple/set of URL strings
|
||||||
|
|
||||||
|
This helper is used by cmdlets/stores/pipeline to keep `url` consistent.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def _iter_raw_urls(raw: Any) -> Iterable[str]:
|
||||||
|
if raw is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
if isinstance(raw, str):
|
||||||
|
text = raw.strip()
|
||||||
|
if not text:
|
||||||
|
return
|
||||||
|
# Support legacy prefixes like "url:https://...".
|
||||||
|
if text.lower().startswith("url:"):
|
||||||
|
text = text.split(":", 1)[1].strip()
|
||||||
|
|
||||||
|
# Prefer extracting obvious URLs to avoid splitting inside query strings.
|
||||||
|
matches = re.findall(r"https?://[^\s,]+", text, flags=re.IGNORECASE)
|
||||||
|
if matches:
|
||||||
|
for m in matches:
|
||||||
|
yield m
|
||||||
|
return
|
||||||
|
|
||||||
|
# Fallback: split on commas/whitespace.
|
||||||
|
for token in text.replace("\n", " ").replace("\r", " ").replace(",", " ").split():
|
||||||
|
if token:
|
||||||
|
yield token
|
||||||
|
return
|
||||||
|
|
||||||
|
if isinstance(raw, (list, tuple, set)):
|
||||||
|
for item in raw:
|
||||||
|
if item is None:
|
||||||
|
continue
|
||||||
|
if isinstance(item, str):
|
||||||
|
if item.strip():
|
||||||
|
yield item
|
||||||
|
else:
|
||||||
|
text = str(item).strip()
|
||||||
|
if text:
|
||||||
|
yield text
|
||||||
|
return
|
||||||
|
|
||||||
|
# Last resort: string-coerce.
|
||||||
|
text = str(raw).strip()
|
||||||
|
if text:
|
||||||
|
yield text
|
||||||
|
|
||||||
|
def _canonicalize(url_text: str) -> Optional[str]:
|
||||||
|
u = str(url_text or "").strip()
|
||||||
|
if not u:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Trim common wrappers and trailing punctuation.
|
||||||
|
u = u.strip("<>\"' ")
|
||||||
|
u = u.rstrip(")].,;\"")
|
||||||
|
if not u:
|
||||||
|
return None
|
||||||
|
|
||||||
|
lower = u.lower()
|
||||||
|
if not (lower.startswith("http://") or lower.startswith("https://")):
|
||||||
|
return u
|
||||||
|
|
||||||
|
try:
|
||||||
|
parsed = urlsplit(u)
|
||||||
|
except Exception:
|
||||||
|
return u
|
||||||
|
|
||||||
|
scheme = (parsed.scheme or "").lower()
|
||||||
|
netloc = (parsed.netloc or "").lower()
|
||||||
|
path = unquote(parsed.path or "")
|
||||||
|
query = parsed.query or ""
|
||||||
|
|
||||||
|
# Normalize default ports.
|
||||||
|
if scheme == "http" and netloc.endswith(":80"):
|
||||||
|
netloc = netloc[:-3]
|
||||||
|
elif scheme == "https" and netloc.endswith(":443"):
|
||||||
|
netloc = netloc[:-4]
|
||||||
|
|
||||||
|
# Prefer no trailing slash except root.
|
||||||
|
if path and path != "/":
|
||||||
|
path = path.rstrip("/")
|
||||||
|
|
||||||
|
# Fragments are not part of the resource.
|
||||||
|
return urlunsplit((scheme, netloc, path, query, ""))
|
||||||
|
|
||||||
|
seen: Set[str] = set()
|
||||||
|
out: List[str] = []
|
||||||
|
for raw_url in _iter_raw_urls(value):
|
||||||
|
canonical = _canonicalize(raw_url)
|
||||||
|
if not canonical:
|
||||||
|
continue
|
||||||
|
if canonical in seen:
|
||||||
|
continue
|
||||||
|
seen.add(canonical)
|
||||||
|
out.append(canonical)
|
||||||
|
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
def value_normalize(value: str) -> str:
|
def value_normalize(value: str) -> str:
|
||||||
"""Normalize whitespace: collapse internal spaces, strip, remove newlines."""
|
"""Normalize whitespace: collapse internal spaces, strip, remove newlines."""
|
||||||
value = value.replace("\n", " ").replace("\r", " ")
|
value = value.replace("\n", " ").replace("\r", " ")
|
||||||
@@ -358,6 +462,7 @@ def import_pending_sidecars(db_root: Path, db: Any) -> None:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
# Ensure file entry exists
|
# Ensure file entry exists
|
||||||
|
file_id: Optional[int] = None
|
||||||
try:
|
try:
|
||||||
cursor = db.connection.cursor() if db.connection else None
|
cursor = db.connection.cursor() if db.connection else None
|
||||||
if cursor:
|
if cursor:
|
||||||
@@ -394,10 +499,16 @@ def import_pending_sidecars(db_root: Path, db: Any) -> None:
|
|||||||
try:
|
try:
|
||||||
cursor = db.connection.cursor() if db.connection else None
|
cursor = db.connection.cursor() if db.connection else None
|
||||||
if cursor:
|
if cursor:
|
||||||
|
file_hash_value: Optional[str] = None
|
||||||
|
if hasattr(db, 'get_file_hash'):
|
||||||
|
try:
|
||||||
|
file_hash_value = db.get_file_hash(file_id)
|
||||||
|
except Exception:
|
||||||
|
file_hash_value = None
|
||||||
for tag in tags:
|
for tag in tags:
|
||||||
cursor.execute(
|
cursor.execute(
|
||||||
'INSERT OR IGNORE INTO tags (hash, tag) VALUES (?, ?)',
|
'INSERT OR IGNORE INTO tags (hash, tag) VALUES (?, ?)',
|
||||||
(file_hash_value, tag) if hasattr(db, 'get_file_hash') else (None, tag)
|
(file_hash_value, tag)
|
||||||
)
|
)
|
||||||
db.connection.commit()
|
db.connection.commit()
|
||||||
except Exception:
|
except Exception:
|
||||||
@@ -663,128 +774,6 @@ def fetch_musicbrainz_tags(mbid: str, entity: str) -> Dict[str, object]:
|
|||||||
return {"source": "musicbrainz", "id": mbid, "tag": tags, "entity": entity}
|
return {"source": "musicbrainz", "id": mbid, "tag": tags, "entity": entity}
|
||||||
|
|
||||||
|
|
||||||
def fetch_openlibrary_tags(ol_id: str) -> Dict[str, object]:
|
|
||||||
"""Fetch metadata tags from OpenLibrary.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
ol_id: OpenLibrary ID (e.g., 'OL123456M' for a book)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Dictionary with 'tag' key containing list of extracted tags
|
|
||||||
"""
|
|
||||||
import urllib.request
|
|
||||||
|
|
||||||
# Normalize OL ID
|
|
||||||
ol_id = ol_id.strip().upper()
|
|
||||||
if not ol_id.startswith('OL'):
|
|
||||||
ol_id = f'OL{ol_id}'
|
|
||||||
|
|
||||||
# Fetch from OpenLibrary API
|
|
||||||
url = f"https://openlibrary.org/books/{ol_id}.json"
|
|
||||||
tags: List[str] = []
|
|
||||||
|
|
||||||
try:
|
|
||||||
with urllib.request.urlopen(url, timeout=10) as response:
|
|
||||||
data = json.loads(response.read().decode('utf-8'))
|
|
||||||
except Exception as e:
|
|
||||||
raise ValueError(f"Failed to fetch OpenLibrary data for {ol_id}: {e}")
|
|
||||||
|
|
||||||
# Add OpenLibrary ID tag
|
|
||||||
_add_tag(tags, "openlibrary", ol_id)
|
|
||||||
|
|
||||||
# Extract title
|
|
||||||
_add_tag(tags, "title", data.get("title"))
|
|
||||||
|
|
||||||
# Extract subtitle if present
|
|
||||||
if data.get("subtitle"):
|
|
||||||
_add_tag(tags, "subtitle", data["subtitle"])
|
|
||||||
|
|
||||||
# Extract authors
|
|
||||||
authors = data.get("authors", [])
|
|
||||||
author_names: List[str] = []
|
|
||||||
for author in authors:
|
|
||||||
if isinstance(author, dict):
|
|
||||||
name = author.get("name")
|
|
||||||
else:
|
|
||||||
name = str(author)
|
|
||||||
if name:
|
|
||||||
author_names.append(name)
|
|
||||||
if author_names:
|
|
||||||
_extend_tags(tags, "author", author_names)
|
|
||||||
|
|
||||||
# Extract publication details
|
|
||||||
if data.get("publish_date"):
|
|
||||||
_add_tag(tags, "publish_date", data["publish_date"])
|
|
||||||
# Extract year if present
|
|
||||||
year_match = re.search(r'\b(\d{4})\b', str(data.get("publish_date", "")))
|
|
||||||
if year_match:
|
|
||||||
_add_tag(tags, "year", year_match.group(1))
|
|
||||||
|
|
||||||
# Extract publishers
|
|
||||||
publishers = data.get("publishers", [])
|
|
||||||
if publishers:
|
|
||||||
publisher_names = []
|
|
||||||
for pub in publishers:
|
|
||||||
if isinstance(pub, dict):
|
|
||||||
name = pub.get("name")
|
|
||||||
else:
|
|
||||||
name = str(pub)
|
|
||||||
if name:
|
|
||||||
publisher_names.append(name)
|
|
||||||
if publisher_names:
|
|
||||||
_extend_tags(tags, "publisher", publisher_names)
|
|
||||||
|
|
||||||
# Extract languages
|
|
||||||
languages = data.get("languages", [])
|
|
||||||
if languages:
|
|
||||||
lang_codes = []
|
|
||||||
for lang in languages:
|
|
||||||
if isinstance(lang, dict):
|
|
||||||
code = lang.get("key", "").split("/")[-1]
|
|
||||||
else:
|
|
||||||
code = str(lang).split("/")[-1]
|
|
||||||
if code and code != "":
|
|
||||||
lang_codes.append(code)
|
|
||||||
if lang_codes:
|
|
||||||
_extend_tags(tags, "language", lang_codes)
|
|
||||||
|
|
||||||
# Extract ISBN
|
|
||||||
isbns = data.get("isbn_10", []) + data.get("isbn_13", [])
|
|
||||||
if isbns:
|
|
||||||
for isbn in isbns[:1]: # Just take first one
|
|
||||||
if len(str(isbn)) == 10:
|
|
||||||
_add_tag(tags, "isbn_10", isbn)
|
|
||||||
elif len(str(isbn)) == 13:
|
|
||||||
_add_tag(tags, "isbn_13", isbn)
|
|
||||||
|
|
||||||
# Extract page count
|
|
||||||
_add_tag(tags, "pages", data.get("number_of_pages"))
|
|
||||||
|
|
||||||
# Extract genres/subjects (OpenLibrary calls them subjects)
|
|
||||||
# Subjects are added as plain freeform tags (no namespace prefix)
|
|
||||||
subjects = data.get("subjects", [])
|
|
||||||
if subjects:
|
|
||||||
for subject in subjects[:10]: # Limit to 10 subjects
|
|
||||||
if isinstance(subject, dict):
|
|
||||||
name = subject.get("name")
|
|
||||||
else:
|
|
||||||
name = str(subject)
|
|
||||||
if name:
|
|
||||||
# Add subject as plain tag without "subject:" prefix
|
|
||||||
normalized = value_normalize(str(name))
|
|
||||||
if normalized:
|
|
||||||
tags.append(normalized)
|
|
||||||
|
|
||||||
# Extract OpenLibrary description
|
|
||||||
description = data.get("description")
|
|
||||||
if description:
|
|
||||||
if isinstance(description, dict):
|
|
||||||
description = description.get("value")
|
|
||||||
_add_tag(tags, "summary", description)
|
|
||||||
|
|
||||||
return {"source": "openlibrary", "id": ol_id, "tag": tags}
|
|
||||||
|
|
||||||
|
|
||||||
def _append_unique(target: List[str], seen: Set[str], value: Optional[str]) -> None:
|
def _append_unique(target: List[str], seen: Set[str], value: Optional[str]) -> None:
|
||||||
"""Append a single value if not already in seen set (deduplication)."""
|
"""Append a single value if not already in seen set (deduplication)."""
|
||||||
if value is None:
|
if value is None:
|
||||||
@@ -1545,7 +1534,7 @@ def _derive_sidecar_path(media_path: Path) -> Path:
|
|||||||
return preferred
|
return preferred
|
||||||
|
|
||||||
|
|
||||||
def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str], List[str]]:
|
def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str], List[str]]: # pyright: ignore[reportUnusedFunction]
|
||||||
"""Read hash, tags, and url from sidecar file.
|
"""Read hash, tags, and url from sidecar file.
|
||||||
|
|
||||||
Consolidated with read_tags_from_file - this extracts extra metadata (hash, url).
|
Consolidated with read_tags_from_file - this extracts extra metadata (hash, url).
|
||||||
@@ -1559,7 +1548,7 @@ def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str]
|
|||||||
|
|
||||||
hash_value: Optional[str] = None
|
hash_value: Optional[str] = None
|
||||||
tags: List[str] = []
|
tags: List[str] = []
|
||||||
url: List[str] = []
|
urls: List[str] = []
|
||||||
|
|
||||||
for raw_line in raw.splitlines():
|
for raw_line in raw.splitlines():
|
||||||
line = raw_line.strip()
|
line = raw_line.strip()
|
||||||
@@ -1574,15 +1563,15 @@ def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str]
|
|||||||
url_part = line.split(':', 1)[1].strip() if ':' in line else ''
|
url_part = line.split(':', 1)[1].strip() if ':' in line else ''
|
||||||
if url_part:
|
if url_part:
|
||||||
for url_segment in url_part.split(','):
|
for url_segment in url_part.split(','):
|
||||||
for url in url_segment.split():
|
for url_token in url_segment.split():
|
||||||
url_clean = url.strip()
|
url_clean = url_token.strip()
|
||||||
if url_clean and url_clean not in url:
|
if url_clean and url_clean not in urls:
|
||||||
url.append(url_clean)
|
urls.append(url_clean)
|
||||||
else:
|
else:
|
||||||
# Everything else is a tag (including relationship: lines)
|
# Everything else is a tag (including relationship: lines)
|
||||||
tags.append(line)
|
tags.append(line)
|
||||||
|
|
||||||
return hash_value, tags, url
|
return hash_value, tags, urls
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -1827,63 +1816,6 @@ def apply_title_to_path(media_path: Path, tags: Iterable[str]) -> Path:
|
|||||||
return destination
|
return destination
|
||||||
|
|
||||||
|
|
||||||
def _collect_search_roots(payload: Dict[str, Any]) -> List[Path]:
|
|
||||||
roots: List[Path] = []
|
|
||||||
for key in ('paths', 'search_paths', 'roots', 'directories'):
|
|
||||||
raw = payload.get(key)
|
|
||||||
if not raw:
|
|
||||||
continue
|
|
||||||
entries = raw if isinstance(raw, (list, tuple, set)) else [raw]
|
|
||||||
for entry in entries:
|
|
||||||
if not entry:
|
|
||||||
continue
|
|
||||||
try:
|
|
||||||
candidate = Path(str(entry)).expanduser()
|
|
||||||
except Exception:
|
|
||||||
continue
|
|
||||||
roots.append(candidate)
|
|
||||||
if load_config is not None and resolve_output_dir is not None:
|
|
||||||
try:
|
|
||||||
config = load_config()
|
|
||||||
except Exception:
|
|
||||||
config = None
|
|
||||||
if isinstance(config, dict) and config:
|
|
||||||
try:
|
|
||||||
default_root = resolve_output_dir(config)
|
|
||||||
except Exception:
|
|
||||||
default_root = None
|
|
||||||
if default_root is not None:
|
|
||||||
roots.append(default_root)
|
|
||||||
return roots
|
|
||||||
|
|
||||||
|
|
||||||
def _locate_sidecar_by_hash(hash_value: str, roots: Iterable[Path]) -> Optional[Path]:
|
|
||||||
target = f'hash:{hash_value.strip().lower()}'
|
|
||||||
for root in roots:
|
|
||||||
try:
|
|
||||||
root_path = root.expanduser()
|
|
||||||
except Exception:
|
|
||||||
continue
|
|
||||||
if not root_path.exists() or not root_path.is_dir():
|
|
||||||
continue
|
|
||||||
for pattern in ('*.tag',):
|
|
||||||
try:
|
|
||||||
iterator = root_path.rglob(pattern)
|
|
||||||
except OSError:
|
|
||||||
continue
|
|
||||||
for candidate in iterator:
|
|
||||||
if not candidate.is_file():
|
|
||||||
continue
|
|
||||||
try:
|
|
||||||
with candidate.open('r', encoding='utf-8', errors='ignore') as handle:
|
|
||||||
for line in handle:
|
|
||||||
if line.strip().lower() == target:
|
|
||||||
return candidate
|
|
||||||
except OSError:
|
|
||||||
continue
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def sync_sidecar(payload: Dict[str, Any]) -> Dict[str, Any]:
|
def sync_sidecar(payload: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
path_value = payload.get('path')
|
path_value = payload.get('path')
|
||||||
if not path_value:
|
if not path_value:
|
||||||
@@ -2506,8 +2438,8 @@ def write_tags_to_file(
|
|||||||
|
|
||||||
# Add known url if provided - each on separate line to prevent corruption
|
# Add known url if provided - each on separate line to prevent corruption
|
||||||
if url:
|
if url:
|
||||||
for url in url:
|
for url_item in url:
|
||||||
content_lines.append(f"url:{url}")
|
content_lines.append(f"url:{url_item}")
|
||||||
|
|
||||||
# Add tags
|
# Add tags
|
||||||
if tags:
|
if tags:
|
||||||
@@ -2642,10 +2574,10 @@ def detect_metadata_request(tag: str) -> Optional[Dict[str, str]]:
|
|||||||
def expand_metadata_tag(payload: Dict[str, Any]) -> Dict[str, Any]:
|
def expand_metadata_tag(payload: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
tag = payload.get('tag')
|
tag = payload.get('tag')
|
||||||
if not isinstance(tag, str):
|
if not isinstance(tag, str):
|
||||||
return {'tags': []}
|
return {'tag': []}
|
||||||
trimmed = value_normalize(tag)
|
trimmed = value_normalize(tag)
|
||||||
if not trimmed:
|
if not trimmed:
|
||||||
return {'tags': []}
|
return {'tag': []}
|
||||||
request = detect_metadata_request(trimmed)
|
request = detect_metadata_request(trimmed)
|
||||||
tags: List[str] = []
|
tags: List[str] = []
|
||||||
seen: Set[str] = set()
|
seen: Set[str] = set()
|
||||||
@@ -2653,7 +2585,7 @@ def expand_metadata_tag(payload: Dict[str, Any]) -> Dict[str, Any]:
|
|||||||
_append_unique(tags, seen, request['base'])
|
_append_unique(tags, seen, request['base'])
|
||||||
else:
|
else:
|
||||||
_append_unique(tags, seen, trimmed)
|
_append_unique(tags, seen, trimmed)
|
||||||
return {'tags': tags}
|
return {'tag': tags}
|
||||||
try:
|
try:
|
||||||
if request['source'] == 'imdb':
|
if request['source'] == 'imdb':
|
||||||
data = imdb_tag(request['id'])
|
data = imdb_tag(request['id'])
|
||||||
@@ -2662,8 +2594,15 @@ def expand_metadata_tag(payload: Dict[str, Any]) -> Dict[str, Any]:
|
|||||||
except Exception as exc: # pragma: no cover - network/service errors
|
except Exception as exc: # pragma: no cover - network/service errors
|
||||||
return {'tag': tags, 'error': str(exc)}
|
return {'tag': tags, 'error': str(exc)}
|
||||||
# Add tags from fetched data (no namespace, just unique append)
|
# Add tags from fetched data (no namespace, just unique append)
|
||||||
for tag in (data.get('tag') or []):
|
raw_tags = data.get('tag') if isinstance(data, dict) else None
|
||||||
_append_unique(tags, seen, tag)
|
if isinstance(raw_tags, str):
|
||||||
|
tag_iter: Iterable[str] = [raw_tags]
|
||||||
|
elif isinstance(raw_tags, (list, tuple, set)):
|
||||||
|
tag_iter = [t for t in raw_tags if isinstance(t, str)]
|
||||||
|
else:
|
||||||
|
tag_iter = []
|
||||||
|
for tag_value in tag_iter:
|
||||||
|
_append_unique(tags, seen, tag_value)
|
||||||
result = {
|
result = {
|
||||||
'tag': tags,
|
'tag': tags,
|
||||||
'source': request['source'],
|
'source': request['source'],
|
||||||
@@ -3082,14 +3021,14 @@ def expand_tag_lists(tags_set: Set[str]) -> Set[str]:
|
|||||||
# Load adjective.json from workspace root
|
# Load adjective.json from workspace root
|
||||||
adjective_path = Path(__file__).parent / "adjective.json"
|
adjective_path = Path(__file__).parent / "adjective.json"
|
||||||
if not adjective_path.exists():
|
if not adjective_path.exists():
|
||||||
log.debug(f"adjective.json not found at {adjective_path}")
|
debug(f"adjective.json not found at {adjective_path}")
|
||||||
return tags_set
|
return tags_set
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with open(adjective_path, 'r') as f:
|
with open(adjective_path, 'r') as f:
|
||||||
adjective_lists = json.load(f)
|
adjective_lists = json.load(f)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.error(f"Error loading adjective.json: {e}")
|
debug(f"Error loading adjective.json: {e}")
|
||||||
return tags_set
|
return tags_set
|
||||||
|
|
||||||
expanded_tags = set()
|
expanded_tags = set()
|
||||||
@@ -3108,10 +3047,10 @@ def expand_tag_lists(tags_set: Set[str]) -> Set[str]:
|
|||||||
if matched_list:
|
if matched_list:
|
||||||
# Add all tags from the list
|
# Add all tags from the list
|
||||||
expanded_tags.update(matched_list)
|
expanded_tags.update(matched_list)
|
||||||
log.info(f"Expanded {tag} to {len(matched_list)} tags")
|
debug(f"Expanded {tag} to {len(matched_list)} tags")
|
||||||
else:
|
else:
|
||||||
# List not found, log warning but don't add the reference
|
# List not found, log warning but don't add the reference
|
||||||
log.warning(f"Tag list '{list_name}' not found in adjective.json")
|
debug(f"Tag list '{list_name}' not found in adjective.json")
|
||||||
else:
|
else:
|
||||||
# Regular tag, keep as is
|
# Regular tag, keep as is
|
||||||
expanded_tags.add(tag)
|
expanded_tags.add(tag)
|
||||||
@@ -3194,98 +3133,6 @@ def build_book_tags(
|
|||||||
return deduped
|
return deduped
|
||||||
|
|
||||||
|
|
||||||
def fetch_openlibrary_metadata_tags(isbn: Optional[str] = None, olid: Optional[str] = None) -> List[str]:
|
|
||||||
"""Fetch book metadata from OpenLibrary and return as tags.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
isbn: ISBN number (with or without isbn: prefix)
|
|
||||||
olid: OpenLibrary ID
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of tags extracted from OpenLibrary metadata
|
|
||||||
"""
|
|
||||||
metadata_tags = []
|
|
||||||
|
|
||||||
# Try OLID first (preferred), then ISBN
|
|
||||||
url = None
|
|
||||||
|
|
||||||
if olid:
|
|
||||||
# Clean up OLID format
|
|
||||||
olid_clean = str(olid).replace('OL', '').replace('M', '').replace('W', '')
|
|
||||||
if olid_clean.isdigit():
|
|
||||||
url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
|
|
||||||
else:
|
|
||||||
url = f"https://openlibrary.org/books/{olid}.json"
|
|
||||||
elif isbn:
|
|
||||||
# Clean up ISBN
|
|
||||||
isbn_clean = str(isbn).replace('isbn:', '').strip()
|
|
||||||
url = f"https://openlibrary.org/isbn/{isbn_clean}.json"
|
|
||||||
|
|
||||||
if not url:
|
|
||||||
return metadata_tags
|
|
||||||
|
|
||||||
try:
|
|
||||||
response = requests.get(url, timeout=10)
|
|
||||||
if response.status_code != 200:
|
|
||||||
return metadata_tags
|
|
||||||
|
|
||||||
data = response.json()
|
|
||||||
if not data:
|
|
||||||
return metadata_tags
|
|
||||||
|
|
||||||
# Extract title
|
|
||||||
if 'title' in data:
|
|
||||||
metadata_tags.append(f"title:{data['title']}")
|
|
||||||
|
|
||||||
# Extract authors
|
|
||||||
if 'authors' in data and isinstance(data['authors'], list):
|
|
||||||
for author in data['authors'][:3]:
|
|
||||||
if isinstance(author, dict) and 'name' in author:
|
|
||||||
metadata_tags.append(f"author:{author['name']}")
|
|
||||||
elif isinstance(author, str):
|
|
||||||
metadata_tags.append(f"author:{author}")
|
|
||||||
|
|
||||||
# Extract publish date
|
|
||||||
if 'publish_date' in data:
|
|
||||||
metadata_tags.append(f"publish_date:{data['publish_date']}")
|
|
||||||
|
|
||||||
# Extract publishers
|
|
||||||
if 'publishers' in data and isinstance(data['publishers'], list):
|
|
||||||
for pub in data['publishers'][:1]:
|
|
||||||
if isinstance(pub, dict) and 'name' in pub:
|
|
||||||
metadata_tags.append(f"publisher:{pub['name']}")
|
|
||||||
elif isinstance(pub, str):
|
|
||||||
metadata_tags.append(f"publisher:{pub}")
|
|
||||||
|
|
||||||
# Extract number of pages
|
|
||||||
if 'number_of_pages' in data:
|
|
||||||
page_count = data['number_of_pages']
|
|
||||||
if page_count and isinstance(page_count, int) and page_count > 0:
|
|
||||||
metadata_tags.append(f"pages:{page_count}")
|
|
||||||
|
|
||||||
# Extract language
|
|
||||||
if 'languages' in data and isinstance(data['languages'], list) and data['languages']:
|
|
||||||
lang = data['languages'][0]
|
|
||||||
if isinstance(lang, dict) and 'key' in lang:
|
|
||||||
lang_code = lang['key'].split('/')[-1]
|
|
||||||
metadata_tags.append(f"language:{lang_code}")
|
|
||||||
elif isinstance(lang, str):
|
|
||||||
metadata_tags.append(f"language:{lang}")
|
|
||||||
|
|
||||||
# Extract subjects as freeform tags (limit to 5)
|
|
||||||
if 'subjects' in data and isinstance(data['subjects'], list):
|
|
||||||
for subject in data['subjects'][:5]:
|
|
||||||
if subject and isinstance(subject, str):
|
|
||||||
subject_clean = str(subject).strip()
|
|
||||||
if subject_clean:
|
|
||||||
metadata_tags.append(subject_clean)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
debug(f"⚠ Failed to fetch OpenLibrary metadata: {e}")
|
|
||||||
|
|
||||||
return metadata_tags
|
|
||||||
|
|
||||||
|
|
||||||
def enrich_playlist_entries(entries: list, extractor: str) -> list:
|
def enrich_playlist_entries(entries: list, extractor: str) -> list:
|
||||||
"""Enrich playlist entries with full metadata by fetching individual entry info.
|
"""Enrich playlist entries with full metadata by fetching individual entry info.
|
||||||
|
|
||||||
@@ -3312,7 +3159,7 @@ def enrich_playlist_entries(entries: list, extractor: str) -> list:
|
|||||||
if entry_url and is_url_supported_by_ytdlp(entry_url):
|
if entry_url and is_url_supported_by_ytdlp(entry_url):
|
||||||
try:
|
try:
|
||||||
import yt_dlp
|
import yt_dlp
|
||||||
ydl_opts = {
|
ydl_opts: Any = {
|
||||||
"quiet": True,
|
"quiet": True,
|
||||||
"no_warnings": True,
|
"no_warnings": True,
|
||||||
"skip_download": True,
|
"skip_download": True,
|
||||||
@@ -3690,294 +3537,3 @@ def extract_url_formats(formats: list) -> List[Tuple[str, str]]:
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
||||||
def scrape_isbn_metadata(isbn: str) -> List[str]:
    """Scrape metadata for an ISBN using the Open Library books API.

    Queries https://openlibrary.org/api/books with the hyphen-stripped ISBN
    (jscmd=data) and converts the first returned record into namespaced tags:
    title:, author:, publisher:, publish_date:, description:, pages:, plus
    external identifiers (openlibrary:, lccn:, oclc:, goodreads:,
    librarything:, doi:, internet_archive:).

    Returns an empty list on any network/parse failure or when no record is
    found; never raises.
    """
    new_tags: List[str] = []
    try:
        from API.HTTP import HTTPClient
        import json as json_module

        isbn_clean = isbn.replace('-', '').strip()
        url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json"

        try:
            with HTTPClient() as client:
                response = client.get(url)
                response.raise_for_status()
                data = json_module.loads(response.content.decode('utf-8'))
        except Exception as e:
            log(f"Failed to fetch ISBN metadata: {e}", file=sys.stderr)
            return []

        if not data:
            log(f"No ISBN metadata found for: {isbn}")
            return []

        # The API keys the payload by "ISBN:<n>"; we asked for a single book,
        # so take the first (and only) record.
        book_data = next(iter(data.values()), None)
        if not book_data:
            return []

        if 'title' in book_data:
            new_tags.append(f"title:{book_data['title']}")

        # Authors/publishers may be dicts ({'name': ...}) or plain strings;
        # guard with isinstance, consistent with scrape_openlibrary_metadata.
        # (Previously `'name' in author` on a str did a substring test and
        # `author['name']` would then raise TypeError.)
        if isinstance(book_data.get('authors'), list):
            for author in book_data['authors'][:3]:
                if isinstance(author, dict) and 'name' in author:
                    new_tags.append(f"author:{author['name']}")
                elif isinstance(author, str):
                    new_tags.append(f"author:{author}")

        if 'publish_date' in book_data:
            new_tags.append(f"publish_date:{book_data['publish_date']}")

        if isinstance(book_data.get('publishers'), list):
            for pub in book_data['publishers'][:1]:
                if isinstance(pub, dict) and 'name' in pub:
                    new_tags.append(f"publisher:{pub['name']}")
                elif isinstance(pub, str):
                    new_tags.append(f"publisher:{pub}")

        if 'description' in book_data:
            desc = book_data['description']
            # Descriptions are sometimes wrapped as {'type': ..., 'value': ...}.
            if isinstance(desc, dict) and 'value' in desc:
                desc = desc['value']
            if desc:
                desc_str = str(desc).strip()
                # Include description if available (limit to 200 chars to keep it manageable)
                if len(desc_str) > 0:
                    new_tags.append(f"description:{desc_str[:200]}")

        if 'number_of_pages' in book_data:
            page_count = book_data['number_of_pages']
            if page_count and isinstance(page_count, int) and page_count > 0:
                new_tags.append(f"pages:{page_count}")

        if isinstance(book_data.get('identifiers'), dict):
            identifiers = book_data['identifiers']

            def _id_tags(source_key: str, namespace: str) -> None:
                # Identifier values may be a list (take the first) or a bare string.
                value = identifiers.get(source_key)
                if isinstance(value, list) and value:
                    new_tags.append(f"{namespace}:{value[0]}")
                elif isinstance(value, str):
                    new_tags.append(f"{namespace}:{value}")

            # Map Open Library identifier keys onto our tag namespaces.
            for source_key, namespace in (
                ('openlibrary', 'openlibrary'),
                ('lccn', 'lccn'),
                ('oclc', 'oclc'),
                ('goodreads', 'goodreads'),
                ('librarything', 'librarything'),
                ('doi', 'doi'),
                ('internet_archive', 'internet_archive'),
            ):
                _id_tags(source_key, namespace)

        log(f"Found {len(new_tags)} tag(s) from ISBN lookup")
        return new_tags
    except Exception as e:
        log(f"ISBN scraping error: {e}", file=sys.stderr)
        return []
|
|
||||||
|
|
||||||
|
|
||||||
def scrape_openlibrary_metadata(olid: str) -> List[str]:
    """Scrape metadata for an OpenLibrary ID using the .json API endpoint.

    Fetches from https://openlibrary.org/books/{OLID}.json and extracts:
    - Title, authors, publish date, publishers
    - Description
    - Subjects as freeform tags (without namespace prefix)
    - Identifiers (ISBN, LCCN, OCLC, etc.)

    Returns an empty list on any failure; never raises.
    """
    new_tags: List[str] = []
    try:
        from API.HTTP import HTTPClient
        import json as json_module

        # Format: OL9674499M or just 9674499M
        olid_clean = olid.replace('OL', '').replace('M', '')
        if not olid_clean.isdigit():
            olid_clean = olid

        # Ensure we have the full OLID format for the URL
        if not olid.startswith('OL'):
            url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
        else:
            url = f"https://openlibrary.org/books/{olid}.json"

        try:
            with HTTPClient() as client:
                response = client.get(url)
                response.raise_for_status()
                data = json_module.loads(response.content.decode('utf-8'))
        except Exception as e:
            log(f"Failed to fetch OpenLibrary metadata: {e}", file=sys.stderr)
            return []

        if not data:
            log(f"No OpenLibrary metadata found for: {olid}")
            return []

        # Add title
        if 'title' in data:
            new_tags.append(f"title:{data['title']}")

        # Add authors (entries may be dicts with 'name' or plain strings)
        if isinstance(data.get('authors'), list):
            for author in data['authors'][:3]:
                if isinstance(author, dict) and 'name' in author:
                    new_tags.append(f"author:{author['name']}")
                elif isinstance(author, str):
                    new_tags.append(f"author:{author}")

        # Add publish date
        if 'publish_date' in data:
            new_tags.append(f"publish_date:{data['publish_date']}")

        # Add publishers
        if isinstance(data.get('publishers'), list):
            for pub in data['publishers'][:1]:
                if isinstance(pub, dict) and 'name' in pub:
                    new_tags.append(f"publisher:{pub['name']}")
                elif isinstance(pub, str):
                    new_tags.append(f"publisher:{pub}")

        # Add description (sometimes wrapped as {'type': ..., 'value': ...})
        if 'description' in data:
            desc = data['description']
            if isinstance(desc, dict) and 'value' in desc:
                desc = desc['value']
            if desc:
                desc_str = str(desc).strip()
                if len(desc_str) > 0:
                    # Limit to 200 chars to keep the tag manageable.
                    new_tags.append(f"description:{desc_str[:200]}")

        # Add number of pages
        if 'number_of_pages' in data:
            page_count = data['number_of_pages']
            if page_count and isinstance(page_count, int) and page_count > 0:
                new_tags.append(f"pages:{page_count}")

        # Add subjects as FREEFORM tags (no namespace prefix), deduplicated
        if isinstance(data.get('subjects'), list):
            for subject in data['subjects'][:10]:
                if subject and isinstance(subject, str):
                    subject_clean = subject.strip()
                    if subject_clean and subject_clean not in new_tags:
                        new_tags.append(subject_clean)

        # Add identifiers
        if isinstance(data.get('identifiers'), dict):
            identifiers = data['identifiers']

            def _id_tags(source_key: str, namespace: str) -> None:
                # Identifier values may be a list (take the first) or a bare string.
                value = identifiers.get(source_key)
                if isinstance(value, list) and value:
                    new_tags.append(f"{namespace}:{value[0]}")
                elif isinstance(value, str):
                    new_tags.append(f"{namespace}:{value}")

            # Consolidates five previously duplicated extraction stanzas.
            for source_key, namespace in (
                ('isbn_10', 'isbn_10'),
                ('isbn_13', 'isbn_13'),
                ('lccn', 'lccn'),
                ('oclc_numbers', 'oclc'),
                ('goodreads', 'goodreads'),
            ):
                _id_tags(source_key, namespace)

        log(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup")
        return new_tags
    except Exception as e:
        log(f"OpenLibrary scraping error: {e}", file=sys.stderr)
        return []
|
|
||||||
|
|
||||||
|
|
||||||
def perform_metadata_scraping(tags_list: List[str]) -> List[str]:
    """Perform scraping based on identifiers in tags.

    Priority order:
    1. openlibrary: (preferred - more complete metadata)
    2. isbn_10 or isbn (fallback)

    Returns only tags not already present in *tags_list* (case-insensitive),
    with duplicates among the scraped tags removed as well.
    """
    identifiers = extract_scrapable_identifiers(tags_list)

    # Nothing to scrape from — bail out early.
    if not identifiers:
        log("No scrapable identifiers found (openlibrary, ISBN, musicbrainz, imdb)")
        return []

    log(f"Found scrapable identifiers: {', '.join(identifiers.keys())}")

    scraped: List[str] = []

    # OpenLibrary wins over ISBN when both are present (richer metadata).
    if 'openlibrary' in identifiers:
        olid = identifiers['openlibrary']
        if olid:
            log(f"Scraping OpenLibrary: {olid}")
            scraped.extend(scrape_openlibrary_metadata(olid))
    else:
        isbn_value = (
            identifiers.get('isbn_13')
            or identifiers.get('isbn_10')
            or identifiers.get('isbn')
        )
        if isbn_value:
            log(f"Scraping ISBN: {isbn_value}")
            scraped.extend(scrape_isbn_metadata(isbn_value))

    # Filter out tags the caller already has, plus duplicates within the
    # scraped batch itself — both compared case-insensitively.
    known_lower = {existing.lower() for existing in tags_list}
    unique_new: List[str] = []
    for candidate in scraped:
        folded = candidate.lower()
        if folded in known_lower:
            continue
        known_lower.add(folded)
        unique_new.append(candidate)

    if unique_new:
        log(f"Added {len(unique_new)} new tag(s) from scraping")

    return unique_new
|
|
||||||
|
|||||||
29
models.py
29
models.py
@@ -151,6 +151,35 @@ class PipeObject:
|
|||||||
key_display = key if len(key) <= 15 else key[:12] + "..."
|
key_display = key if len(key) <= 15 else key[:12] + "..."
|
||||||
debug(f"│ {key_display:<15}: {val_display:<42}│")
|
debug(f"│ {key_display:<15}: {val_display:<42}│")
|
||||||
|
|
||||||
|
# If we have structured provider metadata, expand it for debugging.
|
||||||
|
full_md = self.extra.get("full_metadata")
|
||||||
|
if isinstance(full_md, dict) and full_md:
|
||||||
|
debug("├─────────────────────────────────────────────────────────────┤")
|
||||||
|
debug("│ full_metadata: │")
|
||||||
|
for md_key in sorted(full_md.keys(), key=lambda x: str(x)):
|
||||||
|
md_val = full_md.get(md_key)
|
||||||
|
if isinstance(md_val, (str, int, float)) or md_val is None or isinstance(md_val, bool):
|
||||||
|
md_display = str(md_val)
|
||||||
|
elif isinstance(md_val, list):
|
||||||
|
if len(md_val) <= 6 and all(isinstance(x, (str, int, float, bool)) or x is None for x in md_val):
|
||||||
|
md_display = "[" + ", ".join(str(x) for x in md_val) + "]"
|
||||||
|
else:
|
||||||
|
md_display = f"list({len(md_val)})"
|
||||||
|
elif isinstance(md_val, dict):
|
||||||
|
# Avoid dumping huge nested dicts (like raw provider docs).
|
||||||
|
keys = list(md_val.keys())
|
||||||
|
preview = ",".join(str(k) for k in keys[:6])
|
||||||
|
md_display = f"dict({len(keys)})[{preview}{',...' if len(keys) > 6 else ''}]"
|
||||||
|
else:
|
||||||
|
md_str = str(md_val)
|
||||||
|
md_display = md_str if len(md_str) <= 40 else md_str[:37] + "..."
|
||||||
|
|
||||||
|
md_key_display = str(md_key)
|
||||||
|
md_key_display = md_key_display if len(md_key_display) <= 15 else md_key_display[:12] + "..."
|
||||||
|
if len(md_display) > 42:
|
||||||
|
md_display = md_display[:39] + "..."
|
||||||
|
debug(f"│ {md_key_display:<15}: {md_display:<42}│")
|
||||||
|
|
||||||
if self.action:
|
if self.action:
|
||||||
debug("├─────────────────────────────────────────────────────────────┤")
|
debug("├─────────────────────────────────────────────────────────────┤")
|
||||||
action_display = self.action[:48]
|
action_display = self.action[:48]
|
||||||
|
|||||||
10
pipeline.py
10
pipeline.py
@@ -575,6 +575,11 @@ def restore_previous_result_table() -> bool:
|
|||||||
_DISPLAY_ITEMS = []
|
_DISPLAY_ITEMS = []
|
||||||
_DISPLAY_TABLE = None
|
_DISPLAY_TABLE = None
|
||||||
_DISPLAY_SUBJECT = None
|
_DISPLAY_SUBJECT = None
|
||||||
|
# If an underlying table exists, we're done.
|
||||||
|
# Otherwise, fall through to history restore so @.. actually returns to the last table.
|
||||||
|
if _LAST_RESULT_TABLE is not None:
|
||||||
|
return True
|
||||||
|
if not _RESULT_TABLE_HISTORY:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
if not _RESULT_TABLE_HISTORY:
|
if not _RESULT_TABLE_HISTORY:
|
||||||
@@ -613,6 +618,11 @@ def restore_next_result_table() -> bool:
|
|||||||
_DISPLAY_ITEMS = []
|
_DISPLAY_ITEMS = []
|
||||||
_DISPLAY_TABLE = None
|
_DISPLAY_TABLE = None
|
||||||
_DISPLAY_SUBJECT = None
|
_DISPLAY_SUBJECT = None
|
||||||
|
# If an underlying table exists, we're done.
|
||||||
|
# Otherwise, fall through to forward restore when available.
|
||||||
|
if _LAST_RESULT_TABLE is not None:
|
||||||
|
return True
|
||||||
|
if not _RESULT_TABLE_FORWARD:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
if not _RESULT_TABLE_FORWARD:
|
if not _RESULT_TABLE_FORWARD:
|
||||||
|
|||||||
336
test-login.py
Normal file
336
test-login.py
Normal file
@@ -0,0 +1,336 @@
|
|||||||
|
import requests
|
||||||
|
import random, string
|
||||||
|
from concurrent import futures
|
||||||
|
from tqdm import tqdm
|
||||||
|
import time
|
||||||
|
from datetime import datetime
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import shutil
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import base64
|
||||||
|
import hashlib
|
||||||
|
from Crypto.Cipher import AES
|
||||||
|
from Crypto.Util import Counter
|
||||||
|
|
||||||
|
def display_error(response, message):
    """Print *message*, the response object, and its body, then abort.

    Used as a fatal-error sink throughout the script: it never returns.
    """
    for part in (message, response, response.text):
        print(part)
    exit()
|
||||||
|
|
||||||
|
def get_book_infos(session, url):
    """Fetch title, page-image links and metadata for an archive.org book page.

    Scrapes the BookReader bootstrap JSON out of the details-page HTML, then
    calls the inner info endpoint it points at. Exits the process if fewer
    than two page links are found.

    NOTE(review): the string-splitting below is brittle — it assumes the
    first `"url":"..."` occurrence in the HTML is the BookReader info URL;
    an archive.org markup change would break it.
    """
    r = session.get(url).text
    # Extract the protocol-relative info URL and unescape '&' (\u0026).
    infos_url = "https:" + r.split('"url":"')[1].split('"')[0].replace("\\u0026", "&")
    response = session.get(infos_url)
    data = response.json()['data']
    title = data['brOptions']['bookTitle'].strip().replace(" ", "_")
    title = ''.join( c for c in title if c not in '<>:"/\\|?*' ) # Filter forbidden chars in directory names (Windows & Linux)
    title = title[:150] # Trim the title to avoid long file names
    metadata = data['metadata']
    links = []
    # brOptions.data is a list of spreads, each a list of page dicts.
    for item in data['brOptions']['data']:
        for page in item:
            links.append(page['uri'])

    if len(links) > 1:
        print(f"[+] Found {len(links)} pages")
        return title, links, metadata
    else:
        print(f"[-] Error while getting image links")
        exit()
|
||||||
|
|
||||||
|
def login(email, password):
    """Authenticate against archive.org and return a logged-in session.

    Flow: GET the login endpoint to obtain a one-time token, then POST the
    credentials with that token. On any failure this exits the process via
    display_error (which never returns); on bad credentials it prints a
    message and exits directly.

    Returns the authenticated requests.Session on success.
    """
    session = requests.Session()
    response = session.get("https://archive.org/services/account/login/")
    login_data = response.json()
    if not login_data['success']:
        display_error(response, "[-] Error while getting login token:")

    login_token = login_data["value"]["token"]

    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    data = {"username":email, "password":password, "t": login_token}

    response = session.post("https://archive.org/services/account/login/", headers=headers, data=json.dumps(data))
    try:
        response_json = response.json()
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt are not
    # swallowed; any JSON decode failure still routes to display_error,
    # which exits, so response_json is always bound below.
    except Exception:
        display_error(response, "[-] Error while login:")

    if response_json["success"] == False:
        if response_json["value"] == "bad_login":
            print("[-] Invalid credentials!")
            exit()
        display_error(response, "[-] Error while login:")
    else:
        print("[+] Successful login")
        return session
|
||||||
|
|
||||||
|
def loan(session, book_id, verbose=True):
    """Borrow *book_id* on archive.org using the loan endpoints.

    Sequence: grant_access (search-inside), browse_book, then create_token.
    A 400 on browse_book whose JSON error says the book "is not available to
    borrow" is treated as "no borrow needed" and returns immediately; other
    failures exit the process via display_error (which never returns).

    Returns the session on success so callers can re-assign it.
    """
    data = {
        "action": "grant_access",
        "identifier": book_id
    }
    response = session.post("https://archive.org/services/loans/loan/searchInside.php", data=data)
    data['action'] = "browse_book"
    response = session.post("https://archive.org/services/loans/loan/", data=data)

    if response.status_code == 400 :
        try:
            # Exact-string match against the server's error message — fragile
            # but intentional: this specific text means the book is freely
            # readable and no loan is required.
            if response.json()["error"] == "This book is not available to borrow at this time. Please try again later.":
                print("This book doesn't need to be borrowed")
                return session
            else :
                display_error(response, "Something went wrong when trying to borrow the book.")
        except: # The response is not in JSON format
            display_error(response, "The book cannot be borrowed")

    data['action'] = "create_token"
    response = session.post("https://archive.org/services/loans/loan/", data=data)

    # A token anywhere in the body indicates the loan was granted.
    if "token" in response.text:
        if verbose:
            print("[+] Successful loan")
        return session
    else:
        display_error(response, "Something went wrong when trying to borrow the book, maybe you can't borrow this book.")
|
||||||
|
|
||||||
|
def return_loan(session, book_id):
    """Return a previously borrowed archive.org book identified by *book_id*."""
    payload = {
        "action": "return_loan",
        "identifier": book_id
    }
    response = session.post("https://archive.org/services/loans/loan/", data=payload)
    # Only parse the JSON body after confirming a 200 (short-circuit).
    if response.status_code == 200 and response.json()["success"]:
        print("[+] Book returned")
    else:
        display_error(response, "Something went wrong when trying to return the book")
|
||||||
|
|
||||||
|
def image_name(pages, page, directory):
    """Build the path for page *page*, zero-padded to the width of *pages*
    so downloaded files sort lexicographically in page order."""
    width = len(str(pages))
    return f"{directory}/{str(page).zfill(width)}.jpg"
|
||||||
|
|
||||||
|
def deobfuscate_image(image_data, link, obf_header):
    """
    @Author: https://github.com/justimm
    Decrypts the first 1024 bytes of image_data using AES-CTR.
    The obfuscation_header is expected in the form "1|<base64encoded_counter>"
    where the base64-decoded counter is 16 bytes.
    We derive the AES key by taking the SHA-1 digest of the image URL (with protocol/host removed)
    and using the first 16 bytes.
    For AES-CTR, we use a 16-byte counter block. The first 8 bytes are used as a fixed prefix,
    and the remaining 8 bytes (interpreted as a big-endian integer) are used as the initial counter value.

    Raises ValueError on a malformed header, an unsupported version, or a
    counter that is not exactly 16 bytes.
    """
    try:
        version, counter_b64 = obf_header.split('|')
    except Exception as e:
        raise ValueError("Invalid X-Obfuscate header format") from e

    if version != '1':
        raise ValueError("Unsupported obfuscation version: " + version)

    # Derive AES key: replace protocol/host in link with '/'
    aesKey = re.sub(r"^https?:\/\/.*?\/", "/", link)
    sha1_digest = hashlib.sha1(aesKey.encode('utf-8')).digest()
    key = sha1_digest[:16]

    # Decode the counter (should be 16 bytes)
    counter_bytes = base64.b64decode(counter_b64)
    if len(counter_bytes) != 16:
        raise ValueError(f"Expected counter to be 16 bytes, got {len(counter_bytes)}")

    # Split the 16-byte counter block: fixed 8-byte prefix + 8-byte
    # big-endian initial counter value.
    prefix = counter_bytes[:8]
    initial_value = int.from_bytes(counter_bytes[8:], byteorder='big')

    # Create AES-CTR cipher with a 64-bit counter length.
    ctr = Counter.new(64, prefix=prefix, initial_value=initial_value, little_endian=False)
    cipher = AES.new(key, AES.MODE_CTR, counter=ctr)

    # Only the first 1 KiB is obfuscated; the rest of the payload is
    # passed through untouched.
    decrypted_part = cipher.decrypt(image_data[:1024])
    new_data = decrypted_part + image_data[1024:]
    return new_data
|
||||||
|
|
||||||
|
def download_one_image(session, link, i, directory, book_id, pages):
    """Download page *i* of the book at *link* into *directory*.

    Retries forever until a 200 is received: a 403 triggers a silent
    re-borrow of the book (the loan expires during long downloads) before
    retrying. If the response carries an X-Obfuscate header, the payload is
    deobfuscated before being written.

    NOTE(review): status codes other than 200/403 also keep the loop
    spinning, but without the 1-second back-off (no exception is raised for
    them) — TODO confirm whether e.g. a persistent 500 should abort instead.
    """
    headers = {
        "Referer": "https://archive.org/",
        "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
        "Sec-Fetch-Site": "same-site",
        "Sec-Fetch-Mode": "no-cors",
        "Sec-Fetch-Dest": "image",
    }
    retry = True
    response = None
    while retry:
        try:
            response = session.get(link, headers=headers)
            if response.status_code == 403:
                session = loan(session, book_id, verbose=False)
                raise Exception("Borrow again")
            elif response.status_code == 200:
                retry = False
        # Narrowed from a bare `except:` so Ctrl-C (KeyboardInterrupt) can
        # still break out of the retry loop.
        except Exception:
            time.sleep(1)  # Wait 1 second before retrying

    image = image_name(pages, i, directory)

    obf_header = response.headers.get("X-Obfuscate")
    image_content = None
    if obf_header:
        try:
            image_content = deobfuscate_image(response.content, link, obf_header)
        except Exception as e:
            print(f"[ERROR] Deobfuscation failed: {e}")
            return
    else:
        image_content = response.content

    with open(image, "wb") as f:
        f.write(image_content)
|
||||||
|
|
||||||
|
def download(session, n_threads, directory, links, scale, book_id):
    """Download all page images of a book concurrently.

    Appends the rotate/scale query parameters to every page link, fans the
    downloads out over a thread pool of *n_threads* workers, and waits for
    completion with a tqdm progress bar.

    Returns the ordered list of local image paths (one per page link).
    """
    print("Downloading pages...")
    links = [f"{link}&rotate=0&scale={scale}" for link in links]
    pages = len(links)

    tasks = []
    with futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
        # enumerate instead of links.index(link): index() was O(n) per page
        # (O(n^2) total) and returned the FIRST match, so duplicate page
        # links would all map to the same index and overwrite one file
        # while leaving others missing.
        for i, link in enumerate(links):
            tasks.append(executor.submit(download_one_image, session=session, link=link, i=i, directory=directory, book_id=book_id, pages=pages))
        # Drain the futures just to drive the progress bar.
        for task in tqdm(futures.as_completed(tasks), total=len(tasks)):
            pass

    images = [image_name(pages, i, directory) for i in range(len(links))]
    return images
|
||||||
|
|
||||||
|
def make_pdf(pdf, title, directory):
    """Write the *pdf* bytes to <directory>/<title>.pdf without clobbering.

    If a file with that name already exists, "(1)", "(2)", ... is appended
    to the title until a free name is found.
    """
    filename = title + ".pdf"
    # Handle the case where multiple books with the same name are downloaded
    attempt = 1
    while os.path.isfile(os.path.join(directory, filename)):
        filename = f"{title}({attempt}).pdf"
        attempt += 1

    with open(os.path.join(directory, filename), "wb") as handle:
        handle.write(pdf)
    print(f"[+] PDF saved as \"{filename}\"")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
|
||||||
|
my_parser = argparse.ArgumentParser()
|
||||||
|
my_parser.add_argument('-e', '--email', help='Your archive.org email', type=str, required=True)
|
||||||
|
my_parser.add_argument('-p', '--password', help='Your archive.org password', type=str, required=True)
|
||||||
|
my_parser.add_argument('-u', '--url', help='Link to the book (https://archive.org/details/XXXX). You can use this argument several times to download multiple books', action='append', type=str)
|
||||||
|
my_parser.add_argument('-d', '--dir', help='Output directory', type=str)
|
||||||
|
my_parser.add_argument('-f', '--file', help='File where are stored the URLs of the books to download', type=str)
|
||||||
|
my_parser.add_argument('-r', '--resolution', help='Image resolution (10 to 0, 0 is the highest), [default 3]', type=int, default=3)
|
||||||
|
my_parser.add_argument('-t', '--threads', help="Maximum number of threads, [default 50]", type=int, default=50)
|
||||||
|
my_parser.add_argument('-j', '--jpg', help="Output to individual JPG's rather than a PDF", action='store_true')
|
||||||
|
my_parser.add_argument('-m', '--meta', help="Output the metadata of the book to a json file (-j option required)", action='store_true')
|
||||||
|
|
||||||
|
if len(sys.argv) == 1:
|
||||||
|
my_parser.print_help(sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
args = my_parser.parse_args()
|
||||||
|
|
||||||
|
if args.url is None and args.file is None:
|
||||||
|
my_parser.error("At least one of --url and --file required")
|
||||||
|
|
||||||
|
email = args.email
|
||||||
|
password = args.password
|
||||||
|
scale = args.resolution
|
||||||
|
n_threads = args.threads
|
||||||
|
d = args.dir
|
||||||
|
|
||||||
|
if d == None:
|
||||||
|
d = os.getcwd()
|
||||||
|
elif not os.path.isdir(d):
|
||||||
|
print(f"Output directory does not exist!")
|
||||||
|
exit()
|
||||||
|
|
||||||
|
if args.url is not None:
|
||||||
|
urls = args.url
|
||||||
|
else:
|
||||||
|
if os.path.exists(args.file):
|
||||||
|
with open(args.file) as f:
|
||||||
|
urls = f.read().strip().split("\n")
|
||||||
|
else:
|
||||||
|
print(f"{args.file} does not exist!")
|
||||||
|
exit()
|
||||||
|
|
||||||
|
# Check the urls format
|
||||||
|
for url in urls:
|
||||||
|
if not url.startswith("https://archive.org/details/"):
|
||||||
|
print(f"{url} --> Invalid url. URL must starts with \"https://archive.org/details/\"")
|
||||||
|
exit()
|
||||||
|
|
||||||
|
print(f"{len(urls)} Book(s) to download")
|
||||||
|
session = login(email, password)
|
||||||
|
|
||||||
|
for url in urls:
|
||||||
|
book_id = list(filter(None, url.split("/")))[3]
|
||||||
|
print("="*40)
|
||||||
|
print(f"Current book: https://archive.org/details/{book_id}")
|
||||||
|
session = loan(session, book_id)
|
||||||
|
title, links, metadata = get_book_infos(session, url)
|
||||||
|
|
||||||
|
directory = os.path.join(d, title)
|
||||||
|
# Handle the case where multiple books with the same name are downloaded
|
||||||
|
i = 1
|
||||||
|
_directory = directory
|
||||||
|
while os.path.isdir(directory):
|
||||||
|
directory = f"{_directory}({i})"
|
||||||
|
i += 1
|
||||||
|
os.makedirs(directory)
|
||||||
|
|
||||||
|
if args.meta:
|
||||||
|
print("Writing metadata.json...")
|
||||||
|
with open(f"{directory}/metadata.json",'w') as f:
|
||||||
|
json.dump(metadata,f)
|
||||||
|
|
||||||
|
images = download(session, n_threads, directory, links, scale, book_id)
|
||||||
|
|
||||||
|
if not args.jpg: # Create pdf with images and remove the images folder
|
||||||
|
import img2pdf
|
||||||
|
|
||||||
|
# prepare PDF metadata
|
||||||
|
# sometimes archive metadata is missing
|
||||||
|
pdfmeta = { }
|
||||||
|
# ensure metadata are str
|
||||||
|
for key in ["title", "creator", "associated-names"]:
|
||||||
|
if key in metadata:
|
||||||
|
if isinstance(metadata[key], str):
|
||||||
|
pass
|
||||||
|
elif isinstance(metadata[key], list):
|
||||||
|
metadata[key] = "; ".join(metadata[key])
|
||||||
|
else:
|
||||||
|
raise Exception("unsupported metadata type")
|
||||||
|
# title
|
||||||
|
if 'title' in metadata:
|
||||||
|
pdfmeta['title'] = metadata['title']
|
||||||
|
# author
|
||||||
|
if 'creator' in metadata and 'associated-names' in metadata:
|
||||||
|
pdfmeta['author'] = metadata['creator'] + "; " + metadata['associated-names']
|
||||||
|
elif 'creator' in metadata:
|
||||||
|
pdfmeta['author'] = metadata['creator']
|
||||||
|
elif 'associated-names' in metadata:
|
||||||
|
pdfmeta['author'] = metadata['associated-names']
|
||||||
|
# date
|
||||||
|
if 'date' in metadata:
|
||||||
|
try:
|
||||||
|
pdfmeta['creationdate'] = datetime.strptime(metadata['date'][0:4], '%Y')
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
# keywords
|
||||||
|
pdfmeta['keywords'] = [f"https://archive.org/details/{book_id}"]
|
||||||
|
|
||||||
|
pdf = img2pdf.convert(images, **pdfmeta)
|
||||||
|
make_pdf(pdf, title, args.dir if args.dir != None else "")
|
||||||
|
try:
|
||||||
|
shutil.rmtree(directory)
|
||||||
|
except OSError as e:
|
||||||
|
print ("Error: %s - %s." % (e.filename, e.strerror))
|
||||||
|
|
||||||
|
return_loan(session, book_id)
|
||||||
Reference in New Issue
Block a user